Commit | Line | Data |
---|---|---|
142781e1 TG |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | ||
3 | #include <linux/context_tracking.h> | |
4 | #include <linux/entry-common.h> | |
03248add | 5 | #include <linux/resume_user_mode.h> |
5fbda3ec | 6 | #include <linux/highmem.h> |
99cf983c | 7 | #include <linux/jump_label.h> |
a9f3a74a TG |
8 | #include <linux/livepatch.h> |
9 | #include <linux/audit.h> | |
f268c373 | 10 | #include <linux/tick.h> |
142781e1 | 11 | |
11894468 GKB |
12 | #include "common.h" |
13 | ||
142781e1 TG |
14 | #define CREATE_TRACE_POINTS |
15 | #include <trace/events/syscalls.h> | |
16 | ||
96e2fbcc | 17 | /* See comment for enter_from_user_mode() in entry-common.h */ |
6666bb71 | 18 | static __always_inline void __enter_from_user_mode(struct pt_regs *regs) |
142781e1 TG |
19 | { |
20 | arch_check_user_regs(regs); | |
21 | lockdep_hardirqs_off(CALLER_ADDR0); | |
22 | ||
23 | CT_WARN_ON(ct_state() != CONTEXT_USER); | |
24 | user_exit_irqoff(); | |
25 | ||
26 | instrumentation_begin(); | |
27 | trace_hardirqs_off_finish(); | |
28 | instrumentation_end(); | |
29 | } | |
30 | ||
96e2fbcc SS |
31 | void noinstr enter_from_user_mode(struct pt_regs *regs) |
32 | { | |
33 | __enter_from_user_mode(regs); | |
34 | } | |
35 | ||
142781e1 TG |
36 | static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) |
37 | { | |
38 | if (unlikely(audit_context())) { | |
39 | unsigned long args[6]; | |
40 | ||
41 | syscall_get_arguments(current, regs, args); | |
42 | audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]); | |
43 | } | |
44 | } | |
45 | ||
46 | static long syscall_trace_enter(struct pt_regs *regs, long syscall, | |
29915524 | 47 | unsigned long work) |
142781e1 TG |
48 | { |
49 | long ret = 0; | |
50 | ||
11894468 GKB |
51 | /* |
52 | * Handle Syscall User Dispatch. This must comes first, since | |
53 | * the ABI here can be something that doesn't make sense for | |
54 | * other syscall_work features. | |
55 | */ | |
56 | if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { | |
57 | if (syscall_user_dispatch(regs)) | |
58 | return -1L; | |
59 | } | |
60 | ||
142781e1 | 61 | /* Handle ptrace */ |
64eb35f7 | 62 | if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { |
0cfcb2b9 | 63 | ret = ptrace_report_syscall_entry(regs); |
64eb35f7 | 64 | if (ret || (work & SYSCALL_WORK_SYSCALL_EMU)) |
142781e1 TG |
65 | return -1L; |
66 | } | |
67 | ||
68 | /* Do seccomp after ptrace, to catch any tracer changes. */ | |
23d67a54 | 69 | if (work & SYSCALL_WORK_SECCOMP) { |
142781e1 TG |
70 | ret = __secure_computing(NULL); |
71 | if (ret == -1L) | |
72 | return ret; | |
73 | } | |
74 | ||
b6ec4134 KC |
75 | /* Either of the above might have changed the syscall number */ |
76 | syscall = syscall_get_nr(current, regs); | |
77 | ||
524666cb | 78 | if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) |
142781e1 TG |
79 | trace_sys_enter(regs, syscall); |
80 | ||
81 | syscall_enter_audit(regs, syscall); | |
82 | ||
83 | return ret ? : syscall; | |
84 | } | |
85 | ||
4facb95b TG |
86 | static __always_inline long |
87 | __syscall_enter_from_user_work(struct pt_regs *regs, long syscall) | |
142781e1 | 88 | { |
b86678cf | 89 | unsigned long work = READ_ONCE(current_thread_info()->syscall_work); |
142781e1 | 90 | |
29915524 GKB |
91 | if (work & SYSCALL_WORK_ENTER) |
92 | syscall = syscall_trace_enter(regs, syscall, work); | |
142781e1 TG |
93 | |
94 | return syscall; | |
95 | } | |
96 | ||
/* Out-of-line entry point that forwards to __syscall_enter_from_user_work(). */
long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
	return __syscall_enter_from_user_work(regs, syscall);
}
101 | ||
102 | noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) | |
103 | { | |
104 | long ret; | |
105 | ||
6666bb71 | 106 | __enter_from_user_mode(regs); |
4facb95b TG |
107 | |
108 | instrumentation_begin(); | |
109 | local_irq_enable(); | |
110 | ret = __syscall_enter_from_user_work(regs, syscall); | |
111 | instrumentation_end(); | |
112 | ||
113 | return ret; | |
114 | } | |
115 | ||
116 | noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) | |
117 | { | |
6666bb71 | 118 | __enter_from_user_mode(regs); |
4facb95b TG |
119 | instrumentation_begin(); |
120 | local_irq_enable(); | |
121 | instrumentation_end(); | |
122 | } | |
123 | ||
310de1a6 | 124 | /* See comment for exit_to_user_mode() in entry-common.h */ |
bb793562 | 125 | static __always_inline void __exit_to_user_mode(void) |
a9f3a74a TG |
126 | { |
127 | instrumentation_begin(); | |
128 | trace_hardirqs_on_prepare(); | |
129 | lockdep_hardirqs_on_prepare(CALLER_ADDR0); | |
130 | instrumentation_end(); | |
131 | ||
132 | user_enter_irqoff(); | |
133 | arch_exit_to_user_mode(); | |
134 | lockdep_hardirqs_on(CALLER_ADDR0); | |
135 | } | |
136 | ||
310de1a6 SS |
137 | void noinstr exit_to_user_mode(void) |
138 | { | |
139 | __exit_to_user_mode(); | |
140 | } | |
141 | ||
a9f3a74a | 142 | /* Workaround to allow gradual conversion of architecture code */ |
8ba62d37 | 143 | void __weak arch_do_signal_or_restart(struct pt_regs *regs) { } |
a9f3a74a TG |
144 | |
145 | static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, | |
146 | unsigned long ti_work) | |
147 | { | |
148 | /* | |
149 | * Before returning to user space ensure that all pending work | |
150 | * items have been completed. | |
151 | */ | |
152 | while (ti_work & EXIT_TO_USER_MODE_WORK) { | |
153 | ||
154 | local_irq_enable_exit_to_user(ti_work); | |
155 | ||
156 | if (ti_work & _TIF_NEED_RESCHED) | |
157 | schedule(); | |
158 | ||
159 | if (ti_work & _TIF_UPROBE) | |
160 | uprobe_notify_resume(regs); | |
161 | ||
162 | if (ti_work & _TIF_PATCH_PENDING) | |
163 | klp_update_patch_state(current); | |
164 | ||
12db8b69 | 165 | if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) |
8ba62d37 | 166 | arch_do_signal_or_restart(regs); |
a9f3a74a | 167 | |
a68de80f | 168 | if (ti_work & _TIF_NOTIFY_RESUME) |
03248add | 169 | resume_user_mode_work(regs); |
a9f3a74a TG |
170 | |
171 | /* Architecture specific TIF work */ | |
172 | arch_exit_to_user_mode_work(regs, ti_work); | |
173 | ||
174 | /* | |
175 | * Disable interrupts and reevaluate the work flags as they | |
176 | * might have changed while interrupts and preemption was | |
177 | * enabled above. | |
178 | */ | |
179 | local_irq_disable_exit_to_user(); | |
47b8ff19 FW |
180 | |
181 | /* Check if any of the above work has queued a deferred wakeup */ | |
f268c373 | 182 | tick_nohz_user_enter_prepare(); |
47b8ff19 | 183 | |
6ce89512 | 184 | ti_work = read_thread_flags(); |
a9f3a74a TG |
185 | } |
186 | ||
187 | /* Return the latest work state for arch_exit_to_user_mode() */ | |
188 | return ti_work; | |
189 | } | |
190 | ||
191 | static void exit_to_user_mode_prepare(struct pt_regs *regs) | |
192 | { | |
6ce89512 | 193 | unsigned long ti_work = read_thread_flags(); |
a9f3a74a TG |
194 | |
195 | lockdep_assert_irqs_disabled(); | |
196 | ||
47b8ff19 | 197 | /* Flush pending rcuog wakeup before the last need_resched() check */ |
f268c373 | 198 | tick_nohz_user_enter_prepare(); |
47b8ff19 | 199 | |
a9f3a74a TG |
200 | if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) |
201 | ti_work = exit_to_user_mode_loop(regs, ti_work); | |
202 | ||
203 | arch_exit_to_user_mode_prepare(regs, ti_work); | |
204 | ||
205 | /* Ensure that the address limit is intact and no locks are held */ | |
206 | addr_limit_user_check(); | |
5fbda3ec | 207 | kmap_assert_nomap(); |
a9f3a74a TG |
208 | lockdep_assert_irqs_disabled(); |
209 | lockdep_sys_exit(); | |
210 | } | |
211 | ||
a9f3a74a | 212 | /* |
64eb35f7 | 213 | * If SYSCALL_EMU is set, then the only reason to report is when |
6342adca | 214 | * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall |
900ffe39 | 215 | * instruction has been already reported in syscall_enter_from_user_mode(). |
a9f3a74a | 216 | */ |
64eb35f7 | 217 | static inline bool report_single_step(unsigned long work) |
a9f3a74a | 218 | { |
41c1a06d | 219 | if (work & SYSCALL_WORK_SYSCALL_EMU) |
64eb35f7 GKB |
220 | return false; |
221 | ||
6342adca | 222 | return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP; |
a9f3a74a | 223 | } |
29915524 GKB |
224 | |
225 | static void syscall_exit_work(struct pt_regs *regs, unsigned long work) | |
a9f3a74a TG |
226 | { |
227 | bool step; | |
228 | ||
11894468 GKB |
229 | /* |
230 | * If the syscall was rolled back due to syscall user dispatching, | |
231 | * then the tracers below are not invoked for the same reason as | |
232 | * the entry side was not invoked in syscall_trace_enter(): The ABI | |
233 | * of these syscalls is unknown. | |
234 | */ | |
235 | if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { | |
236 | if (unlikely(current->syscall_dispatch.on_dispatch)) { | |
237 | current->syscall_dispatch.on_dispatch = false; | |
238 | return; | |
239 | } | |
240 | } | |
241 | ||
a9f3a74a TG |
242 | audit_syscall_exit(regs); |
243 | ||
524666cb | 244 | if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) |
a9f3a74a TG |
245 | trace_sys_exit(regs, syscall_get_return_value(current, regs)); |
246 | ||
64eb35f7 | 247 | step = report_single_step(work); |
64c19ba2 | 248 | if (step || work & SYSCALL_WORK_SYSCALL_TRACE) |
0cfcb2b9 | 249 | ptrace_report_syscall_exit(regs, step); |
a9f3a74a TG |
250 | } |
251 | ||
252 | /* | |
253 | * Syscall specific exit to user mode preparation. Runs with interrupts | |
254 | * enabled. | |
255 | */ | |
256 | static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) | |
257 | { | |
b86678cf | 258 | unsigned long work = READ_ONCE(current_thread_info()->syscall_work); |
a9f3a74a TG |
259 | unsigned long nr = syscall_get_nr(current, regs); |
260 | ||
261 | CT_WARN_ON(ct_state() != CONTEXT_KERNEL); | |
262 | ||
263 | if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { | |
264 | if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr)) | |
265 | local_irq_enable(); | |
266 | } | |
267 | ||
268 | rseq_syscall(regs); | |
269 | ||
270 | /* | |
271 | * Do one-time syscall specific work. If these work items are | |
272 | * enabled, we want to run them exactly once per syscall exit with | |
273 | * interrupts enabled. | |
274 | */ | |
29915524 GKB |
275 | if (unlikely(work & SYSCALL_WORK_EXIT)) |
276 | syscall_exit_work(regs, work); | |
a9f3a74a TG |
277 | } |
278 | ||
c6156e1d | 279 | static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs) |
a9f3a74a | 280 | { |
a9f3a74a TG |
281 | syscall_exit_to_user_mode_prepare(regs); |
282 | local_irq_disable_exit_to_user(); | |
283 | exit_to_user_mode_prepare(regs); | |
c6156e1d SS |
284 | } |
285 | ||
/* Out-of-line entry point that forwards to __syscall_exit_to_user_mode_work(). */
void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	__syscall_exit_to_user_mode_work(regs);
}
290 | ||
291 | __visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) | |
292 | { | |
293 | instrumentation_begin(); | |
294 | __syscall_exit_to_user_mode_work(regs); | |
a9f3a74a | 295 | instrumentation_end(); |
bb793562 | 296 | __exit_to_user_mode(); |
a9f3a74a TG |
297 | } |
298 | ||
142781e1 TG |
299 | noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) |
300 | { | |
6666bb71 | 301 | __enter_from_user_mode(regs); |
142781e1 | 302 | } |
a9f3a74a TG |
303 | |
304 | noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) | |
305 | { | |
306 | instrumentation_begin(); | |
307 | exit_to_user_mode_prepare(regs); | |
308 | instrumentation_end(); | |
bb793562 | 309 | __exit_to_user_mode(); |
a9f3a74a | 310 | } |
a5497bab | 311 | |
aadfc2f9 | 312 | noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) |
a5497bab TG |
313 | { |
314 | irqentry_state_t ret = { | |
315 | .exit_rcu = false, | |
316 | }; | |
317 | ||
318 | if (user_mode(regs)) { | |
319 | irqentry_enter_from_user_mode(regs); | |
320 | return ret; | |
321 | } | |
322 | ||
323 | /* | |
324 | * If this entry hit the idle task invoke rcu_irq_enter() whether | |
325 | * RCU is watching or not. | |
326 | * | |
78a56e04 | 327 | * Interrupts can nest when the first interrupt invokes softirq |
a5497bab TG |
328 | * processing on return which enables interrupts. |
329 | * | |
330 | * Scheduler ticks in the idle task can mark quiescent state and | |
331 | * terminate a grace period, if and only if the timer interrupt is | |
332 | * not nested into another interrupt. | |
333 | * | |
7f2a53c2 | 334 | * Checking for rcu_is_watching() here would prevent the nesting |
a5497bab TG |
335 | * interrupt to invoke rcu_irq_enter(). If that nested interrupt is |
336 | * the tick then rcu_flavor_sched_clock_irq() would wrongfully | |
97258ce9 | 337 | * assume that it is the first interrupt and eventually claim |
78a56e04 | 338 | * quiescent state and end grace periods prematurely. |
a5497bab TG |
339 | * |
340 | * Unconditionally invoke rcu_irq_enter() so RCU state stays | |
341 | * consistent. | |
342 | * | |
343 | * TINY_RCU does not support EQS, so let the compiler eliminate | |
344 | * this part when enabled. | |
345 | */ | |
346 | if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) { | |
347 | /* | |
348 | * If RCU is not watching then the same careful | |
349 | * sequence vs. lockdep and tracing is required | |
45ff5105 | 350 | * as in irqentry_enter_from_user_mode(). |
a5497bab TG |
351 | */ |
352 | lockdep_hardirqs_off(CALLER_ADDR0); | |
353 | rcu_irq_enter(); | |
354 | instrumentation_begin(); | |
355 | trace_hardirqs_off_finish(); | |
356 | instrumentation_end(); | |
357 | ||
358 | ret.exit_rcu = true; | |
359 | return ret; | |
360 | } | |
361 | ||
362 | /* | |
363 | * If RCU is watching then RCU only wants to check whether it needs | |
364 | * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick() | |
365 | * already contains a warning when RCU is not watching, so no point | |
366 | * in having another one here. | |
367 | */ | |
9d820f68 | 368 | lockdep_hardirqs_off(CALLER_ADDR0); |
a5497bab TG |
369 | instrumentation_begin(); |
370 | rcu_irq_enter_check_tick(); | |
9d820f68 | 371 | trace_hardirqs_off_finish(); |
a5497bab TG |
372 | instrumentation_end(); |
373 | ||
374 | return ret; | |
375 | } | |
376 | ||
4624a14f | 377 | void raw_irqentry_exit_cond_resched(void) |
a5497bab TG |
378 | { |
379 | if (!preempt_count()) { | |
380 | /* Sanity check RCU and thread stack */ | |
381 | rcu_irq_exit_check_preempt(); | |
382 | if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) | |
383 | WARN_ON_ONCE(!on_thread_stack()); | |
384 | if (need_resched()) | |
385 | preempt_schedule_irq(); | |
386 | } | |
387 | } | |
#ifdef CONFIG_PREEMPT_DYNAMIC
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
/* Static call variant: the default target is raw_irqentry_exit_cond_resched(). */
DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
/* Static key variant: the key is defined as true by default. */
DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);

/* Forward to raw_irqentry_exit_cond_resched() only while the key is enabled. */
void dynamic_irqentry_exit_cond_resched(void)
{
	if (static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
		raw_irqentry_exit_cond_resched();
}
#endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL / CONFIG_HAVE_PREEMPT_DYNAMIC_KEY */
#endif /* CONFIG_PREEMPT_DYNAMIC */
a5497bab | 401 | |
aadfc2f9 | 402 | noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) |
a5497bab TG |
403 | { |
404 | lockdep_assert_irqs_disabled(); | |
405 | ||
406 | /* Check whether this returns to user mode */ | |
407 | if (user_mode(regs)) { | |
408 | irqentry_exit_to_user_mode(regs); | |
409 | } else if (!regs_irqs_disabled(regs)) { | |
410 | /* | |
411 | * If RCU was not watching on entry this needs to be done | |
412 | * carefully and needs the same ordering of lockdep/tracing | |
413 | * and RCU as the return to user mode path. | |
414 | */ | |
415 | if (state.exit_rcu) { | |
416 | instrumentation_begin(); | |
417 | /* Tell the tracer that IRET will enable interrupts */ | |
418 | trace_hardirqs_on_prepare(); | |
419 | lockdep_hardirqs_on_prepare(CALLER_ADDR0); | |
420 | instrumentation_end(); | |
421 | rcu_irq_exit(); | |
422 | lockdep_hardirqs_on(CALLER_ADDR0); | |
423 | return; | |
424 | } | |
425 | ||
426 | instrumentation_begin(); | |
4624a14f | 427 | if (IS_ENABLED(CONFIG_PREEMPTION)) |
a5497bab | 428 | irqentry_exit_cond_resched(); |
4624a14f | 429 | |
a5497bab TG |
430 | /* Covers both tracing and lockdep */ |
431 | trace_hardirqs_on(); | |
432 | instrumentation_end(); | |
433 | } else { | |
434 | /* | |
435 | * IRQ flags state is correct already. Just tell RCU if it | |
436 | * was not watching on entry. | |
437 | */ | |
438 | if (state.exit_rcu) | |
439 | rcu_irq_exit(); | |
440 | } | |
441 | } | |
b6be002b TG |
442 | |
443 | irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs) | |
444 | { | |
445 | irqentry_state_t irq_state; | |
446 | ||
447 | irq_state.lockdep = lockdep_hardirqs_enabled(); | |
448 | ||
449 | __nmi_enter(); | |
450 | lockdep_hardirqs_off(CALLER_ADDR0); | |
451 | lockdep_hardirq_enter(); | |
452 | rcu_nmi_enter(); | |
453 | ||
454 | instrumentation_begin(); | |
455 | trace_hardirqs_off_finish(); | |
456 | ftrace_nmi_enter(); | |
457 | instrumentation_end(); | |
458 | ||
459 | return irq_state; | |
460 | } | |
461 | ||
462 | void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state) | |
463 | { | |
464 | instrumentation_begin(); | |
465 | ftrace_nmi_exit(); | |
466 | if (irq_state.lockdep) { | |
467 | trace_hardirqs_on_prepare(); | |
468 | lockdep_hardirqs_on_prepare(CALLER_ADDR0); | |
469 | } | |
470 | instrumentation_end(); | |
471 | ||
472 | rcu_nmi_exit(); | |
473 | lockdep_hardirq_exit(); | |
474 | if (irq_state.lockdep) | |
475 | lockdep_hardirqs_on(CALLER_ADDR0); | |
476 | __nmi_exit(); | |
477 | } |