// SPDX-License-Identifier: GPL-2.0

#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/resume_user_mode.h>
#include <linux/highmem.h>
#include <linux/jump_label.h>
#include <linux/kmsan.h>
#include <linux/livepatch.h>
#include <linux/audit.h>
#include <linux/tick.h>

#include "common.h"

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

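/* Emit the audit syscall entry record when an audit context is active. */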
static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
	if (unlikely(audit_context())) {
		unsigned long args[6];

		syscall_get_arguments(current, regs, args);
		audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
	}
}

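/*
 * Syscall entry slow path, invoked from the syscall entry code when syscall
 * work flags are pending. Handles syscall user dispatch, ptrace, seccomp,
 * the entry tracepoint and audit, and returns the (possibly changed)
 * syscall number to execute, or -1L when the syscall should be skipped.
 */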
long syscall_trace_enter(struct pt_regs *regs, long syscall,
			 unsigned long work)
{
	long ret = 0;

	/*
	 * Handle Syscall User Dispatch. This must come first, since
	 * the ABI here can be something that doesn't make sense for
	 * other syscall_work features.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (syscall_user_dispatch(regs))
			return -1L;
	}

	/* Handle ptrace */
	if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
		ret = ptrace_report_syscall_entry(regs);
		if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
			return -1L;
	}

	/* Do seccomp after ptrace, to catch any tracer changes. */
	if (work & SYSCALL_WORK_SECCOMP) {
		ret = __secure_computing(NULL);
		if (ret == -1L)
			return ret;
	}

	/* Either of the above might have changed the syscall number */
	syscall = syscall_get_nr(current, regs);

	if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
		trace_sys_enter(regs, syscall);

	syscall_enter_audit(regs, syscall);

	return ret ? : syscall;
}

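/*
 * Establish kernel state (lockdep, RCU, tracing) on syscall entry and
 * re-enable interrupts before the syscall entry work runs.
 */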
noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
{
	enter_from_user_mode(regs);
	instrumentation_begin();
	local_irq_enable();
	instrumentation_end();
}

/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }

/**
 * exit_to_user_mode_loop - do any pending work before leaving to user space
 * @regs: Pointer to pt_regs on entry stack
 * @ti_work: TIF work flags as read by the caller
 */
__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
						     unsigned long ti_work)
{
	/*
	 * Before returning to user space ensure that all pending work
	 * items have been completed.
	 */
	while (ti_work & EXIT_TO_USER_MODE_WORK) {

		local_irq_enable_exit_to_user(ti_work);

		if (ti_work & _TIF_NEED_RESCHED)
			schedule();

		if (ti_work & _TIF_UPROBE)
			uprobe_notify_resume(regs);

		if (ti_work & _TIF_PATCH_PENDING)
			klp_update_patch_state(current);

		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
			arch_do_signal_or_restart(regs);

		if (ti_work & _TIF_NOTIFY_RESUME)
			resume_user_mode_work(regs);

		/* Architecture specific TIF work */
		arch_exit_to_user_mode_work(regs, ti_work);

		/*
		 * Disable interrupts and reevaluate the work flags as they
		 * might have changed while interrupts and preemption were
		 * enabled above.
		 */
		local_irq_disable_exit_to_user();

		/* Check if any of the above work has queued a deferred wakeup */
		tick_nohz_user_enter_prepare();

		ti_work = read_thread_flags();
	}

	/* Return the latest work state for arch_exit_to_user_mode() */
	return ti_work;
}

/*
 * If SYSCALL_EMU is set, then the only reason to report is when
 * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
 * instruction has already been reported in syscall_enter_from_user_mode().
 */
static inline bool report_single_step(unsigned long work)
{
	if (work & SYSCALL_WORK_SYSCALL_EMU)
		return false;

	return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
}

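/* Slow path work at syscall exit: audit, tracepoints and ptrace reporting. */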
static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
{
	bool step;

	/*
	 * If the syscall was rolled back due to syscall user dispatching,
	 * then the tracers below are not invoked for the same reason as
	 * the entry side was not invoked in syscall_trace_enter(): The ABI
	 * of these syscalls is unknown.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (unlikely(current->syscall_dispatch.on_dispatch)) {
			current->syscall_dispatch.on_dispatch = false;
			return;
		}
	}

	audit_syscall_exit(regs);

	if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
		trace_sys_exit(regs, syscall_get_return_value(current, regs));

	step = report_single_step(work);
	if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
		ptrace_report_syscall_exit(regs, step);
}

/*
 * Syscall specific exit to user mode preparation. Runs with interrupts
 * enabled.
 */
static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
	unsigned long nr = syscall_get_nr(current, regs);

	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
		if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
			local_irq_enable();
	}

	rseq_syscall(regs);

	/*
	 * Do one-time syscall specific work. If these work items are
	 * enabled, we want to run them exactly once per syscall exit with
	 * interrupts enabled.
	 */
	if (unlikely(work & SYSCALL_WORK_EXIT))
		syscall_exit_work(regs, work);
}

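/*
 * Common syscall exit work: run the syscall specific exit work with
 * interrupts enabled, then disable interrupts and do the generic
 * exit-to-user-mode preparation.
 */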
static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	syscall_exit_to_user_mode_prepare(regs);
	local_irq_disable_exit_to_user();
	exit_to_user_mode_prepare(regs);
}

void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	__syscall_exit_to_user_mode_work(regs);
}

__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	__syscall_exit_to_user_mode_work(regs);
	instrumentation_end();
	exit_to_user_mode();
}

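/*
 * Interrupt/exception entry from and exit to user mode. The actual state
 * transition work is done in enter_from_user_mode() and
 * exit_to_user_mode_prepare().
 */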
noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
	enter_from_user_mode(regs);
}

noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	exit_to_user_mode_prepare(regs);
	instrumentation_end();
	exit_to_user_mode();
}

noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
	irqentry_state_t ret = {
		.exit_rcu = false,
	};

	if (user_mode(regs)) {
		irqentry_enter_from_user_mode(regs);
		return ret;
	}

	/*
	 * If this entry hit the idle task, invoke ct_irq_enter() whether
	 * RCU is watching or not.
	 *
	 * Interrupts can nest when the first interrupt invokes softirq
	 * processing on return, which enables interrupts.
	 *
	 * Scheduler ticks in the idle task can mark quiescent state and
	 * terminate a grace period, if and only if the timer interrupt is
	 * not nested into another interrupt.
	 *
	 * Checking for rcu_is_watching() here would prevent the nesting
	 * interrupt from invoking ct_irq_enter(). If that nested interrupt
	 * is the tick then rcu_flavor_sched_clock_irq() would wrongfully
	 * assume that it is the first interrupt and eventually claim
	 * quiescent state and end grace periods prematurely.
	 *
	 * Unconditionally invoke ct_irq_enter() so RCU state stays
	 * consistent.
	 *
	 * TINY_RCU does not support EQS, so let the compiler eliminate
	 * this part when enabled.
	 */
	if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
		/*
		 * If RCU is not watching then the same careful
		 * sequence vs. lockdep and tracing is required
		 * as in irqentry_enter_from_user_mode().
		 */
		lockdep_hardirqs_off(CALLER_ADDR0);
		ct_irq_enter();
		instrumentation_begin();
		kmsan_unpoison_entry_regs(regs);
		trace_hardirqs_off_finish();
		instrumentation_end();

		ret.exit_rcu = true;
		return ret;
	}

	/*
	 * If RCU is watching then RCU only wants to check whether it needs
	 * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
	 * already contains a warning when RCU is not watching, so no point
	 * in having another one here.
	 */
	lockdep_hardirqs_off(CALLER_ADDR0);
	instrumentation_begin();
	kmsan_unpoison_entry_regs(regs);
	rcu_irq_enter_check_tick();
	trace_hardirqs_off_finish();
	instrumentation_end();

	return ret;
}

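/*
 * Preemption point on return from interrupt to kernel mode, taken when no
 * preempt count is held and a reschedule is pending.
 */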
void raw_irqentry_exit_cond_resched(void)
{
	if (!preempt_count()) {
		/* Sanity check RCU and thread stack */
		rcu_irq_exit_check_preempt();
		if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
			WARN_ON_ONCE(!on_thread_stack());
		if (need_resched())
			preempt_schedule_irq();
	}
}
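
/*
 * With CONFIG_PREEMPT_DYNAMIC the preemption model is selected at boot time.
 * The conditional reschedule above is then dispatched either through a
 * static call or a static key, depending on architecture support.
 */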
#ifdef CONFIG_PREEMPT_DYNAMIC
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
void dynamic_irqentry_exit_cond_resched(void)
{
	if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
		return;
	raw_irqentry_exit_cond_resched();
}
#endif
#endif

noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
	lockdep_assert_irqs_disabled();

	/* Check whether this returns to user mode */
	if (user_mode(regs)) {
		irqentry_exit_to_user_mode(regs);
	} else if (!regs_irqs_disabled(regs)) {
		/*
		 * If RCU was not watching on entry this needs to be done
		 * carefully and needs the same ordering of lockdep/tracing
		 * and RCU as the return to user mode path.
		 */
		if (state.exit_rcu) {
			instrumentation_begin();
			/* Tell the tracer that IRET will enable interrupts */
			trace_hardirqs_on_prepare();
			lockdep_hardirqs_on_prepare();
			instrumentation_end();
			ct_irq_exit();
			lockdep_hardirqs_on(CALLER_ADDR0);
			return;
		}

		instrumentation_begin();
		if (IS_ENABLED(CONFIG_PREEMPTION))
			irqentry_exit_cond_resched();

		/* Covers both tracing and lockdep */
		trace_hardirqs_on();
		instrumentation_end();
	} else {
		/*
		 * IRQ flags state is correct already. Just tell RCU if it
		 * was not watching on entry.
		 */
		if (state.exit_rcu)
			ct_irq_exit();
	}
}

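/*
 * NMI entry/exit. NMIs can hit with interrupts in any lockdep state, so the
 * lockdep hardirq state is captured on entry and restored on exit.
 */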
irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
{
	irqentry_state_t irq_state;

	irq_state.lockdep = lockdep_hardirqs_enabled();

	__nmi_enter();
	lockdep_hardirqs_off(CALLER_ADDR0);
	lockdep_hardirq_enter();
	ct_nmi_enter();

	instrumentation_begin();
	kmsan_unpoison_entry_regs(regs);
	trace_hardirqs_off_finish();
	ftrace_nmi_enter();
	instrumentation_end();

	return irq_state;
}

void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
{
	instrumentation_begin();
	ftrace_nmi_exit();
	if (irq_state.lockdep) {
		trace_hardirqs_on_prepare();
		lockdep_hardirqs_on_prepare();
	}
	instrumentation_end();

	ct_nmi_exit();
	lockdep_hardirq_exit();
	if (irq_state.lockdep)
		lockdep_hardirqs_on(CALLER_ADDR0);
	__nmi_exit();
}