Commit | Line | Data |
---|---|---|
bcea3f96 | 1 | // SPDX-License-Identifier: GPL-2.0 |
ac199db0 | 2 | /* |
97d5a220 | 3 | * trace event based perf event profiling/tracing |
ac199db0 | 4 | * |
90eec103 | 5 | * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra |
c530665c | 6 | * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com> |
ac199db0 PZ |
7 | */ |
8 | ||
558e6547 | 9 | #include <linux/module.h> |
430ad5a6 | 10 | #include <linux/kprobes.h> |
ac199db0 | 11 | #include "trace.h" |
e12f03d7 | 12 | #include "trace_probe.h" |
ac199db0 | 13 | |
6016ee13 | 14 | static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; |
20ab4425 | 15 | |
eb1e7961 FW |
16 | /* |
17 | * Force it to be aligned to unsigned long to avoid misaligned accesses | |
18 | * suprises | |
19 | */ | |
20 | typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) | |
21 | perf_trace_t; | |
ce71b9df | 22 | |
20ab4425 | 23 | /* Count the events in use (per event id, not per instance) */ |
97d5a220 | 24 | static int total_ref_count; |
20ab4425 | 25 | |
2425bcb9 | 26 | static int perf_trace_event_perm(struct trace_event_call *tp_event, |
61c32659 FW |
27 | struct perf_event *p_event) |
28 | { | |
d5b5f391 PZ |
29 | if (tp_event->perf_perm) { |
30 | int ret = tp_event->perf_perm(tp_event, p_event); | |
31 | if (ret) | |
32 | return ret; | |
33 | } | |
34 | ||
f4be073d JO |
35 | /* |
36 | * We checked and allowed to create parent, | |
37 | * allow children without checking. | |
38 | */ | |
39 | if (p_event->parent) | |
40 | return 0; | |
41 | ||
42 | /* | |
43 | * It's ok to check current process (owner) permissions in here, | |
44 | * because code below is called only via perf_event_open syscall. | |
45 | */ | |
46 | ||
ced39002 | 47 | /* The ftrace function trace is allowed only for root. */ |
cfa77bc4 JO |
48 | if (ftrace_event_is_function(tp_event)) { |
49 | if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) | |
50 | return -EPERM; | |
51 | ||
0a74c5b3 JO |
52 | if (!is_sampling_event(p_event)) |
53 | return 0; | |
54 | ||
cfa77bc4 JO |
55 | /* |
56 | * We don't allow user space callchains for function trace | |
57 | * event, due to issues with page faults while tracing page | |
58 | * fault handler and its overall trickiness nature. | |
59 | */ | |
60 | if (!p_event->attr.exclude_callchain_user) | |
61 | return -EINVAL; | |
63c45f4b JO |
62 | |
63 | /* | |
64 | * Same reason to disable user stack dump as for user space | |
65 | * callchains above. | |
66 | */ | |
67 | if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER) | |
68 | return -EINVAL; | |
cfa77bc4 | 69 | } |
ced39002 | 70 | |
61c32659 FW |
71 | /* No tracing, just counting, so no obvious leak */ |
72 | if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) | |
73 | return 0; | |
74 | ||
75 | /* Some events are ok to be traced by non-root users... */ | |
76 | if (p_event->attach_state == PERF_ATTACH_TASK) { | |
77 | if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY) | |
78 | return 0; | |
79 | } | |
80 | ||
81 | /* | |
82 | * ...otherwise raw tracepoint data can be a severe data leak, | |
83 | * only allow root to have these. | |
84 | */ | |
85 | if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) | |
86 | return -EPERM; | |
87 | ||
88 | return 0; | |
89 | } | |
90 | ||
2425bcb9 | 91 | static int perf_trace_event_reg(struct trace_event_call *tp_event, |
ceec0b6f | 92 | struct perf_event *p_event) |
e5e25cf4 | 93 | { |
6016ee13 | 94 | struct hlist_head __percpu *list; |
ceec0b6f | 95 | int ret = -ENOMEM; |
1c024eca | 96 | int cpu; |
20ab4425 | 97 | |
1c024eca PZ |
98 | p_event->tp_event = tp_event; |
99 | if (tp_event->perf_refcount++ > 0) | |
e5e25cf4 FW |
100 | return 0; |
101 | ||
1c024eca PZ |
102 | list = alloc_percpu(struct hlist_head); |
103 | if (!list) | |
104 | goto fail; | |
105 | ||
106 | for_each_possible_cpu(cpu) | |
107 | INIT_HLIST_HEAD(per_cpu_ptr(list, cpu)); | |
20ab4425 | 108 | |
1c024eca | 109 | tp_event->perf_events = list; |
e5e25cf4 | 110 | |
97d5a220 | 111 | if (!total_ref_count) { |
6016ee13 | 112 | char __percpu *buf; |
b7e2ecef | 113 | int i; |
20ab4425 | 114 | |
7ae07ea3 | 115 | for (i = 0; i < PERF_NR_CONTEXTS; i++) { |
6016ee13 | 116 | buf = (char __percpu *)alloc_percpu(perf_trace_t); |
b7e2ecef | 117 | if (!buf) |
1c024eca | 118 | goto fail; |
20ab4425 | 119 | |
1c024eca | 120 | perf_trace_buf[i] = buf; |
b7e2ecef | 121 | } |
20ab4425 FW |
122 | } |
123 | ||
ceec0b6f | 124 | ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL); |
1c024eca PZ |
125 | if (ret) |
126 | goto fail; | |
20ab4425 | 127 | |
1c024eca PZ |
128 | total_ref_count++; |
129 | return 0; | |
130 | ||
131 | fail: | |
97d5a220 | 132 | if (!total_ref_count) { |
b7e2ecef PZ |
133 | int i; |
134 | ||
7ae07ea3 | 135 | for (i = 0; i < PERF_NR_CONTEXTS; i++) { |
b7e2ecef PZ |
136 | free_percpu(perf_trace_buf[i]); |
137 | perf_trace_buf[i] = NULL; | |
138 | } | |
fe8e5b5a | 139 | } |
1c024eca PZ |
140 | |
141 | if (!--tp_event->perf_refcount) { | |
142 | free_percpu(tp_event->perf_events); | |
143 | tp_event->perf_events = NULL; | |
fe8e5b5a | 144 | } |
20ab4425 FW |
145 | |
146 | return ret; | |
e5e25cf4 FW |
147 | } |
148 | ||
ceec0b6f JO |
149 | static void perf_trace_event_unreg(struct perf_event *p_event) |
150 | { | |
2425bcb9 | 151 | struct trace_event_call *tp_event = p_event->tp_event; |
ceec0b6f JO |
152 | int i; |
153 | ||
154 | if (--tp_event->perf_refcount > 0) | |
155 | goto out; | |
156 | ||
157 | tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL); | |
158 | ||
159 | /* | |
160 | * Ensure our callback won't be called anymore. The buffers | |
161 | * will be freed after that. | |
162 | */ | |
163 | tracepoint_synchronize_unregister(); | |
164 | ||
165 | free_percpu(tp_event->perf_events); | |
166 | tp_event->perf_events = NULL; | |
167 | ||
168 | if (!--total_ref_count) { | |
169 | for (i = 0; i < PERF_NR_CONTEXTS; i++) { | |
170 | free_percpu(perf_trace_buf[i]); | |
171 | perf_trace_buf[i] = NULL; | |
172 | } | |
173 | } | |
174 | out: | |
175 | module_put(tp_event->mod); | |
176 | } | |
177 | ||
178 | static int perf_trace_event_open(struct perf_event *p_event) | |
179 | { | |
2425bcb9 | 180 | struct trace_event_call *tp_event = p_event->tp_event; |
ceec0b6f JO |
181 | return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event); |
182 | } | |
183 | ||
184 | static void perf_trace_event_close(struct perf_event *p_event) | |
185 | { | |
2425bcb9 | 186 | struct trace_event_call *tp_event = p_event->tp_event; |
ceec0b6f JO |
187 | tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event); |
188 | } | |
189 | ||
/*
 * Run the three setup steps for attaching @p_event to @tp_event:
 * permission check, registration and open.
 *
 * Returns 0 on success or the error of the first step that failed.  When
 * the open step fails the registration is undone again through
 * perf_trace_event_unreg().
 *
 * NOTE(review): perf_trace_event_unreg() ends with module_put(), and
 * perf_trace_init() also calls module_put() whenever this function fails;
 * confirm the module reference count stays balanced when open fails.
 */
static int perf_trace_event_init(struct trace_event_call *tp_event,
				 struct perf_event *p_event)
{
	int err = perf_trace_event_perm(tp_event, p_event);

	if (!err)
		err = perf_trace_event_reg(tp_event, p_event);
	if (err)
		return err;

	err = perf_trace_event_open(p_event);
	if (err)
		perf_trace_event_unreg(p_event);

	return err;
}
211 | ||
1c024eca | 212 | int perf_trace_init(struct perf_event *p_event) |
ac199db0 | 213 | { |
2425bcb9 | 214 | struct trace_event_call *tp_event; |
0022cedd | 215 | u64 event_id = p_event->attr.config; |
20c8928a | 216 | int ret = -EINVAL; |
ac199db0 | 217 | |
20c8928a | 218 | mutex_lock(&event_mutex); |
1c024eca | 219 | list_for_each_entry(tp_event, &ftrace_events, list) { |
ff5f149b | 220 | if (tp_event->event.type == event_id && |
a1d0ce82 | 221 | tp_event->class && tp_event->class->reg && |
1c024eca PZ |
222 | try_module_get(tp_event->mod)) { |
223 | ret = perf_trace_event_init(tp_event, p_event); | |
9cb627d5 LZ |
224 | if (ret) |
225 | module_put(tp_event->mod); | |
20c8928a LZ |
226 | break; |
227 | } | |
ac199db0 | 228 | } |
20c8928a | 229 | mutex_unlock(&event_mutex); |
ac199db0 | 230 | |
20c8928a | 231 | return ret; |
ac199db0 PZ |
232 | } |
233 | ||
ceec0b6f JO |
234 | void perf_trace_destroy(struct perf_event *p_event) |
235 | { | |
236 | mutex_lock(&event_mutex); | |
237 | perf_trace_event_close(p_event); | |
238 | perf_trace_event_unreg(p_event); | |
239 | mutex_unlock(&event_mutex); | |
240 | } | |
241 | ||
e12f03d7 SL |
242 | #ifdef CONFIG_KPROBE_EVENTS |
243 | int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe) | |
244 | { | |
245 | int ret; | |
246 | char *func = NULL; | |
247 | struct trace_event_call *tp_event; | |
248 | ||
249 | if (p_event->attr.kprobe_func) { | |
250 | func = kzalloc(KSYM_NAME_LEN, GFP_KERNEL); | |
251 | if (!func) | |
252 | return -ENOMEM; | |
253 | ret = strncpy_from_user( | |
254 | func, u64_to_user_ptr(p_event->attr.kprobe_func), | |
255 | KSYM_NAME_LEN); | |
5da13ab8 MH |
256 | if (ret == KSYM_NAME_LEN) |
257 | ret = -E2BIG; | |
e12f03d7 SL |
258 | if (ret < 0) |
259 | goto out; | |
260 | ||
261 | if (func[0] == '\0') { | |
262 | kfree(func); | |
263 | func = NULL; | |
264 | } | |
265 | } | |
266 | ||
267 | tp_event = create_local_trace_kprobe( | |
268 | func, (void *)(unsigned long)(p_event->attr.kprobe_addr), | |
269 | p_event->attr.probe_offset, is_retprobe); | |
270 | if (IS_ERR(tp_event)) { | |
271 | ret = PTR_ERR(tp_event); | |
272 | goto out; | |
273 | } | |
274 | ||
6b1340cc | 275 | mutex_lock(&event_mutex); |
e12f03d7 SL |
276 | ret = perf_trace_event_init(tp_event, p_event); |
277 | if (ret) | |
278 | destroy_local_trace_kprobe(tp_event); | |
6b1340cc | 279 | mutex_unlock(&event_mutex); |
e12f03d7 SL |
280 | out: |
281 | kfree(func); | |
282 | return ret; | |
283 | } | |
284 | ||
285 | void perf_kprobe_destroy(struct perf_event *p_event) | |
286 | { | |
6b1340cc | 287 | mutex_lock(&event_mutex); |
e12f03d7 SL |
288 | perf_trace_event_close(p_event); |
289 | perf_trace_event_unreg(p_event); | |
6b1340cc | 290 | mutex_unlock(&event_mutex); |
e12f03d7 SL |
291 | |
292 | destroy_local_trace_kprobe(p_event->tp_event); | |
293 | } | |
294 | #endif /* CONFIG_KPROBE_EVENTS */ | |
295 | ||
33ea4b24 | 296 | #ifdef CONFIG_UPROBE_EVENTS |
a6ca88b2 SL |
297 | int perf_uprobe_init(struct perf_event *p_event, |
298 | unsigned long ref_ctr_offset, bool is_retprobe) | |
33ea4b24 SL |
299 | { |
300 | int ret; | |
301 | char *path = NULL; | |
302 | struct trace_event_call *tp_event; | |
303 | ||
304 | if (!p_event->attr.uprobe_path) | |
305 | return -EINVAL; | |
83540fbc JH |
306 | |
307 | path = strndup_user(u64_to_user_ptr(p_event->attr.uprobe_path), | |
308 | PATH_MAX); | |
309 | if (IS_ERR(path)) { | |
310 | ret = PTR_ERR(path); | |
311 | return (ret == -EINVAL) ? -E2BIG : ret; | |
312 | } | |
33ea4b24 SL |
313 | if (path[0] == '\0') { |
314 | ret = -EINVAL; | |
315 | goto out; | |
316 | } | |
317 | ||
a6ca88b2 SL |
318 | tp_event = create_local_trace_uprobe(path, p_event->attr.probe_offset, |
319 | ref_ctr_offset, is_retprobe); | |
33ea4b24 SL |
320 | if (IS_ERR(tp_event)) { |
321 | ret = PTR_ERR(tp_event); | |
322 | goto out; | |
323 | } | |
324 | ||
325 | /* | |
326 | * local trace_uprobe need to hold event_mutex to call | |
327 | * uprobe_buffer_enable() and uprobe_buffer_disable(). | |
328 | * event_mutex is not required for local trace_kprobes. | |
329 | */ | |
330 | mutex_lock(&event_mutex); | |
331 | ret = perf_trace_event_init(tp_event, p_event); | |
332 | if (ret) | |
333 | destroy_local_trace_uprobe(tp_event); | |
334 | mutex_unlock(&event_mutex); | |
335 | out: | |
336 | kfree(path); | |
337 | return ret; | |
338 | } | |
339 | ||
340 | void perf_uprobe_destroy(struct perf_event *p_event) | |
341 | { | |
342 | mutex_lock(&event_mutex); | |
343 | perf_trace_event_close(p_event); | |
344 | perf_trace_event_unreg(p_event); | |
345 | mutex_unlock(&event_mutex); | |
346 | destroy_local_trace_uprobe(p_event->tp_event); | |
347 | } | |
348 | #endif /* CONFIG_UPROBE_EVENTS */ | |
349 | ||
a4eaf7f1 | 350 | int perf_trace_add(struct perf_event *p_event, int flags) |
e5e25cf4 | 351 | { |
2425bcb9 | 352 | struct trace_event_call *tp_event = p_event->tp_event; |
20ab4425 | 353 | |
a4eaf7f1 PZ |
354 | if (!(flags & PERF_EF_START)) |
355 | p_event->hw.state = PERF_HES_STOPPED; | |
356 | ||
466c81c4 PZ |
357 | /* |
358 | * If TRACE_REG_PERF_ADD returns false; no custom action was performed | |
359 | * and we need to take the default action of enqueueing our event on | |
360 | * the right per-cpu hlist. | |
361 | */ | |
362 | if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) { | |
363 | struct hlist_head __percpu *pcpu_list; | |
364 | struct hlist_head *list; | |
365 | ||
366 | pcpu_list = tp_event->perf_events; | |
367 | if (WARN_ON_ONCE(!pcpu_list)) | |
368 | return -EINVAL; | |
369 | ||
370 | list = this_cpu_ptr(pcpu_list); | |
371 | hlist_add_head_rcu(&p_event->hlist_entry, list); | |
372 | } | |
20ab4425 | 373 | |
466c81c4 | 374 | return 0; |
1c024eca | 375 | } |
20ab4425 | 376 | |
a4eaf7f1 | 377 | void perf_trace_del(struct perf_event *p_event, int flags) |
1c024eca | 378 | { |
2425bcb9 | 379 | struct trace_event_call *tp_event = p_event->tp_event; |
466c81c4 PZ |
380 | |
381 | /* | |
382 | * If TRACE_REG_PERF_DEL returns false; no custom action was performed | |
383 | * and we need to take the default action of dequeueing our event from | |
384 | * the right per-cpu hlist. | |
385 | */ | |
386 | if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event)) | |
387 | hlist_del_rcu(&p_event->hlist_entry); | |
e5e25cf4 FW |
388 | } |
389 | ||
1e1dcd93 | 390 | void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp) |
430ad5a6 | 391 | { |
1c024eca | 392 | char *raw_data; |
1e1dcd93 | 393 | int rctx; |
430ad5a6 | 394 | |
eb1e7961 FW |
395 | BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); |
396 | ||
cd92bf61 | 397 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, |
1e1dcd93 | 398 | "perf buffer not large enough")) |
cd92bf61 ON |
399 | return NULL; |
400 | ||
1e1dcd93 AS |
401 | *rctxp = rctx = perf_swevent_get_recursion_context(); |
402 | if (rctx < 0) | |
1c024eca | 403 | return NULL; |
430ad5a6 | 404 | |
86038c5e | 405 | if (regs) |
1e1dcd93 AS |
406 | *regs = this_cpu_ptr(&__perf_regs[rctx]); |
407 | raw_data = this_cpu_ptr(perf_trace_buf[rctx]); | |
430ad5a6 XG |
408 | |
409 | /* zero the dead bytes from align to not leak stack to user */ | |
eb1e7961 | 410 | memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); |
1e1dcd93 AS |
411 | return raw_data; |
412 | } | |
413 | EXPORT_SYMBOL_GPL(perf_trace_buf_alloc); | |
414 | NOKPROBE_SYMBOL(perf_trace_buf_alloc); | |
415 | ||
416 | void perf_trace_buf_update(void *record, u16 type) | |
417 | { | |
418 | struct trace_entry *entry = record; | |
419 | int pc = preempt_count(); | |
420 | unsigned long flags; | |
430ad5a6 | 421 | |
87f44bbc | 422 | local_save_flags(flags); |
46710f3a | 423 | tracing_generic_entry_update(entry, type, flags, pc); |
430ad5a6 | 424 | } |
1e1dcd93 | 425 | NOKPROBE_SYMBOL(perf_trace_buf_update); |
ced39002 JO |
426 | |
427 | #ifdef CONFIG_FUNCTION_TRACER | |
428 | static void | |
2f5f6ad9 | 429 | perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, |
a1e2e31d | 430 | struct ftrace_ops *ops, struct pt_regs *pt_regs) |
ced39002 JO |
431 | { |
432 | struct ftrace_entry *entry; | |
466c81c4 PZ |
433 | struct perf_event *event; |
434 | struct hlist_head head; | |
ced39002 JO |
435 | struct pt_regs regs; |
436 | int rctx; | |
437 | ||
466c81c4 | 438 | if ((unsigned long)ops->private != smp_processor_id()) |
b8ebfd3f ON |
439 | return; |
440 | ||
466c81c4 PZ |
441 | event = container_of(ops, struct perf_event, ftrace_ops); |
442 | ||
443 | /* | |
444 | * @event->hlist entry is NULL (per INIT_HLIST_NODE), and all | |
445 | * the perf code does is hlist_for_each_entry_rcu(), so we can | |
446 | * get away with simply setting the @head.first pointer in order | |
447 | * to create a singular list. | |
448 | */ | |
449 | head.first = &event->hlist_entry; | |
450 | ||
ced39002 JO |
451 | #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \ |
452 | sizeof(u64)) - sizeof(u32)) | |
453 | ||
454 | BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE); | |
455 | ||
ec5e099d | 456 | memset(®s, 0, sizeof(regs)); |
ced39002 JO |
457 | perf_fetch_caller_regs(®s); |
458 | ||
1e1dcd93 | 459 | entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx); |
ced39002 JO |
460 | if (!entry) |
461 | return; | |
462 | ||
463 | entry->ip = ip; | |
464 | entry->parent_ip = parent_ip; | |
1e1dcd93 | 465 | perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN, |
466c81c4 | 466 | 1, ®s, &head, NULL); |
ced39002 JO |
467 | |
468 | #undef ENTRY_SIZE | |
469 | } | |
470 | ||
471 | static int perf_ftrace_function_register(struct perf_event *event) | |
472 | { | |
473 | struct ftrace_ops *ops = &event->ftrace_ops; | |
474 | ||
1dd311e6 | 475 | ops->flags = FTRACE_OPS_FL_RCU; |
466c81c4 PZ |
476 | ops->func = perf_ftrace_function_call; |
477 | ops->private = (void *)(unsigned long)nr_cpu_ids; | |
478 | ||
ced39002 JO |
479 | return register_ftrace_function(ops); |
480 | } | |
481 | ||
482 | static int perf_ftrace_function_unregister(struct perf_event *event) | |
483 | { | |
484 | struct ftrace_ops *ops = &event->ftrace_ops; | |
5500fa51 JO |
485 | int ret = unregister_ftrace_function(ops); |
486 | ftrace_free_filter(ops); | |
487 | return ret; | |
ced39002 JO |
488 | } |
489 | ||
2425bcb9 | 490 | int perf_ftrace_event_register(struct trace_event_call *call, |
ced39002 JO |
491 | enum trace_reg type, void *data) |
492 | { | |
466c81c4 PZ |
493 | struct perf_event *event = data; |
494 | ||
ced39002 JO |
495 | switch (type) { |
496 | case TRACE_REG_REGISTER: | |
497 | case TRACE_REG_UNREGISTER: | |
498 | break; | |
499 | case TRACE_REG_PERF_REGISTER: | |
500 | case TRACE_REG_PERF_UNREGISTER: | |
501 | return 0; | |
502 | case TRACE_REG_PERF_OPEN: | |
503 | return perf_ftrace_function_register(data); | |
504 | case TRACE_REG_PERF_CLOSE: | |
505 | return perf_ftrace_function_unregister(data); | |
506 | case TRACE_REG_PERF_ADD: | |
466c81c4 PZ |
507 | event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id(); |
508 | return 1; | |
ced39002 | 509 | case TRACE_REG_PERF_DEL: |
466c81c4 PZ |
510 | event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids; |
511 | return 1; | |
ced39002 JO |
512 | } |
513 | ||
514 | return -EINVAL; | |
515 | } | |
516 | #endif /* CONFIG_FUNCTION_TRACER */ |