Merge tag 'pm+acpi-4.1-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael...
[linux-2.6-block.git] / kernel / events / core.c
0793a61d 1/*
57c0c15b 2 * Performance events core code:
0793a61d 3 *
98144511 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
e7e7ee2e
IM
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
d36b6910 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
7b732a75 8 *
57c0c15b 9 * For licensing details see kernel-base/COPYING
0793a61d
TG
10 */
11
12#include <linux/fs.h>
b9cacc7b 13#include <linux/mm.h>
0793a61d
TG
14#include <linux/cpu.h>
15#include <linux/smp.h>
2e80a82a 16#include <linux/idr.h>
04289bb9 17#include <linux/file.h>
0793a61d 18#include <linux/poll.h>
5a0e3ad6 19#include <linux/slab.h>
76e1d904 20#include <linux/hash.h>
12351ef8 21#include <linux/tick.h>
0793a61d 22#include <linux/sysfs.h>
22a4f650 23#include <linux/dcache.h>
0793a61d 24#include <linux/percpu.h>
22a4f650 25#include <linux/ptrace.h>
c277443c 26#include <linux/reboot.h>
b9cacc7b 27#include <linux/vmstat.h>
abe43400 28#include <linux/device.h>
6e5fdeed 29#include <linux/export.h>
906010b2 30#include <linux/vmalloc.h>
b9cacc7b
PZ
31#include <linux/hardirq.h>
32#include <linux/rculist.h>
0793a61d
TG
33#include <linux/uaccess.h>
34#include <linux/syscalls.h>
35#include <linux/anon_inodes.h>
aa9c4c0f 36#include <linux/kernel_stat.h>
39bed6cb 37#include <linux/cgroup.h>
cdd6c482 38#include <linux/perf_event.h>
6fb2915d 39#include <linux/ftrace_event.h>
3c502e7a 40#include <linux/hw_breakpoint.h>
c5ebcedb 41#include <linux/mm_types.h>
c464c76e 42#include <linux/module.h>
f972eb63 43#include <linux/mman.h>
b3f20785 44#include <linux/compat.h>
2541517c
AS
45#include <linux/bpf.h>
46#include <linux/filter.h>
0793a61d 47
76369139
FW
48#include "internal.h"
49
4e193bd4
TB
50#include <asm/irq_regs.h>
51
fadfe7be
JO
52static struct workqueue_struct *perf_wq;
53
fe4b04fa 54struct remote_function_call {
e7e7ee2e
IM
55 struct task_struct *p;
56 int (*func)(void *info);
57 void *info;
58 int ret;
fe4b04fa
PZ
59};
60
61static void remote_function(void *data)
62{
63 struct remote_function_call *tfc = data;
64 struct task_struct *p = tfc->p;
65
66 if (p) {
67 tfc->ret = -EAGAIN;
68 if (task_cpu(p) != smp_processor_id() || !task_curr(p))
69 return;
70 }
71
72 tfc->ret = tfc->func(tfc->info);
73}
74
75/**
76 * task_function_call - call a function on the cpu on which a task runs
77 * @p: the task to evaluate
78 * @func: the function to be called
79 * @info: the function call argument
80 *
81 * Calls the function @func when the task is currently running. This might
82 * be on the current CPU, in which case the function is called directly.
83 *
84 * returns: @func return value, or
85 * -ESRCH - when the process isn't running
86 * -EAGAIN - when the process moved away
87 */
88static int
89task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
90{
91 struct remote_function_call data = {
e7e7ee2e
IM
92 .p = p,
93 .func = func,
94 .info = info,
95 .ret = -ESRCH, /* No such (running) process */
fe4b04fa
PZ
96 };
97
98 if (task_curr(p))
99 smp_call_function_single(task_cpu(p), remote_function, &data, 1);
100
101 return data.ret;
102}
103
104/**
105 * cpu_function_call - call a function on a given cpu
106 * @func: the function to be called
107 * @info: the function call argument
108 *
109 * Calls the function @func on the remote cpu.
110 *
111 * returns: @func return value or -ENXIO when the cpu is offline
112 */
113static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
114{
115 struct remote_function_call data = {
e7e7ee2e
IM
116 .p = NULL,
117 .func = func,
118 .info = info,
119 .ret = -ENXIO, /* No such CPU */
fe4b04fa
PZ
120 };
121
122 smp_call_function_single(cpu, remote_function, &data, 1);
123
124 return data.ret;
125}
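/*
 * Illustrative usage sketch (editorial addition, not part of the original
 * source): both helpers take a callback that is run on the target CPU and
 * propagate its return value back through remote_function(), mirroring how
 * callers later in this file use them, e.g.:
 *
 *	cpu_function_call(event->cpu, __perf_remove_from_context, &re);
 *	task_function_call(task, __perf_event_disable, event);
 *
 * Callers can retry on -EAGAIN (the task moved away) or fall back to the
 * locked slow path on -ESRCH / -ENXIO.
 */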
126
f8697762
JO
127#define EVENT_OWNER_KERNEL ((void *) -1)
128
129static bool is_kernel_event(struct perf_event *event)
130{
131 return event->owner == EVENT_OWNER_KERNEL;
132}
133
e5d1367f
SE
134#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
135 PERF_FLAG_FD_OUTPUT |\
a21b0b35
YD
136 PERF_FLAG_PID_CGROUP |\
137 PERF_FLAG_FD_CLOEXEC)
e5d1367f 138
bce38cd5
SE
139/*
140 * branch priv levels that need permission checks
141 */
142#define PERF_SAMPLE_BRANCH_PERM_PLM \
143 (PERF_SAMPLE_BRANCH_KERNEL |\
144 PERF_SAMPLE_BRANCH_HV)
145
0b3fcf17
SE
146enum event_type_t {
147 EVENT_FLEXIBLE = 0x1,
148 EVENT_PINNED = 0x2,
149 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
150};
151
e5d1367f
SE
152/*
153 * perf_sched_events : >0 events exist
154 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
155 */
c5905afb 156struct static_key_deferred perf_sched_events __read_mostly;
e5d1367f 157static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
ba532500 158static DEFINE_PER_CPU(int, perf_sched_cb_usages);
e5d1367f 159
cdd6c482
IM
160static atomic_t nr_mmap_events __read_mostly;
161static atomic_t nr_comm_events __read_mostly;
162static atomic_t nr_task_events __read_mostly;
948b26b6 163static atomic_t nr_freq_events __read_mostly;
9ee318a7 164
108b02cf
PZ
165static LIST_HEAD(pmus);
166static DEFINE_MUTEX(pmus_lock);
167static struct srcu_struct pmus_srcu;
168
0764771d 169/*
cdd6c482 170 * perf event paranoia level:
0fbdea19
IM
171 * -1 - not paranoid at all
172 * 0 - disallow raw tracepoint access for unpriv
cdd6c482 173 * 1 - disallow cpu events for unpriv
0fbdea19 174 * 2 - disallow kernel profiling for unpriv
0764771d 175 */
cdd6c482 176int sysctl_perf_event_paranoid __read_mostly = 1;
0764771d 177
20443384
FW
178/* Minimum for 512 kiB + 1 user control page */
179int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
df58ab24
PZ
180
181/*
cdd6c482 182 * max perf event sample rate
df58ab24 183 */
14c63f17
DH
184#define DEFAULT_MAX_SAMPLE_RATE 100000
185#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
186#define DEFAULT_CPU_TIME_MAX_PERCENT 25
187
188int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
189
190static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
191static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
192
d9494cb4
PZ
193static int perf_sample_allowed_ns __read_mostly =
194 DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
14c63f17
DH
195
196void update_perf_cpu_limits(void)
197{
198 u64 tmp = perf_sample_period_ns;
199
200 tmp *= sysctl_perf_cpu_time_max_percent;
e5302920 201 do_div(tmp, 100);
d9494cb4 202 ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
14c63f17 203}
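/*
 * Worked example (editorial note, derived from the defaults above):
 * with DEFAULT_MAX_SAMPLE_RATE = 100000 the sample period is
 * NSEC_PER_SEC / 100000 = 10,000 ns, and with
 * DEFAULT_CPU_TIME_MAX_PERCENT = 25 the budget becomes
 * 10,000 * 25 / 100 = 2,500 ns of handler time allowed per sample
 * before perf_sample_event_took() starts lowering the sample rate.
 */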
163ec435 204
9e630205
SE
205static int perf_rotate_context(struct perf_cpu_context *cpuctx);
206
163ec435
PZ
207int perf_proc_update_handler(struct ctl_table *table, int write,
208 void __user *buffer, size_t *lenp,
209 loff_t *ppos)
210{
723478c8 211 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
163ec435
PZ
212
213 if (ret || !write)
214 return ret;
215
216 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
14c63f17
DH
217 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
218 update_perf_cpu_limits();
219
220 return 0;
221}
222
223int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
224
225int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
226 void __user *buffer, size_t *lenp,
227 loff_t *ppos)
228{
229 int ret = proc_dointvec(table, write, buffer, lenp, ppos);
230
231 if (ret || !write)
232 return ret;
233
234 update_perf_cpu_limits();
163ec435
PZ
235
236 return 0;
237}
1ccd1549 238
14c63f17
DH
239/*
240 * perf samples are done in some very critical code paths (NMIs).
241 * If they take too much CPU time, the system can lock up and not
242 * get any real work done. This will drop the sample rate when
243 * we detect that events are taking too long.
244 */
245#define NR_ACCUMULATED_SAMPLES 128
d9494cb4 246static DEFINE_PER_CPU(u64, running_sample_length);
14c63f17 247
6a02ad66 248static void perf_duration_warn(struct irq_work *w)
14c63f17 249{
6a02ad66 250 u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
14c63f17 251 u64 avg_local_sample_len;
e5302920 252 u64 local_samples_len;
6a02ad66 253
4a32fea9 254 local_samples_len = __this_cpu_read(running_sample_length);
6a02ad66
PZ
255 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
256
257 printk_ratelimited(KERN_WARNING
258 "perf interrupt took too long (%lld > %lld), lowering "
259 "kernel.perf_event_max_sample_rate to %d\n",
cd578abb 260 avg_local_sample_len, allowed_ns >> 1,
6a02ad66
PZ
261 sysctl_perf_event_sample_rate);
262}
263
264static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
265
266void perf_sample_event_took(u64 sample_len_ns)
267{
d9494cb4 268 u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
6a02ad66
PZ
269 u64 avg_local_sample_len;
270 u64 local_samples_len;
14c63f17 271
d9494cb4 272 if (allowed_ns == 0)
14c63f17
DH
273 return;
274
275 /* decay the counter by 1 average sample */
4a32fea9 276 local_samples_len = __this_cpu_read(running_sample_length);
14c63f17
DH
277 local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
278 local_samples_len += sample_len_ns;
4a32fea9 279 __this_cpu_write(running_sample_length, local_samples_len);
14c63f17
DH
280
281 /*
282 * note: this will be biased artifically low until we have
283 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
284 * from having to maintain a count.
285 */
286 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
287
d9494cb4 288 if (avg_local_sample_len <= allowed_ns)
14c63f17
DH
289 return;
290
291 if (max_samples_per_tick <= 1)
292 return;
293
294 max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
295 sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
296 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
297
14c63f17 298 update_perf_cpu_limits();
6a02ad66 299
cd578abb
PZ
300 if (!irq_work_queue(&perf_duration_work)) {
301 early_printk("perf interrupt took too long (%lld > %lld), lowering "
302 "kernel.perf_event_max_sample_rate to %d\n",
303 avg_local_sample_len, allowed_ns >> 1,
304 sysctl_perf_event_sample_rate);
305 }
14c63f17
DH
306}
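/*
 * Worked example (editorial note): running_sample_length is a decaying
 * accumulator; each sample does x = x - x/128 + sample_len, so its fixed
 * point is x = 128 * sample_len and the reported average converges to the
 * steady-state sample length. If every handler runs for ~3,000 ns, the
 * average settles near 3,000 ns, which exceeds the default 2,500 ns budget,
 * so max_samples_per_tick is halved and kernel.perf_event_max_sample_rate
 * drops accordingly.
 */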
307
cdd6c482 308static atomic64_t perf_event_id;
a96bbc16 309
0b3fcf17
SE
310static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
311 enum event_type_t event_type);
312
313static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
e5d1367f
SE
314 enum event_type_t event_type,
315 struct task_struct *task);
316
317static void update_context_time(struct perf_event_context *ctx);
318static u64 perf_event_time(struct perf_event *event);
0b3fcf17 319
cdd6c482 320void __weak perf_event_print_debug(void) { }
0793a61d 321
84c79910 322extern __weak const char *perf_pmu_name(void)
0793a61d 323{
84c79910 324 return "pmu";
0793a61d
TG
325}
326
0b3fcf17
SE
327static inline u64 perf_clock(void)
328{
329 return local_clock();
330}
331
34f43927
PZ
332static inline u64 perf_event_clock(struct perf_event *event)
333{
334 return event->clock();
335}
336
e5d1367f
SE
337static inline struct perf_cpu_context *
338__get_cpu_context(struct perf_event_context *ctx)
339{
340 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
341}
342
facc4307
PZ
343static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
344 struct perf_event_context *ctx)
345{
346 raw_spin_lock(&cpuctx->ctx.lock);
347 if (ctx)
348 raw_spin_lock(&ctx->lock);
349}
350
351static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
352 struct perf_event_context *ctx)
353{
354 if (ctx)
355 raw_spin_unlock(&ctx->lock);
356 raw_spin_unlock(&cpuctx->ctx.lock);
357}
358
e5d1367f
SE
359#ifdef CONFIG_CGROUP_PERF
360
e5d1367f
SE
361static inline bool
362perf_cgroup_match(struct perf_event *event)
363{
364 struct perf_event_context *ctx = event->ctx;
365 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
366
ef824fa1
TH
367 /* @event doesn't care about cgroup */
368 if (!event->cgrp)
369 return true;
370
371 /* wants specific cgroup scope but @cpuctx isn't associated with any */
372 if (!cpuctx->cgrp)
373 return false;
374
375 /*
376 * Cgroup scoping is recursive. An event enabled for a cgroup is
377 * also enabled for all its descendant cgroups. If @cpuctx's
378 * cgroup is a descendant of @event's (the test covers identity
379 * case), it's a match.
380 */
381 return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
382 event->cgrp->css.cgroup);
e5d1367f
SE
383}
384
e5d1367f
SE
385static inline void perf_detach_cgroup(struct perf_event *event)
386{
4e2ba650 387 css_put(&event->cgrp->css);
e5d1367f
SE
388 event->cgrp = NULL;
389}
390
391static inline int is_cgroup_event(struct perf_event *event)
392{
393 return event->cgrp != NULL;
394}
395
396static inline u64 perf_cgroup_event_time(struct perf_event *event)
397{
398 struct perf_cgroup_info *t;
399
400 t = per_cpu_ptr(event->cgrp->info, event->cpu);
401 return t->time;
402}
403
404static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
405{
406 struct perf_cgroup_info *info;
407 u64 now;
408
409 now = perf_clock();
410
411 info = this_cpu_ptr(cgrp->info);
412
413 info->time += now - info->timestamp;
414 info->timestamp = now;
415}
416
417static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
418{
419 struct perf_cgroup *cgrp_out = cpuctx->cgrp;
420 if (cgrp_out)
421 __update_cgrp_time(cgrp_out);
422}
423
424static inline void update_cgrp_time_from_event(struct perf_event *event)
425{
3f7cce3c
SE
426 struct perf_cgroup *cgrp;
427
e5d1367f 428 /*
3f7cce3c
SE
429 * ensure we access cgroup data only when needed and
430 * when we know the cgroup is pinned (css_get)
e5d1367f 431 */
3f7cce3c 432 if (!is_cgroup_event(event))
e5d1367f
SE
433 return;
434
3f7cce3c
SE
435 cgrp = perf_cgroup_from_task(current);
436 /*
437 * Do not update time when cgroup is not active
438 */
439 if (cgrp == event->cgrp)
440 __update_cgrp_time(event->cgrp);
e5d1367f
SE
441}
442
443static inline void
3f7cce3c
SE
444perf_cgroup_set_timestamp(struct task_struct *task,
445 struct perf_event_context *ctx)
e5d1367f
SE
446{
447 struct perf_cgroup *cgrp;
448 struct perf_cgroup_info *info;
449
3f7cce3c
SE
450 /*
451 * ctx->lock held by caller
452 * ensure we do not access cgroup data
453 * unless we have the cgroup pinned (css_get)
454 */
455 if (!task || !ctx->nr_cgroups)
e5d1367f
SE
456 return;
457
458 cgrp = perf_cgroup_from_task(task);
459 info = this_cpu_ptr(cgrp->info);
3f7cce3c 460 info->timestamp = ctx->timestamp;
e5d1367f
SE
461}
462
463#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
464#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
465
466/*
467 * reschedule events based on the cgroup constraint of task.
468 *
469 * mode SWOUT : schedule out everything
470 * mode SWIN : schedule in based on cgroup for next
471 */
472void perf_cgroup_switch(struct task_struct *task, int mode)
473{
474 struct perf_cpu_context *cpuctx;
475 struct pmu *pmu;
476 unsigned long flags;
477
478 /*
479 * disable interrupts to avoid getting nr_cgroup
480 * changes via __perf_event_disable(). Also
481 * avoids preemption.
482 */
483 local_irq_save(flags);
484
485 /*
486 * we reschedule only in the presence of cgroup
487 * constrained events.
488 */
489 rcu_read_lock();
490
491 list_for_each_entry_rcu(pmu, &pmus, entry) {
e5d1367f 492 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
95cf59ea
PZ
493 if (cpuctx->unique_pmu != pmu)
494 continue; /* ensure we process each cpuctx once */
e5d1367f 495
e5d1367f
SE
496 /*
497 * perf_cgroup_events says at least one
498 * context on this CPU has cgroup events.
499 *
500 * ctx->nr_cgroups reports the number of cgroup
501 * events for a context.
502 */
503 if (cpuctx->ctx.nr_cgroups > 0) {
facc4307
PZ
504 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
505 perf_pmu_disable(cpuctx->ctx.pmu);
e5d1367f
SE
506
507 if (mode & PERF_CGROUP_SWOUT) {
508 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
509 /*
510 * must not be done before ctxswout due
511 * to event_filter_match() in event_sched_out()
512 */
513 cpuctx->cgrp = NULL;
514 }
515
516 if (mode & PERF_CGROUP_SWIN) {
e566b76e 517 WARN_ON_ONCE(cpuctx->cgrp);
95cf59ea
PZ
518 /*
519 * set cgrp before ctxsw in to allow
520 * event_filter_match() to not have to pass
521 * task around
e5d1367f
SE
522 */
523 cpuctx->cgrp = perf_cgroup_from_task(task);
524 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
525 }
facc4307
PZ
526 perf_pmu_enable(cpuctx->ctx.pmu);
527 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
e5d1367f 528 }
e5d1367f
SE
529 }
530
531 rcu_read_unlock();
532
533 local_irq_restore(flags);
534}
535
a8d757ef
SE
536static inline void perf_cgroup_sched_out(struct task_struct *task,
537 struct task_struct *next)
e5d1367f 538{
a8d757ef
SE
539 struct perf_cgroup *cgrp1;
540 struct perf_cgroup *cgrp2 = NULL;
541
542 /*
543 * we come here when we know perf_cgroup_events > 0
544 */
545 cgrp1 = perf_cgroup_from_task(task);
546
547 /*
548 * next is NULL when called from perf_event_enable_on_exec()
549 * that will systematically cause a cgroup_switch()
550 */
551 if (next)
552 cgrp2 = perf_cgroup_from_task(next);
553
554 /*
555 * only schedule out current cgroup events if we know
556 * that we are switching to a different cgroup. Otherwise,
557 * do not touch the cgroup events.
558 */
559 if (cgrp1 != cgrp2)
560 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
e5d1367f
SE
561}
562
a8d757ef
SE
563static inline void perf_cgroup_sched_in(struct task_struct *prev,
564 struct task_struct *task)
e5d1367f 565{
a8d757ef
SE
566 struct perf_cgroup *cgrp1;
567 struct perf_cgroup *cgrp2 = NULL;
568
569 /*
570 * we come here when we know perf_cgroup_events > 0
571 */
572 cgrp1 = perf_cgroup_from_task(task);
573
574 /* prev can never be NULL */
575 cgrp2 = perf_cgroup_from_task(prev);
576
577 /*
578 * only need to schedule in cgroup events if we are changing
579 * cgroup during ctxsw. Cgroup events were not scheduled
580 * out during ctxsw if that was not the case.
581 */
582 if (cgrp1 != cgrp2)
583 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
e5d1367f
SE
584}
585
586static inline int perf_cgroup_connect(int fd, struct perf_event *event,
587 struct perf_event_attr *attr,
588 struct perf_event *group_leader)
589{
590 struct perf_cgroup *cgrp;
591 struct cgroup_subsys_state *css;
2903ff01
AV
592 struct fd f = fdget(fd);
593 int ret = 0;
e5d1367f 594
2903ff01 595 if (!f.file)
e5d1367f
SE
596 return -EBADF;
597
b583043e 598 css = css_tryget_online_from_dir(f.file->f_path.dentry,
ec903c0c 599 &perf_event_cgrp_subsys);
3db272c0
LZ
600 if (IS_ERR(css)) {
601 ret = PTR_ERR(css);
602 goto out;
603 }
e5d1367f
SE
604
605 cgrp = container_of(css, struct perf_cgroup, css);
606 event->cgrp = cgrp;
607
608 /*
609 * all events in a group must monitor
610 * the same cgroup because a task belongs
611 * to only one perf cgroup at a time
612 */
613 if (group_leader && group_leader->cgrp != cgrp) {
614 perf_detach_cgroup(event);
615 ret = -EINVAL;
e5d1367f 616 }
3db272c0 617out:
2903ff01 618 fdput(f);
e5d1367f
SE
619 return ret;
620}
621
622static inline void
623perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
624{
625 struct perf_cgroup_info *t;
626 t = per_cpu_ptr(event->cgrp->info, event->cpu);
627 event->shadow_ctx_time = now - t->timestamp;
628}
629
630static inline void
631perf_cgroup_defer_enabled(struct perf_event *event)
632{
633 /*
634 * when the current task's perf cgroup does not match
635 * the event's, we need to remember to call the
636 * perf_cgroup_mark_enabled() function the first time a task with
637 * a matching perf cgroup is scheduled in.
638 */
639 if (is_cgroup_event(event) && !perf_cgroup_match(event))
640 event->cgrp_defer_enabled = 1;
641}
642
643static inline void
644perf_cgroup_mark_enabled(struct perf_event *event,
645 struct perf_event_context *ctx)
646{
647 struct perf_event *sub;
648 u64 tstamp = perf_event_time(event);
649
650 if (!event->cgrp_defer_enabled)
651 return;
652
653 event->cgrp_defer_enabled = 0;
654
655 event->tstamp_enabled = tstamp - event->total_time_enabled;
656 list_for_each_entry(sub, &event->sibling_list, group_entry) {
657 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
658 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
659 sub->cgrp_defer_enabled = 0;
660 }
661 }
662}
663#else /* !CONFIG_CGROUP_PERF */
664
665static inline bool
666perf_cgroup_match(struct perf_event *event)
667{
668 return true;
669}
670
671static inline void perf_detach_cgroup(struct perf_event *event)
672{}
673
674static inline int is_cgroup_event(struct perf_event *event)
675{
676 return 0;
677}
678
679static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
680{
681 return 0;
682}
683
684static inline void update_cgrp_time_from_event(struct perf_event *event)
685{
686}
687
688static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
689{
690}
691
a8d757ef
SE
692static inline void perf_cgroup_sched_out(struct task_struct *task,
693 struct task_struct *next)
e5d1367f
SE
694{
695}
696
a8d757ef
SE
697static inline void perf_cgroup_sched_in(struct task_struct *prev,
698 struct task_struct *task)
e5d1367f
SE
699{
700}
701
702static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
703 struct perf_event_attr *attr,
704 struct perf_event *group_leader)
705{
706 return -EINVAL;
707}
708
709static inline void
3f7cce3c
SE
710perf_cgroup_set_timestamp(struct task_struct *task,
711 struct perf_event_context *ctx)
e5d1367f
SE
712{
713}
714
715void
716perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
717{
718}
719
720static inline void
721perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
722{
723}
724
725static inline u64 perf_cgroup_event_time(struct perf_event *event)
726{
727 return 0;
728}
729
730static inline void
731perf_cgroup_defer_enabled(struct perf_event *event)
732{
733}
734
735static inline void
736perf_cgroup_mark_enabled(struct perf_event *event,
737 struct perf_event_context *ctx)
738{
739}
740#endif
741
9e630205
SE
742/*
743 * set default to be dependent on timer tick just
744 * like original code
745 */
746#define PERF_CPU_HRTIMER (1000 / HZ)
747/*
748 * function must be called with interrupts disabled
749 */
750static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
751{
752 struct perf_cpu_context *cpuctx;
753 enum hrtimer_restart ret = HRTIMER_NORESTART;
754 int rotations = 0;
755
756 WARN_ON(!irqs_disabled());
757
758 cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
759
760 rotations = perf_rotate_context(cpuctx);
761
762 /*
763 * arm timer if needed
764 */
765 if (rotations) {
766 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
767 ret = HRTIMER_RESTART;
768 }
769
770 return ret;
771}
772
773/* CPU is going down */
774void perf_cpu_hrtimer_cancel(int cpu)
775{
776 struct perf_cpu_context *cpuctx;
777 struct pmu *pmu;
778 unsigned long flags;
779
780 if (WARN_ON(cpu != smp_processor_id()))
781 return;
782
783 local_irq_save(flags);
784
785 rcu_read_lock();
786
787 list_for_each_entry_rcu(pmu, &pmus, entry) {
788 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
789
790 if (pmu->task_ctx_nr == perf_sw_context)
791 continue;
792
793 hrtimer_cancel(&cpuctx->hrtimer);
794 }
795
796 rcu_read_unlock();
797
798 local_irq_restore(flags);
799}
800
801static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
802{
803 struct hrtimer *hr = &cpuctx->hrtimer;
804 struct pmu *pmu = cpuctx->ctx.pmu;
62b85639 805 int timer;
9e630205
SE
806
807 /* no multiplexing needed for SW PMU */
808 if (pmu->task_ctx_nr == perf_sw_context)
809 return;
810
62b85639
SE
811 /*
812 * check that the default is sane; if not set, force the
813 * default interval (1/tick)
814 */
815 timer = pmu->hrtimer_interval_ms;
816 if (timer < 1)
817 timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
818
819 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
9e630205
SE
820
821 hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
822 hr->function = perf_cpu_hrtimer_handler;
823}
824
825static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
826{
827 struct hrtimer *hr = &cpuctx->hrtimer;
828 struct pmu *pmu = cpuctx->ctx.pmu;
829
830 /* not for SW PMU */
831 if (pmu->task_ctx_nr == perf_sw_context)
832 return;
833
834 if (hrtimer_active(hr))
835 return;
836
837 if (!hrtimer_callback_running(hr))
838 __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
839 0, HRTIMER_MODE_REL_PINNED, 0);
840}
841
33696fc0 842void perf_pmu_disable(struct pmu *pmu)
9e35ad38 843{
33696fc0
PZ
844 int *count = this_cpu_ptr(pmu->pmu_disable_count);
845 if (!(*count)++)
846 pmu->pmu_disable(pmu);
9e35ad38 847}
9e35ad38 848
33696fc0 849void perf_pmu_enable(struct pmu *pmu)
9e35ad38 850{
33696fc0
PZ
851 int *count = this_cpu_ptr(pmu->pmu_disable_count);
852 if (!--(*count))
853 pmu->pmu_enable(pmu);
9e35ad38 854}
9e35ad38 855
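/*
 * Editorial note: perf_pmu_disable()/perf_pmu_enable() nest via the per-cpu
 * pmu_disable_count above; only the outermost disable (0 -> 1) reaches
 * pmu->pmu_disable() and only the matching outermost enable (1 -> 0) reaches
 * pmu->pmu_enable(), so a sequence like
 *
 *	perf_pmu_disable(pmu);
 *	perf_pmu_disable(pmu);	// no additional callback
 *	perf_pmu_enable(pmu);	// no callback yet
 *	perf_pmu_enable(pmu);	// pmu->pmu_enable() runs here
 *
 * is safe as long as every disable is paired with an enable.
 */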
2fde4f94 856static DEFINE_PER_CPU(struct list_head, active_ctx_list);
e9d2b064
PZ
857
858/*
2fde4f94
MR
859 * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
860 * perf_event_task_tick() are fully serialized because they're strictly cpu
861 * affine and perf_event_ctx{activate,deactivate} are called with IRQs
862 * disabled, while perf_event_task_tick is called from IRQ context.
e9d2b064 863 */
2fde4f94 864static void perf_event_ctx_activate(struct perf_event_context *ctx)
9e35ad38 865{
2fde4f94 866 struct list_head *head = this_cpu_ptr(&active_ctx_list);
b5ab4cd5 867
e9d2b064 868 WARN_ON(!irqs_disabled());
b5ab4cd5 869
2fde4f94
MR
870 WARN_ON(!list_empty(&ctx->active_ctx_list));
871
872 list_add(&ctx->active_ctx_list, head);
873}
874
875static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
876{
877 WARN_ON(!irqs_disabled());
878
879 WARN_ON(list_empty(&ctx->active_ctx_list));
880
881 list_del_init(&ctx->active_ctx_list);
9e35ad38 882}
9e35ad38 883
cdd6c482 884static void get_ctx(struct perf_event_context *ctx)
a63eaf34 885{
e5289d4a 886 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
a63eaf34
PM
887}
888
4af57ef2
YZ
889static void free_ctx(struct rcu_head *head)
890{
891 struct perf_event_context *ctx;
892
893 ctx = container_of(head, struct perf_event_context, rcu_head);
894 kfree(ctx->task_ctx_data);
895 kfree(ctx);
896}
897
cdd6c482 898static void put_ctx(struct perf_event_context *ctx)
a63eaf34 899{
564c2b21
PM
900 if (atomic_dec_and_test(&ctx->refcount)) {
901 if (ctx->parent_ctx)
902 put_ctx(ctx->parent_ctx);
c93f7669
PM
903 if (ctx->task)
904 put_task_struct(ctx->task);
4af57ef2 905 call_rcu(&ctx->rcu_head, free_ctx);
564c2b21 906 }
a63eaf34
PM
907}
908
f63a8daa
PZ
909/*
910 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
911 * perf_pmu_migrate_context() we need some magic.
912 *
913 * Those places that change perf_event::ctx will hold both
914 * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
915 *
916 * Lock ordering is by mutex address. There is one other site where
917 * perf_event_context::mutex nests and that is put_event(). But remember that
918 * that is a parent<->child context relation, and migration does not affect
919 * children, therefore these two orderings should not interact.
920 *
921 * The change in perf_event::ctx does not affect children (as claimed above)
922 * because the sys_perf_event_open() case will install a new event and break
923 * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
924 * concerned with cpuctx and that doesn't have children.
925 *
926 * The places that change perf_event::ctx will issue:
927 *
928 * perf_remove_from_context();
929 * synchronize_rcu();
930 * perf_install_in_context();
931 *
932 * to effect the change. The remove_from_context() + synchronize_rcu() should
933 * quiesce the event, after which we can install it in the new location. This
934 * means that only external vectors (perf_fops, prctl) can perturb the event
935 * while in transit. Therefore all such accessors should also acquire
936 * perf_event_context::mutex to serialize against this.
937 *
938 * However; because event->ctx can change while we're waiting to acquire
939 * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
940 * function.
941 *
942 * Lock order:
943 * task_struct::perf_event_mutex
944 * perf_event_context::mutex
945 * perf_event_context::lock
946 * perf_event::child_mutex;
947 * perf_event::mmap_mutex
948 * mmap_sem
949 */
a83fe28e
PZ
950static struct perf_event_context *
951perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
f63a8daa
PZ
952{
953 struct perf_event_context *ctx;
954
955again:
956 rcu_read_lock();
957 ctx = ACCESS_ONCE(event->ctx);
958 if (!atomic_inc_not_zero(&ctx->refcount)) {
959 rcu_read_unlock();
960 goto again;
961 }
962 rcu_read_unlock();
963
a83fe28e 964 mutex_lock_nested(&ctx->mutex, nesting);
f63a8daa
PZ
965 if (event->ctx != ctx) {
966 mutex_unlock(&ctx->mutex);
967 put_ctx(ctx);
968 goto again;
969 }
970
971 return ctx;
972}
973
a83fe28e
PZ
974static inline struct perf_event_context *
975perf_event_ctx_lock(struct perf_event *event)
976{
977 return perf_event_ctx_lock_nested(event, 0);
978}
979
f63a8daa
PZ
980static void perf_event_ctx_unlock(struct perf_event *event,
981 struct perf_event_context *ctx)
982{
983 mutex_unlock(&ctx->mutex);
984 put_ctx(ctx);
985}
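/*
 * Illustrative usage sketch (editorial addition): external entry points pin
 * the context exactly like perf_event_disable() does further below:
 *
 *	struct perf_event_context *ctx = perf_event_ctx_lock(event);
 *	_perf_event_disable(event);
 *	perf_event_ctx_unlock(event, ctx);
 *
 * which guarantees event->ctx cannot change underneath the operation.
 */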
986
211de6eb
PZ
987/*
988 * This must be done under the ctx->lock, such as to serialize against
989 * context_equiv(), therefore we cannot call put_ctx() since that might end up
990 * calling scheduler related locks and ctx->lock nests inside those.
991 */
992static __must_check struct perf_event_context *
993unclone_ctx(struct perf_event_context *ctx)
71a851b4 994{
211de6eb
PZ
995 struct perf_event_context *parent_ctx = ctx->parent_ctx;
996
997 lockdep_assert_held(&ctx->lock);
998
999 if (parent_ctx)
71a851b4 1000 ctx->parent_ctx = NULL;
5a3126d4 1001 ctx->generation++;
211de6eb
PZ
1002
1003 return parent_ctx;
71a851b4
PZ
1004}
1005
6844c09d
ACM
1006static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1007{
1008 /*
1009 * only top level events have the pid namespace they were created in
1010 */
1011 if (event->parent)
1012 event = event->parent;
1013
1014 return task_tgid_nr_ns(p, event->ns);
1015}
1016
1017static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1018{
1019 /*
1020 * only top level events have the pid namespace they were created in
1021 */
1022 if (event->parent)
1023 event = event->parent;
1024
1025 return task_pid_nr_ns(p, event->ns);
1026}
1027
7f453c24 1028/*
cdd6c482 1029 * If we inherit events we want to return the parent event id
7f453c24
PZ
1030 * to userspace.
1031 */
cdd6c482 1032static u64 primary_event_id(struct perf_event *event)
7f453c24 1033{
cdd6c482 1034 u64 id = event->id;
7f453c24 1035
cdd6c482
IM
1036 if (event->parent)
1037 id = event->parent->id;
7f453c24
PZ
1038
1039 return id;
1040}
1041
25346b93 1042/*
cdd6c482 1043 * Get the perf_event_context for a task and lock it.
25346b93
PM
1044 * This has to cope with the fact that until it is locked,
1045 * the context could get moved to another task.
1046 */
cdd6c482 1047static struct perf_event_context *
8dc85d54 1048perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
25346b93 1049{
cdd6c482 1050 struct perf_event_context *ctx;
25346b93 1051
9ed6060d 1052retry:
058ebd0e
PZ
1053 /*
1054 * One of the few rules of preemptible RCU is that one cannot do
1055 * rcu_read_unlock() while holding a scheduler (or nested) lock when
1056 * part of the read side critical section was preemptible -- see
1057 * rcu_read_unlock_special().
1058 *
1059 * Since ctx->lock nests under rq->lock we must ensure the entire read
1060 * side critical section is non-preemptible.
1061 */
1062 preempt_disable();
1063 rcu_read_lock();
8dc85d54 1064 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
25346b93
PM
1065 if (ctx) {
1066 /*
1067 * If this context is a clone of another, it might
1068 * get swapped for another underneath us by
cdd6c482 1069 * perf_event_task_sched_out, though the
25346b93
PM
1070 * rcu_read_lock() protects us from any context
1071 * getting freed. Lock the context and check if it
1072 * got swapped before we could get the lock, and retry
1073 * if so. If we locked the right context, then it
1074 * can't get swapped on us any more.
1075 */
e625cce1 1076 raw_spin_lock_irqsave(&ctx->lock, *flags);
8dc85d54 1077 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
e625cce1 1078 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
058ebd0e
PZ
1079 rcu_read_unlock();
1080 preempt_enable();
25346b93
PM
1081 goto retry;
1082 }
b49a9e7e
PZ
1083
1084 if (!atomic_inc_not_zero(&ctx->refcount)) {
e625cce1 1085 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
b49a9e7e
PZ
1086 ctx = NULL;
1087 }
25346b93
PM
1088 }
1089 rcu_read_unlock();
058ebd0e 1090 preempt_enable();
25346b93
PM
1091 return ctx;
1092}
1093
1094/*
1095 * Get the context for a task and increment its pin_count so it
1096 * can't get swapped to another task. This also increments its
1097 * reference count so that the context can't get freed.
1098 */
8dc85d54
PZ
1099static struct perf_event_context *
1100perf_pin_task_context(struct task_struct *task, int ctxn)
25346b93 1101{
cdd6c482 1102 struct perf_event_context *ctx;
25346b93
PM
1103 unsigned long flags;
1104
8dc85d54 1105 ctx = perf_lock_task_context(task, ctxn, &flags);
25346b93
PM
1106 if (ctx) {
1107 ++ctx->pin_count;
e625cce1 1108 raw_spin_unlock_irqrestore(&ctx->lock, flags);
25346b93
PM
1109 }
1110 return ctx;
1111}
1112
cdd6c482 1113static void perf_unpin_context(struct perf_event_context *ctx)
25346b93
PM
1114{
1115 unsigned long flags;
1116
e625cce1 1117 raw_spin_lock_irqsave(&ctx->lock, flags);
25346b93 1118 --ctx->pin_count;
e625cce1 1119 raw_spin_unlock_irqrestore(&ctx->lock, flags);
25346b93
PM
1120}
1121
f67218c3
PZ
1122/*
1123 * Update the record of the current time in a context.
1124 */
1125static void update_context_time(struct perf_event_context *ctx)
1126{
1127 u64 now = perf_clock();
1128
1129 ctx->time += now - ctx->timestamp;
1130 ctx->timestamp = now;
1131}
1132
4158755d
SE
1133static u64 perf_event_time(struct perf_event *event)
1134{
1135 struct perf_event_context *ctx = event->ctx;
e5d1367f
SE
1136
1137 if (is_cgroup_event(event))
1138 return perf_cgroup_event_time(event);
1139
4158755d
SE
1140 return ctx ? ctx->time : 0;
1141}
1142
f67218c3
PZ
1143/*
1144 * Update the total_time_enabled and total_time_running fields for an event.
b7526f0c 1145 * The caller of this function needs to hold the ctx->lock.
f67218c3
PZ
1146 */
1147static void update_event_times(struct perf_event *event)
1148{
1149 struct perf_event_context *ctx = event->ctx;
1150 u64 run_end;
1151
1152 if (event->state < PERF_EVENT_STATE_INACTIVE ||
1153 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
1154 return;
e5d1367f
SE
1155 /*
1156 * in cgroup mode, time_enabled represents
1157 * the time the event was enabled AND active
1158 * tasks were in the monitored cgroup. This is
1159 * independent of the activity of the context as
1160 * there may be a mix of cgroup and non-cgroup events.
1161 *
1162 * That is why we treat cgroup events differently
1163 * here.
1164 */
1165 if (is_cgroup_event(event))
46cd6a7f 1166 run_end = perf_cgroup_event_time(event);
e5d1367f
SE
1167 else if (ctx->is_active)
1168 run_end = ctx->time;
acd1d7c1
PZ
1169 else
1170 run_end = event->tstamp_stopped;
1171
1172 event->total_time_enabled = run_end - event->tstamp_enabled;
f67218c3
PZ
1173
1174 if (event->state == PERF_EVENT_STATE_INACTIVE)
1175 run_end = event->tstamp_stopped;
1176 else
4158755d 1177 run_end = perf_event_time(event);
f67218c3
PZ
1178
1179 event->total_time_running = run_end - event->tstamp_running;
e5d1367f 1180
f67218c3
PZ
1181}
1182
96c21a46
PZ
1183/*
1184 * Update total_time_enabled and total_time_running for all events in a group.
1185 */
1186static void update_group_times(struct perf_event *leader)
1187{
1188 struct perf_event *event;
1189
1190 update_event_times(leader);
1191 list_for_each_entry(event, &leader->sibling_list, group_entry)
1192 update_event_times(event);
1193}
1194
889ff015
FW
1195static struct list_head *
1196ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
1197{
1198 if (event->attr.pinned)
1199 return &ctx->pinned_groups;
1200 else
1201 return &ctx->flexible_groups;
1202}
1203
fccc714b 1204/*
cdd6c482 1205 * Add an event to the lists for its context.
fccc714b
PZ
1206 * Must be called with ctx->mutex and ctx->lock held.
1207 */
04289bb9 1208static void
cdd6c482 1209list_add_event(struct perf_event *event, struct perf_event_context *ctx)
04289bb9 1210{
8a49542c
PZ
1211 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1212 event->attach_state |= PERF_ATTACH_CONTEXT;
04289bb9
IM
1213
1214 /*
8a49542c
PZ
1215 * If we're a standalone event or group leader, we go to the context
1216 * list; group events are kept attached to the group so that
1217 * perf_group_detach can, at all times, locate all siblings.
04289bb9 1218 */
8a49542c 1219 if (event->group_leader == event) {
889ff015
FW
1220 struct list_head *list;
1221
d6f962b5
FW
1222 if (is_software_event(event))
1223 event->group_flags |= PERF_GROUP_SOFTWARE;
1224
889ff015
FW
1225 list = ctx_group_list(event, ctx);
1226 list_add_tail(&event->group_entry, list);
5c148194 1227 }
592903cd 1228
08309379 1229 if (is_cgroup_event(event))
e5d1367f 1230 ctx->nr_cgroups++;
e5d1367f 1231
cdd6c482
IM
1232 list_add_rcu(&event->event_entry, &ctx->event_list);
1233 ctx->nr_events++;
1234 if (event->attr.inherit_stat)
bfbd3381 1235 ctx->nr_stat++;
5a3126d4
PZ
1236
1237 ctx->generation++;
04289bb9
IM
1238}
1239
0231bb53
JO
1240/*
1241 * Initialize event state based on the perf_event_attr::disabled.
1242 */
1243static inline void perf_event__state_init(struct perf_event *event)
1244{
1245 event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1246 PERF_EVENT_STATE_INACTIVE;
1247}
1248
c320c7b7
ACM
1249/*
1250 * Called at perf_event creation and when events are attached/detached from a
1251 * group.
1252 */
1253static void perf_event__read_size(struct perf_event *event)
1254{
1255 int entry = sizeof(u64); /* value */
1256 int size = 0;
1257 int nr = 1;
1258
1259 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1260 size += sizeof(u64);
1261
1262 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1263 size += sizeof(u64);
1264
1265 if (event->attr.read_format & PERF_FORMAT_ID)
1266 entry += sizeof(u64);
1267
1268 if (event->attr.read_format & PERF_FORMAT_GROUP) {
1269 nr += event->group_leader->nr_siblings;
1270 size += sizeof(u64);
1271 }
1272
1273 size += entry * nr;
1274 event->read_size = size;
1275}
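/*
 * Worked example (editorial note): with read_format =
 * PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING |
 * PERF_FORMAT_ID | PERF_FORMAT_GROUP and a leader with two siblings:
 * entry = 8 (value) + 8 (id) = 16, size = 8 + 8 + 8 (nr field) = 24,
 * nr = 1 + 2 = 3, so read_size = 24 + 16 * 3 = 72 bytes.
 */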
1276
1277static void perf_event__header_size(struct perf_event *event)
1278{
1279 struct perf_sample_data *data;
1280 u64 sample_type = event->attr.sample_type;
1281 u16 size = 0;
1282
1283 perf_event__read_size(event);
1284
1285 if (sample_type & PERF_SAMPLE_IP)
1286 size += sizeof(data->ip);
1287
6844c09d
ACM
1288 if (sample_type & PERF_SAMPLE_ADDR)
1289 size += sizeof(data->addr);
1290
1291 if (sample_type & PERF_SAMPLE_PERIOD)
1292 size += sizeof(data->period);
1293
c3feedf2
AK
1294 if (sample_type & PERF_SAMPLE_WEIGHT)
1295 size += sizeof(data->weight);
1296
6844c09d
ACM
1297 if (sample_type & PERF_SAMPLE_READ)
1298 size += event->read_size;
1299
d6be9ad6
SE
1300 if (sample_type & PERF_SAMPLE_DATA_SRC)
1301 size += sizeof(data->data_src.val);
1302
fdfbbd07
AK
1303 if (sample_type & PERF_SAMPLE_TRANSACTION)
1304 size += sizeof(data->txn);
1305
6844c09d
ACM
1306 event->header_size = size;
1307}
1308
1309static void perf_event__id_header_size(struct perf_event *event)
1310{
1311 struct perf_sample_data *data;
1312 u64 sample_type = event->attr.sample_type;
1313 u16 size = 0;
1314
c320c7b7
ACM
1315 if (sample_type & PERF_SAMPLE_TID)
1316 size += sizeof(data->tid_entry);
1317
1318 if (sample_type & PERF_SAMPLE_TIME)
1319 size += sizeof(data->time);
1320
ff3d527c
AH
1321 if (sample_type & PERF_SAMPLE_IDENTIFIER)
1322 size += sizeof(data->id);
1323
c320c7b7
ACM
1324 if (sample_type & PERF_SAMPLE_ID)
1325 size += sizeof(data->id);
1326
1327 if (sample_type & PERF_SAMPLE_STREAM_ID)
1328 size += sizeof(data->stream_id);
1329
1330 if (sample_type & PERF_SAMPLE_CPU)
1331 size += sizeof(data->cpu_entry);
1332
6844c09d 1333 event->id_header_size = size;
c320c7b7
ACM
1334}
1335
8a49542c
PZ
1336static void perf_group_attach(struct perf_event *event)
1337{
c320c7b7 1338 struct perf_event *group_leader = event->group_leader, *pos;
8a49542c 1339
74c3337c
PZ
1340 /*
1341 * We can have double attach due to group movement in perf_event_open.
1342 */
1343 if (event->attach_state & PERF_ATTACH_GROUP)
1344 return;
1345
8a49542c
PZ
1346 event->attach_state |= PERF_ATTACH_GROUP;
1347
1348 if (group_leader == event)
1349 return;
1350
652884fe
PZ
1351 WARN_ON_ONCE(group_leader->ctx != event->ctx);
1352
8a49542c
PZ
1353 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
1354 !is_software_event(event))
1355 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
1356
1357 list_add_tail(&event->group_entry, &group_leader->sibling_list);
1358 group_leader->nr_siblings++;
c320c7b7
ACM
1359
1360 perf_event__header_size(group_leader);
1361
1362 list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
1363 perf_event__header_size(pos);
8a49542c
PZ
1364}
1365
a63eaf34 1366/*
cdd6c482 1367 * Remove an event from the lists for its context.
fccc714b 1368 * Must be called with ctx->mutex and ctx->lock held.
a63eaf34 1369 */
04289bb9 1370static void
cdd6c482 1371list_del_event(struct perf_event *event, struct perf_event_context *ctx)
04289bb9 1372{
68cacd29 1373 struct perf_cpu_context *cpuctx;
652884fe
PZ
1374
1375 WARN_ON_ONCE(event->ctx != ctx);
1376 lockdep_assert_held(&ctx->lock);
1377
8a49542c
PZ
1378 /*
1379 * We can have double detach due to exit/hot-unplug + close.
1380 */
1381 if (!(event->attach_state & PERF_ATTACH_CONTEXT))
a63eaf34 1382 return;
8a49542c
PZ
1383
1384 event->attach_state &= ~PERF_ATTACH_CONTEXT;
1385
68cacd29 1386 if (is_cgroup_event(event)) {
e5d1367f 1387 ctx->nr_cgroups--;
68cacd29
SE
1388 cpuctx = __get_cpu_context(ctx);
1389 /*
1390 * if there are no more cgroup events
1391 * then clear cgrp to avoid a stale pointer
1392 * in update_cgrp_time_from_cpuctx()
1393 */
1394 if (!ctx->nr_cgroups)
1395 cpuctx->cgrp = NULL;
1396 }
e5d1367f 1397
cdd6c482
IM
1398 ctx->nr_events--;
1399 if (event->attr.inherit_stat)
bfbd3381 1400 ctx->nr_stat--;
8bc20959 1401
cdd6c482 1402 list_del_rcu(&event->event_entry);
04289bb9 1403
8a49542c
PZ
1404 if (event->group_leader == event)
1405 list_del_init(&event->group_entry);
5c148194 1406
96c21a46 1407 update_group_times(event);
b2e74a26
SE
1408
1409 /*
1410 * If event was in error state, then keep it
1411 * that way, otherwise bogus counts will be
1412 * returned on read(). The only way to get out
1413 * of error state is by explicit re-enabling
1414 * of the event
1415 */
1416 if (event->state > PERF_EVENT_STATE_OFF)
1417 event->state = PERF_EVENT_STATE_OFF;
5a3126d4
PZ
1418
1419 ctx->generation++;
050735b0
PZ
1420}
1421
8a49542c 1422static void perf_group_detach(struct perf_event *event)
050735b0
PZ
1423{
1424 struct perf_event *sibling, *tmp;
8a49542c
PZ
1425 struct list_head *list = NULL;
1426
1427 /*
1428 * We can have double detach due to exit/hot-unplug + close.
1429 */
1430 if (!(event->attach_state & PERF_ATTACH_GROUP))
1431 return;
1432
1433 event->attach_state &= ~PERF_ATTACH_GROUP;
1434
1435 /*
1436 * If this is a sibling, remove it from its group.
1437 */
1438 if (event->group_leader != event) {
1439 list_del_init(&event->group_entry);
1440 event->group_leader->nr_siblings--;
c320c7b7 1441 goto out;
8a49542c
PZ
1442 }
1443
1444 if (!list_empty(&event->group_entry))
1445 list = &event->group_entry;
2e2af50b 1446
04289bb9 1447 /*
cdd6c482
IM
1448 * If this was a group event with sibling events then
1449 * upgrade the siblings to singleton events by adding them
8a49542c 1450 * to whatever list we are on.
04289bb9 1451 */
cdd6c482 1452 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
8a49542c
PZ
1453 if (list)
1454 list_move_tail(&sibling->group_entry, list);
04289bb9 1455 sibling->group_leader = sibling;
d6f962b5
FW
1456
1457 /* Inherit group flags from the previous leader */
1458 sibling->group_flags = event->group_flags;
652884fe
PZ
1459
1460 WARN_ON_ONCE(sibling->ctx != event->ctx);
04289bb9 1461 }
c320c7b7
ACM
1462
1463out:
1464 perf_event__header_size(event->group_leader);
1465
1466 list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1467 perf_event__header_size(tmp);
04289bb9
IM
1468}
1469
fadfe7be
JO
1470/*
1471 * A user event without an owner task.
1472 */
1473static bool is_orphaned_event(struct perf_event *event)
1474{
1475 return event && !is_kernel_event(event) && !event->owner;
1476}
1477
1478/*
1479 * Event has a parent but the parent's task finished and it's
1480 * alive only because of children holding a reference.
1481 */
1482static bool is_orphaned_child(struct perf_event *event)
1483{
1484 return is_orphaned_event(event->parent);
1485}
1486
1487static void orphans_remove_work(struct work_struct *work);
1488
1489static void schedule_orphans_remove(struct perf_event_context *ctx)
1490{
1491 if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
1492 return;
1493
1494 if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
1495 get_ctx(ctx);
1496 ctx->orphans_remove_sched = true;
1497 }
1498}
1499
1500static int __init perf_workqueue_init(void)
1501{
1502 perf_wq = create_singlethread_workqueue("perf");
1503 WARN(!perf_wq, "failed to create perf workqueue\n");
1504 return perf_wq ? 0 : -1;
1505}
1506
1507core_initcall(perf_workqueue_init);
1508
fa66f07a
SE
1509static inline int
1510event_filter_match(struct perf_event *event)
1511{
e5d1367f
SE
1512 return (event->cpu == -1 || event->cpu == smp_processor_id())
1513 && perf_cgroup_match(event);
fa66f07a
SE
1514}
1515
9ffcfa6f
SE
1516static void
1517event_sched_out(struct perf_event *event,
3b6f9e5c 1518 struct perf_cpu_context *cpuctx,
cdd6c482 1519 struct perf_event_context *ctx)
3b6f9e5c 1520{
4158755d 1521 u64 tstamp = perf_event_time(event);
fa66f07a 1522 u64 delta;
652884fe
PZ
1523
1524 WARN_ON_ONCE(event->ctx != ctx);
1525 lockdep_assert_held(&ctx->lock);
1526
fa66f07a
SE
1527 /*
1528 * An event which could not be activated because of
1529 * filter mismatch still needs to have its timings
1530 * maintained, otherwise bogus information is returned
1531 * via read() for time_enabled, time_running:
1532 */
1533 if (event->state == PERF_EVENT_STATE_INACTIVE
1534 && !event_filter_match(event)) {
e5d1367f 1535 delta = tstamp - event->tstamp_stopped;
fa66f07a 1536 event->tstamp_running += delta;
4158755d 1537 event->tstamp_stopped = tstamp;
fa66f07a
SE
1538 }
1539
cdd6c482 1540 if (event->state != PERF_EVENT_STATE_ACTIVE)
9ffcfa6f 1541 return;
3b6f9e5c 1542
44377277
AS
1543 perf_pmu_disable(event->pmu);
1544
cdd6c482
IM
1545 event->state = PERF_EVENT_STATE_INACTIVE;
1546 if (event->pending_disable) {
1547 event->pending_disable = 0;
1548 event->state = PERF_EVENT_STATE_OFF;
970892a9 1549 }
4158755d 1550 event->tstamp_stopped = tstamp;
a4eaf7f1 1551 event->pmu->del(event, 0);
cdd6c482 1552 event->oncpu = -1;
3b6f9e5c 1553
cdd6c482 1554 if (!is_software_event(event))
3b6f9e5c 1555 cpuctx->active_oncpu--;
2fde4f94
MR
1556 if (!--ctx->nr_active)
1557 perf_event_ctx_deactivate(ctx);
0f5a2601
PZ
1558 if (event->attr.freq && event->attr.sample_freq)
1559 ctx->nr_freq--;
cdd6c482 1560 if (event->attr.exclusive || !cpuctx->active_oncpu)
3b6f9e5c 1561 cpuctx->exclusive = 0;
44377277 1562
fadfe7be
JO
1563 if (is_orphaned_child(event))
1564 schedule_orphans_remove(ctx);
1565
44377277 1566 perf_pmu_enable(event->pmu);
3b6f9e5c
PM
1567}
1568
d859e29f 1569static void
cdd6c482 1570group_sched_out(struct perf_event *group_event,
d859e29f 1571 struct perf_cpu_context *cpuctx,
cdd6c482 1572 struct perf_event_context *ctx)
d859e29f 1573{
cdd6c482 1574 struct perf_event *event;
fa66f07a 1575 int state = group_event->state;
d859e29f 1576
cdd6c482 1577 event_sched_out(group_event, cpuctx, ctx);
d859e29f
PM
1578
1579 /*
1580 * Schedule out siblings (if any):
1581 */
cdd6c482
IM
1582 list_for_each_entry(event, &group_event->sibling_list, group_entry)
1583 event_sched_out(event, cpuctx, ctx);
d859e29f 1584
fa66f07a 1585 if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
d859e29f
PM
1586 cpuctx->exclusive = 0;
1587}
1588
46ce0fe9
PZ
1589struct remove_event {
1590 struct perf_event *event;
1591 bool detach_group;
1592};
1593
0793a61d 1594/*
cdd6c482 1595 * Cross CPU call to remove a performance event
0793a61d 1596 *
cdd6c482 1597 * We disable the event on the hardware level first. After that we
0793a61d
TG
1598 * remove it from the context list.
1599 */
fe4b04fa 1600static int __perf_remove_from_context(void *info)
0793a61d 1601{
46ce0fe9
PZ
1602 struct remove_event *re = info;
1603 struct perf_event *event = re->event;
cdd6c482 1604 struct perf_event_context *ctx = event->ctx;
108b02cf 1605 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
0793a61d 1606
e625cce1 1607 raw_spin_lock(&ctx->lock);
cdd6c482 1608 event_sched_out(event, cpuctx, ctx);
46ce0fe9
PZ
1609 if (re->detach_group)
1610 perf_group_detach(event);
cdd6c482 1611 list_del_event(event, ctx);
64ce3126
PZ
1612 if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
1613 ctx->is_active = 0;
1614 cpuctx->task_ctx = NULL;
1615 }
e625cce1 1616 raw_spin_unlock(&ctx->lock);
fe4b04fa
PZ
1617
1618 return 0;
0793a61d
TG
1619}
1620
1621
1622/*
cdd6c482 1623 * Remove the event from a task's (or a CPU's) list of events.
0793a61d 1624 *
cdd6c482 1625 * CPU events are removed with a smp call. For task events we only
0793a61d 1626 * call when the task is on a CPU.
c93f7669 1627 *
cdd6c482
IM
1628 * If event->ctx is a cloned context, callers must make sure that
1629 * every task struct that event->ctx->task could possibly point to
c93f7669
PM
1630 * remains valid. This is OK when called from perf_release since
1631 * that only calls us on the top-level context, which can't be a clone.
cdd6c482 1632 * When called from perf_event_exit_task, it's OK because the
c93f7669 1633 * context has been detached from its task.
0793a61d 1634 */
46ce0fe9 1635static void perf_remove_from_context(struct perf_event *event, bool detach_group)
0793a61d 1636{
cdd6c482 1637 struct perf_event_context *ctx = event->ctx;
0793a61d 1638 struct task_struct *task = ctx->task;
46ce0fe9
PZ
1639 struct remove_event re = {
1640 .event = event,
1641 .detach_group = detach_group,
1642 };
0793a61d 1643
fe4b04fa
PZ
1644 lockdep_assert_held(&ctx->mutex);
1645
0793a61d
TG
1646 if (!task) {
1647 /*
226424ee
MR
1648 * Per cpu events are removed via an smp call. The removal can
1649 * fail if the CPU is currently offline, but in that case we
1650 * already called __perf_remove_from_context from
1651 * perf_event_exit_cpu.
0793a61d 1652 */
46ce0fe9 1653 cpu_function_call(event->cpu, __perf_remove_from_context, &re);
0793a61d
TG
1654 return;
1655 }
1656
1657retry:
46ce0fe9 1658 if (!task_function_call(task, __perf_remove_from_context, &re))
fe4b04fa 1659 return;
0793a61d 1660
e625cce1 1661 raw_spin_lock_irq(&ctx->lock);
0793a61d 1662 /*
fe4b04fa
PZ
1663 * If we failed to find a running task, but find the context active now
1664 * that we've acquired the ctx->lock, retry.
0793a61d 1665 */
fe4b04fa 1666 if (ctx->is_active) {
e625cce1 1667 raw_spin_unlock_irq(&ctx->lock);
3577af70
CW
1668 /*
1669 * Reload the task pointer, it might have been changed by
1670 * a concurrent perf_event_context_sched_out().
1671 */
1672 task = ctx->task;
0793a61d
TG
1673 goto retry;
1674 }
1675
1676 /*
fe4b04fa
PZ
1677 * Since the task isn't running, it's safe to remove the event;
1678 * holding the ctx->lock ensures the task won't get scheduled in.
0793a61d 1679 */
46ce0fe9
PZ
1680 if (detach_group)
1681 perf_group_detach(event);
fe4b04fa 1682 list_del_event(event, ctx);
e625cce1 1683 raw_spin_unlock_irq(&ctx->lock);
0793a61d
TG
1684}
1685
d859e29f 1686/*
cdd6c482 1687 * Cross CPU call to disable a performance event
d859e29f 1688 */
500ad2d8 1689int __perf_event_disable(void *info)
d859e29f 1690{
cdd6c482 1691 struct perf_event *event = info;
cdd6c482 1692 struct perf_event_context *ctx = event->ctx;
108b02cf 1693 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
d859e29f
PM
1694
1695 /*
cdd6c482
IM
1696 * If this is a per-task event, need to check whether this
1697 * event's task is the current task on this cpu.
fe4b04fa
PZ
1698 *
1699 * Can trigger due to concurrent perf_event_context_sched_out()
1700 * flipping contexts around.
d859e29f 1701 */
665c2142 1702 if (ctx->task && cpuctx->task_ctx != ctx)
fe4b04fa 1703 return -EINVAL;
d859e29f 1704
e625cce1 1705 raw_spin_lock(&ctx->lock);
d859e29f
PM
1706
1707 /*
cdd6c482 1708 * If the event is on, turn it off.
d859e29f
PM
1709 * If it is in error state, leave it in error state.
1710 */
cdd6c482 1711 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
4af4998b 1712 update_context_time(ctx);
e5d1367f 1713 update_cgrp_time_from_event(event);
cdd6c482
IM
1714 update_group_times(event);
1715 if (event == event->group_leader)
1716 group_sched_out(event, cpuctx, ctx);
d859e29f 1717 else
cdd6c482
IM
1718 event_sched_out(event, cpuctx, ctx);
1719 event->state = PERF_EVENT_STATE_OFF;
d859e29f
PM
1720 }
1721
e625cce1 1722 raw_spin_unlock(&ctx->lock);
fe4b04fa
PZ
1723
1724 return 0;
d859e29f
PM
1725}
1726
1727/*
cdd6c482 1728 * Disable an event.
c93f7669 1729 *
cdd6c482
IM
1730 * If event->ctx is a cloned context, callers must make sure that
1731 * every task struct that event->ctx->task could possibly point to
c93f7669 1732 * remains valid. This condition is satisfied when called through
cdd6c482
IM
1733 * perf_event_for_each_child or perf_event_for_each because they
1734 * hold the top-level event's child_mutex, so any descendant that
1735 * goes to exit will block in sync_child_event.
1736 * When called from perf_pending_event it's OK because event->ctx
c93f7669 1737 * is the current context on this CPU and preemption is disabled,
cdd6c482 1738 * hence we can't get into perf_event_task_sched_out for this context.
d859e29f 1739 */
f63a8daa 1740static void _perf_event_disable(struct perf_event *event)
d859e29f 1741{
cdd6c482 1742 struct perf_event_context *ctx = event->ctx;
d859e29f
PM
1743 struct task_struct *task = ctx->task;
1744
1745 if (!task) {
1746 /*
cdd6c482 1747 * Disable the event on the cpu that it's on
d859e29f 1748 */
fe4b04fa 1749 cpu_function_call(event->cpu, __perf_event_disable, event);
d859e29f
PM
1750 return;
1751 }
1752
9ed6060d 1753retry:
fe4b04fa
PZ
1754 if (!task_function_call(task, __perf_event_disable, event))
1755 return;
d859e29f 1756
e625cce1 1757 raw_spin_lock_irq(&ctx->lock);
d859e29f 1758 /*
cdd6c482 1759 * If the event is still active, we need to retry the cross-call.
d859e29f 1760 */
cdd6c482 1761 if (event->state == PERF_EVENT_STATE_ACTIVE) {
e625cce1 1762 raw_spin_unlock_irq(&ctx->lock);
fe4b04fa
PZ
1763 /*
1764 * Reload the task pointer, it might have been changed by
1765 * a concurrent perf_event_context_sched_out().
1766 */
1767 task = ctx->task;
d859e29f
PM
1768 goto retry;
1769 }
1770
1771 /*
1772 * Since we have the lock this context can't be scheduled
1773 * in, so we can change the state safely.
1774 */
cdd6c482
IM
1775 if (event->state == PERF_EVENT_STATE_INACTIVE) {
1776 update_group_times(event);
1777 event->state = PERF_EVENT_STATE_OFF;
53cfbf59 1778 }
e625cce1 1779 raw_spin_unlock_irq(&ctx->lock);
d859e29f 1780}
f63a8daa
PZ
1781
1782/*
1783 * Strictly speaking kernel users cannot create groups and therefore this
1784 * interface does not need the perf_event_ctx_lock() magic.
1785 */
1786void perf_event_disable(struct perf_event *event)
1787{
1788 struct perf_event_context *ctx;
1789
1790 ctx = perf_event_ctx_lock(event);
1791 _perf_event_disable(event);
1792 perf_event_ctx_unlock(event, ctx);
1793}
dcfce4a0 1794EXPORT_SYMBOL_GPL(perf_event_disable);
d859e29f 1795
e5d1367f
SE
1796static void perf_set_shadow_time(struct perf_event *event,
1797 struct perf_event_context *ctx,
1798 u64 tstamp)
1799{
1800 /*
1801 * use the correct time source for the time snapshot
1802 *
1803 * We could get by without this by leveraging the
1804 * fact that to get to this function, the caller
1805 * has most likely already called update_context_time()
1806 * and update_cgrp_time_xx() and thus both timestamps
1807 * are identical (or very close). Given that tstamp is
1808 * already adjusted for cgroup, we could say that:
1809 * tstamp - ctx->timestamp
1810 * is equivalent to
1811 * tstamp - cgrp->timestamp.
1812 *
1813 * Then, in perf_output_read(), the calculation would
1814 * work with no changes because:
1815 * - event is guaranteed scheduled in
1816 * - no scheduled out in between
1817 * - thus the timestamp would be the same
1818 *
1819 * But this is a bit hairy.
1820 *
1821 * So instead, we have an explicit cgroup call to remain
1822 * within the time source all along. We believe it
1823 * is cleaner and simpler to understand.
1824 */
1825 if (is_cgroup_event(event))
1826 perf_cgroup_set_shadow_time(event, tstamp);
1827 else
1828 event->shadow_ctx_time = tstamp - ctx->timestamp;
1829}
1830
4fe757dd
PZ
1831#define MAX_INTERRUPTS (~0ULL)
1832
1833static void perf_log_throttle(struct perf_event *event, int enable);
ec0d7729 1834static void perf_log_itrace_start(struct perf_event *event);
4fe757dd 1835
235c7fc7 1836static int
9ffcfa6f 1837event_sched_in(struct perf_event *event,
235c7fc7 1838 struct perf_cpu_context *cpuctx,
6e37738a 1839 struct perf_event_context *ctx)
235c7fc7 1840{
4158755d 1841 u64 tstamp = perf_event_time(event);
44377277 1842 int ret = 0;
4158755d 1843
63342411
PZ
1844 lockdep_assert_held(&ctx->lock);
1845
cdd6c482 1846 if (event->state <= PERF_EVENT_STATE_OFF)
235c7fc7
IM
1847 return 0;
1848
cdd6c482 1849 event->state = PERF_EVENT_STATE_ACTIVE;
6e37738a 1850 event->oncpu = smp_processor_id();
4fe757dd
PZ
1851
1852 /*
1853 * Unthrottle events: since we were just scheduled in we might have
1854 * missed several ticks already, and for a heavily scheduling task
1855 * there is little guarantee it'll get a tick in a timely manner.
1856 */
1857 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
1858 perf_log_throttle(event, 1);
1859 event->hw.interrupts = 0;
1860 }
1861
235c7fc7
IM
1862 /*
1863 * The new state must be visible before we turn it on in the hardware:
1864 */
1865 smp_wmb();
1866
44377277
AS
1867 perf_pmu_disable(event->pmu);
1868
72f669c0
SL
1869 event->tstamp_running += tstamp - event->tstamp_stopped;
1870
1871 perf_set_shadow_time(event, ctx, tstamp);
1872
ec0d7729
AS
1873 perf_log_itrace_start(event);
1874
a4eaf7f1 1875 if (event->pmu->add(event, PERF_EF_START)) {
cdd6c482
IM
1876 event->state = PERF_EVENT_STATE_INACTIVE;
1877 event->oncpu = -1;
44377277
AS
1878 ret = -EAGAIN;
1879 goto out;
235c7fc7
IM
1880 }
1881
cdd6c482 1882 if (!is_software_event(event))
3b6f9e5c 1883 cpuctx->active_oncpu++;
2fde4f94
MR
1884 if (!ctx->nr_active++)
1885 perf_event_ctx_activate(ctx);
0f5a2601
PZ
1886 if (event->attr.freq && event->attr.sample_freq)
1887 ctx->nr_freq++;
235c7fc7 1888
cdd6c482 1889 if (event->attr.exclusive)
3b6f9e5c
PM
1890 cpuctx->exclusive = 1;
1891
fadfe7be
JO
1892 if (is_orphaned_child(event))
1893 schedule_orphans_remove(ctx);
1894
44377277
AS
1895out:
1896 perf_pmu_enable(event->pmu);
1897
1898 return ret;
235c7fc7
IM
1899}
1900
6751b71e 1901static int
cdd6c482 1902group_sched_in(struct perf_event *group_event,
6751b71e 1903 struct perf_cpu_context *cpuctx,
6e37738a 1904 struct perf_event_context *ctx)
6751b71e 1905{
6bde9b6c 1906 struct perf_event *event, *partial_group = NULL;
4a234593 1907 struct pmu *pmu = ctx->pmu;
d7842da4
SE
1908 u64 now = ctx->time;
1909 bool simulate = false;
6751b71e 1910
cdd6c482 1911 if (group_event->state == PERF_EVENT_STATE_OFF)
6751b71e
PM
1912 return 0;
1913
ad5133b7 1914 pmu->start_txn(pmu);
6bde9b6c 1915
9ffcfa6f 1916 if (event_sched_in(group_event, cpuctx, ctx)) {
ad5133b7 1917 pmu->cancel_txn(pmu);
9e630205 1918 perf_cpu_hrtimer_restart(cpuctx);
6751b71e 1919 return -EAGAIN;
90151c35 1920 }
6751b71e
PM
1921
1922 /*
1923 * Schedule in siblings as one group (if any):
1924 */
cdd6c482 1925 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
9ffcfa6f 1926 if (event_sched_in(event, cpuctx, ctx)) {
cdd6c482 1927 partial_group = event;
6751b71e
PM
1928 goto group_error;
1929 }
1930 }
1931
9ffcfa6f 1932 if (!pmu->commit_txn(pmu))
6e85158c 1933 return 0;
9ffcfa6f 1934
6751b71e
PM
1935group_error:
1936 /*
1937 * Groups can be scheduled in as one unit only, so undo any
1938 * partial group before returning:
d7842da4
SE
1939 * The events up to the failed event are scheduled out normally,
1940 * tstamp_stopped will be updated.
1941 *
1942 * The failed events and the remaining siblings need to have
1943 * their timings updated as if they had gone through event_sched_in()
1944 * and event_sched_out(). This is required to get consistent timings
1945 * across the group. This also takes care of the case where the group
1946 * could never be scheduled by ensuring tstamp_stopped is set to mark
1947 * the time the event was actually stopped, such that time delta
1948 * calculation in update_event_times() is correct.
6751b71e 1949 */
cdd6c482
IM
1950 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1951 if (event == partial_group)
d7842da4
SE
1952 simulate = true;
1953
1954 if (simulate) {
1955 event->tstamp_running += now - event->tstamp_stopped;
1956 event->tstamp_stopped = now;
1957 } else {
1958 event_sched_out(event, cpuctx, ctx);
1959 }
6751b71e 1960 }
9ffcfa6f 1961 event_sched_out(group_event, cpuctx, ctx);
6751b71e 1962
ad5133b7 1963 pmu->cancel_txn(pmu);
90151c35 1964
9e630205
SE
1965 perf_cpu_hrtimer_restart(cpuctx);
1966
6751b71e
PM
1967 return -EAGAIN;
1968}
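
group_sched_in() above treats the whole group as one transaction: pmu->start_txn() opens it, each sibling is added with event_sched_in(), and either pmu->commit_txn() accepts the lot or pmu->cancel_txn() rolls the partial group back before -EAGAIN is returned. A stand-alone sketch of that commit-or-rollback shape, in plain C with invented names (toy_pmu, NR_SLOTS); this is an illustration, not kernel code:

#include <stdio.h>

#define NR_SLOTS 4

struct toy_pmu {
	int used;	/* counters already committed to the hardware */
	int staged;	/* counters tentatively added in the open txn  */
};

static void start_txn(struct toy_pmu *p)
{
	p->staged = 0;
}

static int add_event(struct toy_pmu *p)
{
	if (p->used + p->staged >= NR_SLOTS)
		return -1;	/* no free counter: this add fails */
	p->staged++;
	return 0;
}

static int commit_txn(struct toy_pmu *p)
{
	p->used += p->staged;
	p->staged = 0;
	return 0;
}

static void cancel_txn(struct toy_pmu *p)
{
	p->staged = 0;	/* forget the partial group */
}

static int toy_group_sched_in(struct toy_pmu *p, int group_size)
{
	start_txn(p);
	for (int i = 0; i < group_size; i++) {
		if (add_event(p)) {
			cancel_txn(p);	/* roll back, like the group_error: path */
			return -1;	/* -EAGAIN in the real code */
		}
	}
	return commit_txn(p);
}

int main(void)
{
	struct toy_pmu pmu = { 0, 0 };

	printf("first group of 3:  %d\n", toy_group_sched_in(&pmu, 3)); /* fits    */
	printf("second group of 3: %d\n", toy_group_sched_in(&pmu, 3)); /* refused */
	printf("slots in use:      %d\n", pmu.used);                    /* still 3 */
	return 0;
}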
1969
3b6f9e5c 1970/*
cdd6c482 1971 * Work out whether we can put this event group on the CPU now.
3b6f9e5c 1972 */
cdd6c482 1973static int group_can_go_on(struct perf_event *event,
3b6f9e5c
PM
1974 struct perf_cpu_context *cpuctx,
1975 int can_add_hw)
1976{
1977 /*
cdd6c482 1978 * Groups consisting entirely of software events can always go on.
3b6f9e5c 1979 */
d6f962b5 1980 if (event->group_flags & PERF_GROUP_SOFTWARE)
3b6f9e5c
PM
1981 return 1;
1982 /*
1983 * If an exclusive group is already on, no other hardware
cdd6c482 1984 * events can go on.
3b6f9e5c
PM
1985 */
1986 if (cpuctx->exclusive)
1987 return 0;
1988 /*
1989 * If this group is exclusive and there are already
cdd6c482 1990 * events on the CPU, it can't go on.
3b6f9e5c 1991 */
cdd6c482 1992 if (event->attr.exclusive && cpuctx->active_oncpu)
3b6f9e5c
PM
1993 return 0;
1994 /*
1995 * Otherwise, try to add it if all previous groups were able
1996 * to go on.
1997 */
1998 return can_add_hw;
1999}
2000
cdd6c482
IM
2001static void add_event_to_ctx(struct perf_event *event,
2002 struct perf_event_context *ctx)
53cfbf59 2003{
4158755d
SE
2004 u64 tstamp = perf_event_time(event);
2005
cdd6c482 2006 list_add_event(event, ctx);
8a49542c 2007 perf_group_attach(event);
4158755d
SE
2008 event->tstamp_enabled = tstamp;
2009 event->tstamp_running = tstamp;
2010 event->tstamp_stopped = tstamp;
53cfbf59
PM
2011}
2012
2c29ef0f
PZ
2013static void task_ctx_sched_out(struct perf_event_context *ctx);
2014static void
2015ctx_sched_in(struct perf_event_context *ctx,
2016 struct perf_cpu_context *cpuctx,
2017 enum event_type_t event_type,
2018 struct task_struct *task);
fe4b04fa 2019
dce5855b
PZ
2020static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2021 struct perf_event_context *ctx,
2022 struct task_struct *task)
2023{
2024 cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2025 if (ctx)
2026 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2027 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2028 if (ctx)
2029 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2030}
2031
0793a61d 2032/*
cdd6c482 2033 * Cross CPU call to install and enable a performance event
682076ae
PZ
2034 *
2035 * Must be called with ctx->mutex held
0793a61d 2036 */
fe4b04fa 2037static int __perf_install_in_context(void *info)
0793a61d 2038{
cdd6c482
IM
2039 struct perf_event *event = info;
2040 struct perf_event_context *ctx = event->ctx;
108b02cf 2041 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2c29ef0f
PZ
2042 struct perf_event_context *task_ctx = cpuctx->task_ctx;
2043 struct task_struct *task = current;
2044
b58f6b0d 2045 perf_ctx_lock(cpuctx, task_ctx);
2c29ef0f 2046 perf_pmu_disable(cpuctx->ctx.pmu);
0793a61d
TG
2047
2048 /*
2c29ef0f 2049 * If there was an active task_ctx schedule it out.
0793a61d 2050 */
b58f6b0d 2051 if (task_ctx)
2c29ef0f 2052 task_ctx_sched_out(task_ctx);
b58f6b0d
PZ
2053
2054 /*
2055 * If the context we're installing events in is not the
2056 * active task_ctx, flip them.
2057 */
2058 if (ctx->task && task_ctx != ctx) {
2059 if (task_ctx)
2060 raw_spin_unlock(&task_ctx->lock);
2061 raw_spin_lock(&ctx->lock);
2062 task_ctx = ctx;
2063 }
2064
2065 if (task_ctx) {
2066 cpuctx->task_ctx = task_ctx;
2c29ef0f
PZ
2067 task = task_ctx->task;
2068 }
b58f6b0d 2069
2c29ef0f 2070 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
0793a61d 2071
4af4998b 2072 update_context_time(ctx);
e5d1367f
SE
2073 /*
2074 * update cgrp time only if current cgrp
2075 * matches event->cgrp. Must be done before
2076 * calling add_event_to_ctx()
2077 */
2078 update_cgrp_time_from_event(event);
0793a61d 2079
cdd6c482 2080 add_event_to_ctx(event, ctx);
0793a61d 2081
d859e29f 2082 /*
2c29ef0f 2083 * Schedule everything back in
d859e29f 2084 */
dce5855b 2085 perf_event_sched_in(cpuctx, task_ctx, task);
2c29ef0f
PZ
2086
2087 perf_pmu_enable(cpuctx->ctx.pmu);
2088 perf_ctx_unlock(cpuctx, task_ctx);
fe4b04fa
PZ
2089
2090 return 0;
0793a61d
TG
2091}
2092
2093/*
cdd6c482 2094 * Attach a performance event to a context
0793a61d 2095 *
cdd6c482
IM
2096 * First we add the event to the list with the hardware enable bit
2097 * in event->hw_config cleared.
0793a61d 2098 *
cdd6c482 2099 * If the event is attached to a task which is on a CPU we use an smp
0793a61d
TG
2100 * call to enable it in the task context. The task might have been
2101 * scheduled away, but we check this in the smp call again.
2102 */
2103static void
cdd6c482
IM
2104perf_install_in_context(struct perf_event_context *ctx,
2105 struct perf_event *event,
0793a61d
TG
2106 int cpu)
2107{
2108 struct task_struct *task = ctx->task;
2109
fe4b04fa
PZ
2110 lockdep_assert_held(&ctx->mutex);
2111
c3f00c70 2112 event->ctx = ctx;
0cda4c02
YZ
2113 if (event->cpu != -1)
2114 event->cpu = cpu;
c3f00c70 2115
0793a61d
TG
2116 if (!task) {
2117 /*
cdd6c482 2118 * Per cpu events are installed via an smp call and
af901ca1 2119 * the install is always successful.
0793a61d 2120 */
fe4b04fa 2121 cpu_function_call(cpu, __perf_install_in_context, event);
0793a61d
TG
2122 return;
2123 }
2124
0793a61d 2125retry:
fe4b04fa
PZ
2126 if (!task_function_call(task, __perf_install_in_context, event))
2127 return;
0793a61d 2128
e625cce1 2129 raw_spin_lock_irq(&ctx->lock);
0793a61d 2130 /*
fe4b04fa
PZ
2131 * If we failed to find a running task, but find the context active now
2132 * that we've acquired the ctx->lock, retry.
0793a61d 2133 */
fe4b04fa 2134 if (ctx->is_active) {
e625cce1 2135 raw_spin_unlock_irq(&ctx->lock);
3577af70
CW
2136 /*
2137 * Reload the task pointer, it might have been changed by
2138 * a concurrent perf_event_context_sched_out().
2139 */
2140 task = ctx->task;
0793a61d
TG
2141 goto retry;
2142 }
2143
2144 /*
fe4b04fa
PZ
2145 * Since the task isn't running, it's safe to add the event; our holding
2146 * the ctx->lock ensures the task won't get scheduled in.
0793a61d 2147 */
fe4b04fa 2148 add_event_to_ctx(event, ctx);
e625cce1 2149 raw_spin_unlock_irq(&ctx->lock);
0793a61d
TG
2150}
2151
fa289bec 2152/*
cdd6c482 2153 * Put an event into inactive state and update time fields.
fa289bec
PM
2154 * Enabling the leader of a group effectively enables all
2155 * the group members that aren't explicitly disabled, so we
2156 * have to update their ->tstamp_enabled also.
2157 * Note: this works for group members as well as group leaders
2158 * since the non-leader members' sibling_lists will be empty.
2159 */
1d9b482e 2160static void __perf_event_mark_enabled(struct perf_event *event)
fa289bec 2161{
cdd6c482 2162 struct perf_event *sub;
4158755d 2163 u64 tstamp = perf_event_time(event);
fa289bec 2164
cdd6c482 2165 event->state = PERF_EVENT_STATE_INACTIVE;
4158755d 2166 event->tstamp_enabled = tstamp - event->total_time_enabled;
9ed6060d 2167 list_for_each_entry(sub, &event->sibling_list, group_entry) {
4158755d
SE
2168 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
2169 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
9ed6060d 2170 }
fa289bec
PM
2171}
2172
d859e29f 2173/*
cdd6c482 2174 * Cross CPU call to enable a performance event
d859e29f 2175 */
fe4b04fa 2176static int __perf_event_enable(void *info)
04289bb9 2177{
cdd6c482 2178 struct perf_event *event = info;
cdd6c482
IM
2179 struct perf_event_context *ctx = event->ctx;
2180 struct perf_event *leader = event->group_leader;
108b02cf 2181 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
d859e29f 2182 int err;
04289bb9 2183
06f41796
JO
2184 /*
2185 * There's a time window between 'ctx->is_active' check
2186 * in perf_event_enable function and this place having:
2187 * - IRQs on
2188 * - ctx->lock unlocked
2189 *
2190 * where the task could be killed and 'ctx' deactivated
2191 * by perf_event_exit_task.
2192 */
2193 if (!ctx->is_active)
fe4b04fa 2194 return -EINVAL;
3cbed429 2195
e625cce1 2196 raw_spin_lock(&ctx->lock);
4af4998b 2197 update_context_time(ctx);
d859e29f 2198
cdd6c482 2199 if (event->state >= PERF_EVENT_STATE_INACTIVE)
d859e29f 2200 goto unlock;
e5d1367f
SE
2201
2202 /*
2203 * set current task's cgroup time reference point
2204 */
3f7cce3c 2205 perf_cgroup_set_timestamp(current, ctx);
e5d1367f 2206
1d9b482e 2207 __perf_event_mark_enabled(event);
04289bb9 2208
e5d1367f
SE
2209 if (!event_filter_match(event)) {
2210 if (is_cgroup_event(event))
2211 perf_cgroup_defer_enabled(event);
f4c4176f 2212 goto unlock;
e5d1367f 2213 }
f4c4176f 2214
04289bb9 2215 /*
cdd6c482 2216 * If the event is in a group and isn't the group leader,
d859e29f 2217 * then don't put it on unless the group is on.
04289bb9 2218 */
cdd6c482 2219 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
d859e29f 2220 goto unlock;
3b6f9e5c 2221
cdd6c482 2222 if (!group_can_go_on(event, cpuctx, 1)) {
d859e29f 2223 err = -EEXIST;
e758a33d 2224 } else {
cdd6c482 2225 if (event == leader)
6e37738a 2226 err = group_sched_in(event, cpuctx, ctx);
e758a33d 2227 else
6e37738a 2228 err = event_sched_in(event, cpuctx, ctx);
e758a33d 2229 }
d859e29f
PM
2230
2231 if (err) {
2232 /*
cdd6c482 2233 * If this event can't go on and it's part of a
d859e29f
PM
2234 * group, then the whole group has to come off.
2235 */
9e630205 2236 if (leader != event) {
d859e29f 2237 group_sched_out(leader, cpuctx, ctx);
9e630205
SE
2238 perf_cpu_hrtimer_restart(cpuctx);
2239 }
0d48696f 2240 if (leader->attr.pinned) {
53cfbf59 2241 update_group_times(leader);
cdd6c482 2242 leader->state = PERF_EVENT_STATE_ERROR;
53cfbf59 2243 }
d859e29f
PM
2244 }
2245
9ed6060d 2246unlock:
e625cce1 2247 raw_spin_unlock(&ctx->lock);
fe4b04fa
PZ
2248
2249 return 0;
d859e29f
PM
2250}
2251
2252/*
cdd6c482 2253 * Enable an event.
c93f7669 2254 *
cdd6c482
IM
2255 * If event->ctx is a cloned context, callers must make sure that
2256 * every task struct that event->ctx->task could possibly point to
c93f7669 2257 * remains valid. This condition is satisfied when called through
cdd6c482
IM
2258 * perf_event_for_each_child or perf_event_for_each as described
2259 * for perf_event_disable.
d859e29f 2260 */
f63a8daa 2261static void _perf_event_enable(struct perf_event *event)
d859e29f 2262{
cdd6c482 2263 struct perf_event_context *ctx = event->ctx;
d859e29f
PM
2264 struct task_struct *task = ctx->task;
2265
2266 if (!task) {
2267 /*
cdd6c482 2268 * Enable the event on the cpu that it's on
d859e29f 2269 */
fe4b04fa 2270 cpu_function_call(event->cpu, __perf_event_enable, event);
d859e29f
PM
2271 return;
2272 }
2273
e625cce1 2274 raw_spin_lock_irq(&ctx->lock);
cdd6c482 2275 if (event->state >= PERF_EVENT_STATE_INACTIVE)
d859e29f
PM
2276 goto out;
2277
2278 /*
cdd6c482
IM
2279 * If the event is in error state, clear that first.
2280 * That way, if we see the event in error state below, we
d859e29f
PM
2281 * know that it has gone back into error state, as distinct
2282 * from the task having been scheduled away before the
2283 * cross-call arrived.
2284 */
cdd6c482
IM
2285 if (event->state == PERF_EVENT_STATE_ERROR)
2286 event->state = PERF_EVENT_STATE_OFF;
d859e29f 2287
9ed6060d 2288retry:
fe4b04fa 2289 if (!ctx->is_active) {
1d9b482e 2290 __perf_event_mark_enabled(event);
fe4b04fa
PZ
2291 goto out;
2292 }
2293
e625cce1 2294 raw_spin_unlock_irq(&ctx->lock);
fe4b04fa
PZ
2295
2296 if (!task_function_call(task, __perf_event_enable, event))
2297 return;
d859e29f 2298
e625cce1 2299 raw_spin_lock_irq(&ctx->lock);
d859e29f
PM
2300
2301 /*
cdd6c482 2302 * If the context is active and the event is still off,
d859e29f
PM
2303 * we need to retry the cross-call.
2304 */
fe4b04fa
PZ
2305 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
2306 /*
2307 * task could have been flipped by a concurrent
2308 * perf_event_context_sched_out()
2309 */
2310 task = ctx->task;
d859e29f 2311 goto retry;
fe4b04fa 2312 }
fa289bec 2313
9ed6060d 2314out:
e625cce1 2315 raw_spin_unlock_irq(&ctx->lock);
d859e29f 2316}
f63a8daa
PZ
2317
2318/*
2319 * See perf_event_disable();
2320 */
2321void perf_event_enable(struct perf_event *event)
2322{
2323 struct perf_event_context *ctx;
2324
2325 ctx = perf_event_ctx_lock(event);
2326 _perf_event_enable(event);
2327 perf_event_ctx_unlock(event, ctx);
2328}
dcfce4a0 2329EXPORT_SYMBOL_GPL(perf_event_enable);
d859e29f 2330
f63a8daa 2331static int _perf_event_refresh(struct perf_event *event, int refresh)
79f14641 2332{
2023b359 2333 /*
cdd6c482 2334 * not supported on inherited events
2023b359 2335 */
2e939d1d 2336 if (event->attr.inherit || !is_sampling_event(event))
2023b359
PZ
2337 return -EINVAL;
2338
cdd6c482 2339 atomic_add(refresh, &event->event_limit);
f63a8daa 2340 _perf_event_enable(event);
2023b359
PZ
2341
2342 return 0;
79f14641 2343}
f63a8daa
PZ
2344
2345/*
2346 * See perf_event_disable()
2347 */
2348int perf_event_refresh(struct perf_event *event, int refresh)
2349{
2350 struct perf_event_context *ctx;
2351 int ret;
2352
2353 ctx = perf_event_ctx_lock(event);
2354 ret = _perf_event_refresh(event, refresh);
2355 perf_event_ctx_unlock(event, ctx);
2356
2357 return ret;
2358}
26ca5c11 2359EXPORT_SYMBOL_GPL(perf_event_refresh);
79f14641 2360
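
For reference, the perf_event_disable(), perf_event_enable() and perf_event_refresh() interfaces exported above roughly correspond to the PERF_EVENT_IOC_DISABLE, PERF_EVENT_IOC_ENABLE and PERF_EVENT_IOC_REFRESH ioctls available on a perf_event_open() file descriptor. A minimal user-space sketch (not part of this file; error handling abbreviated) that brackets a measured region with enable/disable:

#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	attr.disabled = 1;		/* start in PERF_EVENT_STATE_OFF */
	attr.exclude_kernel = 1;	/* count user space only */

	fd = syscall(__NR_perf_event_open, &attr, 0 /* this task */,
		     -1 /* any cpu */, -1 /* no group */, 0);
	if (fd < 0)
		return 1;

	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);	/* -> _perf_event_enable()  */
	getpid();				/* the measured region      */
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);	/* -> _perf_event_disable() */

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("instructions retired: %llu\n", (unsigned long long)count);

	close(fd);
	return 0;
}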
5b0311e1
FW
2361static void ctx_sched_out(struct perf_event_context *ctx,
2362 struct perf_cpu_context *cpuctx,
2363 enum event_type_t event_type)
235c7fc7 2364{
cdd6c482 2365 struct perf_event *event;
db24d33e 2366 int is_active = ctx->is_active;
235c7fc7 2367
db24d33e 2368 ctx->is_active &= ~event_type;
cdd6c482 2369 if (likely(!ctx->nr_events))
facc4307
PZ
2370 return;
2371
4af4998b 2372 update_context_time(ctx);
e5d1367f 2373 update_cgrp_time_from_cpuctx(cpuctx);
5b0311e1 2374 if (!ctx->nr_active)
facc4307 2375 return;
5b0311e1 2376
075e0b00 2377 perf_pmu_disable(ctx->pmu);
db24d33e 2378 if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
889ff015
FW
2379 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
2380 group_sched_out(event, cpuctx, ctx);
9ed6060d 2381 }
889ff015 2382
db24d33e 2383 if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
889ff015 2384 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
8c9ed8e1 2385 group_sched_out(event, cpuctx, ctx);
9ed6060d 2386 }
1b9a644f 2387 perf_pmu_enable(ctx->pmu);
235c7fc7
IM
2388}
2389
564c2b21 2390/*
5a3126d4
PZ
2391 * Test whether two contexts are equivalent, i.e. whether they have both been
2392 * cloned from the same version of the same context.
2393 *
2394 * Equivalence is measured using a generation number in the context that is
2395 * incremented on each modification to it; see unclone_ctx(), list_add_event()
2396 * and list_del_event().
564c2b21 2397 */
cdd6c482
IM
2398static int context_equiv(struct perf_event_context *ctx1,
2399 struct perf_event_context *ctx2)
564c2b21 2400{
211de6eb
PZ
2401 lockdep_assert_held(&ctx1->lock);
2402 lockdep_assert_held(&ctx2->lock);
2403
5a3126d4
PZ
2404 /* Pinning disables the swap optimization */
2405 if (ctx1->pin_count || ctx2->pin_count)
2406 return 0;
2407
2408 /* If ctx1 is the parent of ctx2 */
2409 if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2410 return 1;
2411
2412 /* If ctx2 is the parent of ctx1 */
2413 if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2414 return 1;
2415
2416 /*
2417 * If ctx1 and ctx2 have the same parent; we flatten the parent
2418 * hierarchy, see perf_event_init_context().
2419 */
2420 if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2421 ctx1->parent_gen == ctx2->parent_gen)
2422 return 1;
2423
2424 /* Unmatched */
2425 return 0;
564c2b21
PM
2426}
2427
cdd6c482
IM
2428static void __perf_event_sync_stat(struct perf_event *event,
2429 struct perf_event *next_event)
bfbd3381
PZ
2430{
2431 u64 value;
2432
cdd6c482 2433 if (!event->attr.inherit_stat)
bfbd3381
PZ
2434 return;
2435
2436 /*
cdd6c482 2437 * Update the event value, we cannot use perf_event_read()
bfbd3381
PZ
2438 * because we're in the middle of a context switch and have IRQs
2439 * disabled, which upsets smp_call_function_single(). However,
cdd6c482 2440 * we know the event must be on the current CPU, therefore we
bfbd3381
PZ
2441 * don't need to use it.
2442 */
cdd6c482
IM
2443 switch (event->state) {
2444 case PERF_EVENT_STATE_ACTIVE:
3dbebf15
PZ
2445 event->pmu->read(event);
2446 /* fall-through */
bfbd3381 2447
cdd6c482
IM
2448 case PERF_EVENT_STATE_INACTIVE:
2449 update_event_times(event);
bfbd3381
PZ
2450 break;
2451
2452 default:
2453 break;
2454 }
2455
2456 /*
cdd6c482 2457 * In order to keep per-task stats reliable we need to flip the event
bfbd3381
PZ
2458 * values when we flip the contexts.
2459 */
e7850595
PZ
2460 value = local64_read(&next_event->count);
2461 value = local64_xchg(&event->count, value);
2462 local64_set(&next_event->count, value);
bfbd3381 2463
cdd6c482
IM
2464 swap(event->total_time_enabled, next_event->total_time_enabled);
2465 swap(event->total_time_running, next_event->total_time_running);
19d2e755 2466
bfbd3381 2467 /*
19d2e755 2468 * Since we swizzled the values, update the user visible data too.
bfbd3381 2469 */
cdd6c482
IM
2470 perf_event_update_userpage(event);
2471 perf_event_update_userpage(next_event);
bfbd3381
PZ
2472}
2473
cdd6c482
IM
2474static void perf_event_sync_stat(struct perf_event_context *ctx,
2475 struct perf_event_context *next_ctx)
bfbd3381 2476{
cdd6c482 2477 struct perf_event *event, *next_event;
bfbd3381
PZ
2478
2479 if (!ctx->nr_stat)
2480 return;
2481
02ffdbc8
PZ
2482 update_context_time(ctx);
2483
cdd6c482
IM
2484 event = list_first_entry(&ctx->event_list,
2485 struct perf_event, event_entry);
bfbd3381 2486
cdd6c482
IM
2487 next_event = list_first_entry(&next_ctx->event_list,
2488 struct perf_event, event_entry);
bfbd3381 2489
cdd6c482
IM
2490 while (&event->event_entry != &ctx->event_list &&
2491 &next_event->event_entry != &next_ctx->event_list) {
bfbd3381 2492
cdd6c482 2493 __perf_event_sync_stat(event, next_event);
bfbd3381 2494
cdd6c482
IM
2495 event = list_next_entry(event, event_entry);
2496 next_event = list_next_entry(next_event, event_entry);
bfbd3381
PZ
2497 }
2498}
2499
fe4b04fa
PZ
2500static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2501 struct task_struct *next)
0793a61d 2502{
8dc85d54 2503 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
cdd6c482 2504 struct perf_event_context *next_ctx;
5a3126d4 2505 struct perf_event_context *parent, *next_parent;
108b02cf 2506 struct perf_cpu_context *cpuctx;
c93f7669 2507 int do_switch = 1;
0793a61d 2508
108b02cf
PZ
2509 if (likely(!ctx))
2510 return;
10989fb2 2511
108b02cf
PZ
2512 cpuctx = __get_cpu_context(ctx);
2513 if (!cpuctx->task_ctx)
0793a61d
TG
2514 return;
2515
c93f7669 2516 rcu_read_lock();
8dc85d54 2517 next_ctx = next->perf_event_ctxp[ctxn];
5a3126d4
PZ
2518 if (!next_ctx)
2519 goto unlock;
2520
2521 parent = rcu_dereference(ctx->parent_ctx);
2522 next_parent = rcu_dereference(next_ctx->parent_ctx);
2523
2524 /* If neither context have a parent context; they cannot be clones. */
802c8a61 2525 if (!parent && !next_parent)
5a3126d4
PZ
2526 goto unlock;
2527
2528 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
c93f7669
PM
2529 /*
2530 * Looks like the two contexts are clones, so we might be
2531 * able to optimize the context switch. We lock both
2532 * contexts and check that they are clones under the
2533 * lock (including re-checking that neither has been
2534 * uncloned in the meantime). It doesn't matter which
2535 * order we take the locks because no other cpu could
2536 * be trying to lock both of these tasks.
2537 */
e625cce1
TG
2538 raw_spin_lock(&ctx->lock);
2539 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
c93f7669 2540 if (context_equiv(ctx, next_ctx)) {
665c2142
PZ
2541 /*
2542 * XXX do we need a memory barrier of sorts
cdd6c482 2543 * wrt rcu_dereference() of perf_event_ctxp
665c2142 2544 */
8dc85d54
PZ
2545 task->perf_event_ctxp[ctxn] = next_ctx;
2546 next->perf_event_ctxp[ctxn] = ctx;
c93f7669
PM
2547 ctx->task = next;
2548 next_ctx->task = task;
5a158c3c
YZ
2549
2550 swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
2551
c93f7669 2552 do_switch = 0;
bfbd3381 2553
cdd6c482 2554 perf_event_sync_stat(ctx, next_ctx);
c93f7669 2555 }
e625cce1
TG
2556 raw_spin_unlock(&next_ctx->lock);
2557 raw_spin_unlock(&ctx->lock);
564c2b21 2558 }
5a3126d4 2559unlock:
c93f7669 2560 rcu_read_unlock();
564c2b21 2561
c93f7669 2562 if (do_switch) {
facc4307 2563 raw_spin_lock(&ctx->lock);
5b0311e1 2564 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
c93f7669 2565 cpuctx->task_ctx = NULL;
facc4307 2566 raw_spin_unlock(&ctx->lock);
c93f7669 2567 }
0793a61d
TG
2568}
2569
ba532500
YZ
2570void perf_sched_cb_dec(struct pmu *pmu)
2571{
2572 this_cpu_dec(perf_sched_cb_usages);
2573}
2574
2575void perf_sched_cb_inc(struct pmu *pmu)
2576{
2577 this_cpu_inc(perf_sched_cb_usages);
2578}
2579
2580/*
2581 * This function provides the context switch callback to the lower code
2582 * layer. It is invoked ONLY when the context switch callback is enabled.
2583 */
2584static void perf_pmu_sched_task(struct task_struct *prev,
2585 struct task_struct *next,
2586 bool sched_in)
2587{
2588 struct perf_cpu_context *cpuctx;
2589 struct pmu *pmu;
2590 unsigned long flags;
2591
2592 if (prev == next)
2593 return;
2594
2595 local_irq_save(flags);
2596
2597 rcu_read_lock();
2598
2599 list_for_each_entry_rcu(pmu, &pmus, entry) {
2600 if (pmu->sched_task) {
2601 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2602
2603 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2604
2605 perf_pmu_disable(pmu);
2606
2607 pmu->sched_task(cpuctx->task_ctx, sched_in);
2608
2609 perf_pmu_enable(pmu);
2610
2611 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2612 }
2613 }
2614
2615 rcu_read_unlock();
2616
2617 local_irq_restore(flags);
2618}
2619
8dc85d54
PZ
2620#define for_each_task_context_nr(ctxn) \
2621 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
2622
2623/*
2624 * Called from scheduler to remove the events of the current task,
2625 * with interrupts disabled.
2626 *
2627 * We stop each event and update the event value in event->count.
2628 *
2629 * This does not protect us against NMI, but disable()
2630 * sets the disabled bit in the control field of event _before_
2631 * accessing the event control register. If an NMI hits, then it will
2632 * not restart the event.
2633 */
ab0cce56
JO
2634void __perf_event_task_sched_out(struct task_struct *task,
2635 struct task_struct *next)
8dc85d54
PZ
2636{
2637 int ctxn;
2638
ba532500
YZ
2639 if (__this_cpu_read(perf_sched_cb_usages))
2640 perf_pmu_sched_task(task, next, false);
2641
8dc85d54
PZ
2642 for_each_task_context_nr(ctxn)
2643 perf_event_context_sched_out(task, ctxn, next);
e5d1367f
SE
2644
2645 /*
2646 * if cgroup events exist on this CPU, then we need
2647 * to check if we have to switch out PMU state.
2648 * cgroup events are system-wide mode only
2649 */
4a32fea9 2650 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
a8d757ef 2651 perf_cgroup_sched_out(task, next);
8dc85d54
PZ
2652}
2653
04dc2dbb 2654static void task_ctx_sched_out(struct perf_event_context *ctx)
a08b159f 2655{
108b02cf 2656 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
a08b159f 2657
a63eaf34
PM
2658 if (!cpuctx->task_ctx)
2659 return;
012b84da
IM
2660
2661 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2662 return;
2663
04dc2dbb 2664 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
a08b159f
PM
2665 cpuctx->task_ctx = NULL;
2666}
2667
5b0311e1
FW
2668/*
2669 * Called with IRQs disabled
2670 */
2671static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
2672 enum event_type_t event_type)
2673{
2674 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
04289bb9
IM
2675}
2676
235c7fc7 2677static void
5b0311e1 2678ctx_pinned_sched_in(struct perf_event_context *ctx,
6e37738a 2679 struct perf_cpu_context *cpuctx)
0793a61d 2680{
cdd6c482 2681 struct perf_event *event;
0793a61d 2682
889ff015
FW
2683 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2684 if (event->state <= PERF_EVENT_STATE_OFF)
3b6f9e5c 2685 continue;
5632ab12 2686 if (!event_filter_match(event))
3b6f9e5c
PM
2687 continue;
2688
e5d1367f
SE
2689 /* may need to reset tstamp_enabled */
2690 if (is_cgroup_event(event))
2691 perf_cgroup_mark_enabled(event, ctx);
2692
8c9ed8e1 2693 if (group_can_go_on(event, cpuctx, 1))
6e37738a 2694 group_sched_in(event, cpuctx, ctx);
3b6f9e5c
PM
2695
2696 /*
2697 * If this pinned group hasn't been scheduled,
2698 * put it in error state.
2699 */
cdd6c482
IM
2700 if (event->state == PERF_EVENT_STATE_INACTIVE) {
2701 update_group_times(event);
2702 event->state = PERF_EVENT_STATE_ERROR;
53cfbf59 2703 }
3b6f9e5c 2704 }
5b0311e1
FW
2705}
2706
2707static void
2708ctx_flexible_sched_in(struct perf_event_context *ctx,
6e37738a 2709 struct perf_cpu_context *cpuctx)
5b0311e1
FW
2710{
2711 struct perf_event *event;
2712 int can_add_hw = 1;
3b6f9e5c 2713
889ff015
FW
2714 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2715 /* Ignore events in OFF or ERROR state */
2716 if (event->state <= PERF_EVENT_STATE_OFF)
3b6f9e5c 2717 continue;
04289bb9
IM
2718 /*
2719 * Listen to the 'cpu' scheduling filter constraint
cdd6c482 2720 * of events:
04289bb9 2721 */
5632ab12 2722 if (!event_filter_match(event))
0793a61d
TG
2723 continue;
2724
e5d1367f
SE
2725 /* may need to reset tstamp_enabled */
2726 if (is_cgroup_event(event))
2727 perf_cgroup_mark_enabled(event, ctx);
2728
9ed6060d 2729 if (group_can_go_on(event, cpuctx, can_add_hw)) {
6e37738a 2730 if (group_sched_in(event, cpuctx, ctx))
dd0e6ba2 2731 can_add_hw = 0;
9ed6060d 2732 }
0793a61d 2733 }
5b0311e1
FW
2734}
2735
2736static void
2737ctx_sched_in(struct perf_event_context *ctx,
2738 struct perf_cpu_context *cpuctx,
e5d1367f
SE
2739 enum event_type_t event_type,
2740 struct task_struct *task)
5b0311e1 2741{
e5d1367f 2742 u64 now;
db24d33e 2743 int is_active = ctx->is_active;
e5d1367f 2744
db24d33e 2745 ctx->is_active |= event_type;
5b0311e1 2746 if (likely(!ctx->nr_events))
facc4307 2747 return;
5b0311e1 2748
e5d1367f
SE
2749 now = perf_clock();
2750 ctx->timestamp = now;
3f7cce3c 2751 perf_cgroup_set_timestamp(task, ctx);
5b0311e1
FW
2752 /*
2753 * First go through the list and put on any pinned groups
2754 * in order to give them the best chance of going on.
2755 */
db24d33e 2756 if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
6e37738a 2757 ctx_pinned_sched_in(ctx, cpuctx);
5b0311e1
FW
2758
2759 /* Then walk through the lower prio flexible groups */
db24d33e 2760 if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
6e37738a 2761 ctx_flexible_sched_in(ctx, cpuctx);
235c7fc7
IM
2762}
2763
329c0e01 2764static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
e5d1367f
SE
2765 enum event_type_t event_type,
2766 struct task_struct *task)
329c0e01
FW
2767{
2768 struct perf_event_context *ctx = &cpuctx->ctx;
2769
e5d1367f 2770 ctx_sched_in(ctx, cpuctx, event_type, task);
329c0e01
FW
2771}
2772
e5d1367f
SE
2773static void perf_event_context_sched_in(struct perf_event_context *ctx,
2774 struct task_struct *task)
235c7fc7 2775{
108b02cf 2776 struct perf_cpu_context *cpuctx;
235c7fc7 2777
108b02cf 2778 cpuctx = __get_cpu_context(ctx);
329c0e01
FW
2779 if (cpuctx->task_ctx == ctx)
2780 return;
2781
facc4307 2782 perf_ctx_lock(cpuctx, ctx);
1b9a644f 2783 perf_pmu_disable(ctx->pmu);
329c0e01
FW
2784 /*
2785 * We want to keep the following priority order:
2786 * cpu pinned (that don't need to move), task pinned,
2787 * cpu flexible, task flexible.
2788 */
2789 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2790
1d5f003f
GN
2791 if (ctx->nr_events)
2792 cpuctx->task_ctx = ctx;
9b33fa6b 2793
86b47c25
GN
2794 perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
2795
facc4307
PZ
2796 perf_pmu_enable(ctx->pmu);
2797 perf_ctx_unlock(cpuctx, ctx);
235c7fc7
IM
2798}
2799
8dc85d54
PZ
2800/*
2801 * Called from scheduler to add the events of the current task
2802 * with interrupts disabled.
2803 *
2804 * We restore the event value and then enable it.
2805 *
2806 * This does not protect us against NMI, but enable()
2807 * sets the enabled bit in the control field of event _before_
2808 * accessing the event control register. If an NMI hits, then it will
2809 * keep the event running.
2810 */
ab0cce56
JO
2811void __perf_event_task_sched_in(struct task_struct *prev,
2812 struct task_struct *task)
8dc85d54
PZ
2813{
2814 struct perf_event_context *ctx;
2815 int ctxn;
2816
2817 for_each_task_context_nr(ctxn) {
2818 ctx = task->perf_event_ctxp[ctxn];
2819 if (likely(!ctx))
2820 continue;
2821
e5d1367f 2822 perf_event_context_sched_in(ctx, task);
8dc85d54 2823 }
e5d1367f
SE
2824 /*
2825 * if cgroup events exist on this CPU, then we need
2826 * to check if we have to switch in PMU state.
2827 * cgroup events are system-wide mode only
2828 */
4a32fea9 2829 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
a8d757ef 2830 perf_cgroup_sched_in(prev, task);
d010b332 2831
ba532500
YZ
2832 if (__this_cpu_read(perf_sched_cb_usages))
2833 perf_pmu_sched_task(prev, task, true);
235c7fc7
IM
2834}
2835
abd50713
PZ
2836static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
2837{
2838 u64 frequency = event->attr.sample_freq;
2839 u64 sec = NSEC_PER_SEC;
2840 u64 divisor, dividend;
2841
2842 int count_fls, nsec_fls, frequency_fls, sec_fls;
2843
2844 count_fls = fls64(count);
2845 nsec_fls = fls64(nsec);
2846 frequency_fls = fls64(frequency);
2847 sec_fls = 30;
2848
2849 /*
2850 * We got @count in @nsec, with a target of sample_freq HZ
2851 * the target period becomes:
2852 *
2853 * @count * 10^9
2854 * period = -------------------
2855 * @nsec * sample_freq
2856 *
2857 */
2858
2859 /*
2860 * Reduce accuracy by one bit such that @a and @b converge
2861 * to a similar magnitude.
2862 */
fe4b04fa 2863#define REDUCE_FLS(a, b) \
abd50713
PZ
2864do { \
2865 if (a##_fls > b##_fls) { \
2866 a >>= 1; \
2867 a##_fls--; \
2868 } else { \
2869 b >>= 1; \
2870 b##_fls--; \
2871 } \
2872} while (0)
2873
2874 /*
2875 * Reduce accuracy until either term fits in a u64, then proceed with
2876 * the other, so that finally we can do a u64/u64 division.
2877 */
2878 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
2879 REDUCE_FLS(nsec, frequency);
2880 REDUCE_FLS(sec, count);
2881 }
2882
2883 if (count_fls + sec_fls > 64) {
2884 divisor = nsec * frequency;
2885
2886 while (count_fls + sec_fls > 64) {
2887 REDUCE_FLS(count, sec);
2888 divisor >>= 1;
2889 }
2890
2891 dividend = count * sec;
2892 } else {
2893 dividend = count * sec;
2894
2895 while (nsec_fls + frequency_fls > 64) {
2896 REDUCE_FLS(nsec, frequency);
2897 dividend >>= 1;
2898 }
2899
2900 divisor = nsec * frequency;
2901 }
2902
f6ab91ad
PZ
2903 if (!divisor)
2904 return dividend;
2905
abd50713
PZ
2906 return div64_u64(dividend, divisor);
2907}
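
The REDUCE_FLS() loop above exists only because count * 10^9 and nsec * sample_freq can each overflow a u64; operands lose low-order bits until a u64/u64 division is possible. As a cross-check, here is a user-space sketch (not kernel code, assuming a compiler with the __int128 extension such as gcc or clang) computing the exact value that perf_calculate_period() approximates:

#include <stdint.h>
#include <stdio.h>

static uint64_t exact_period(uint64_t count, uint64_t nsec, uint64_t sample_freq)
{
	unsigned __int128 dividend = (unsigned __int128)count * 1000000000ull;
	unsigned __int128 divisor  = (unsigned __int128)nsec * sample_freq;

	if (!divisor)
		return 0;	/* degenerate input; the kernel falls back to the reduced dividend */

	return (uint64_t)(dividend / divisor);
}

int main(void)
{
	/* 2,000,000 events in 4 ms against a 1000 Hz sampling target -> period 500000. */
	printf("period = %llu\n",
	       (unsigned long long)exact_period(2000000, 4000000, 1000));
	return 0;
}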
2908
e050e3f0
SE
2909static DEFINE_PER_CPU(int, perf_throttled_count);
2910static DEFINE_PER_CPU(u64, perf_throttled_seq);
2911
f39d47ff 2912static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
bd2b5b12 2913{
cdd6c482 2914 struct hw_perf_event *hwc = &event->hw;
f6ab91ad 2915 s64 period, sample_period;
bd2b5b12
PZ
2916 s64 delta;
2917
abd50713 2918 period = perf_calculate_period(event, nsec, count);
bd2b5b12
PZ
2919
2920 delta = (s64)(period - hwc->sample_period);
2921 delta = (delta + 7) / 8; /* low pass filter */
2922
2923 sample_period = hwc->sample_period + delta;
2924
2925 if (!sample_period)
2926 sample_period = 1;
2927
bd2b5b12 2928 hwc->sample_period = sample_period;
abd50713 2929
e7850595 2930 if (local64_read(&hwc->period_left) > 8*sample_period) {
f39d47ff
SE
2931 if (disable)
2932 event->pmu->stop(event, PERF_EF_UPDATE);
2933
e7850595 2934 local64_set(&hwc->period_left, 0);
f39d47ff
SE
2935
2936 if (disable)
2937 event->pmu->start(event, PERF_EF_RELOAD);
abd50713 2938 }
bd2b5b12
PZ
2939}
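
Note how perf_adjust_period() applies only one eighth of the computed correction per call (delta = (delta + 7) / 8), so a single noisy interval cannot yank the sample period around. A stand-alone illustration of that low-pass behaviour (user-space sketch, not kernel code):

#include <stdint.h>
#include <stdio.h>

static int64_t filter_period(int64_t sample_period, int64_t new_period)
{
	int64_t delta = new_period - sample_period;

	delta = (delta + 7) / 8;	/* low pass filter, as in perf_adjust_period() */
	sample_period += delta;

	return sample_period ? sample_period : 1;
}

int main(void)
{
	int64_t period = 100000;

	/* A sudden demand for a 900000-unit period is absorbed gradually. */
	for (int tick = 0; tick < 5; tick++) {
		period = filter_period(period, 900000);
		printf("tick %d: period = %lld\n", tick, (long long)period);
	}
	return 0;
}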
2940
e050e3f0
SE
2941/*
2942 * combine freq adjustment with unthrottling to avoid two passes over the
2943 * events. At the same time, make sure, having freq events does not change
2944 * the rate of unthrottling as that would introduce bias.
2945 */
2946static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2947 int needs_unthr)
60db5e09 2948{
cdd6c482
IM
2949 struct perf_event *event;
2950 struct hw_perf_event *hwc;
e050e3f0 2951 u64 now, period = TICK_NSEC;
abd50713 2952 s64 delta;
60db5e09 2953
e050e3f0
SE
2954 /*
2955 * only need to iterate over all events iff:
2956 * - context have events in frequency mode (needs freq adjust)
2957 * - there are events to unthrottle on this cpu
2958 */
2959 if (!(ctx->nr_freq || needs_unthr))
0f5a2601
PZ
2960 return;
2961
e050e3f0 2962 raw_spin_lock(&ctx->lock);
f39d47ff 2963 perf_pmu_disable(ctx->pmu);
e050e3f0 2964
03541f8b 2965 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
cdd6c482 2966 if (event->state != PERF_EVENT_STATE_ACTIVE)
60db5e09
PZ
2967 continue;
2968
5632ab12 2969 if (!event_filter_match(event))
5d27c23d
PZ
2970 continue;
2971
44377277
AS
2972 perf_pmu_disable(event->pmu);
2973
cdd6c482 2974 hwc = &event->hw;
6a24ed6c 2975
ae23bff1 2976 if (hwc->interrupts == MAX_INTERRUPTS) {
e050e3f0 2977 hwc->interrupts = 0;
cdd6c482 2978 perf_log_throttle(event, 1);
a4eaf7f1 2979 event->pmu->start(event, 0);
a78ac325
PZ
2980 }
2981
cdd6c482 2982 if (!event->attr.freq || !event->attr.sample_freq)
44377277 2983 goto next;
60db5e09 2984
e050e3f0
SE
2985 /*
2986 * stop the event and update event->count
2987 */
2988 event->pmu->stop(event, PERF_EF_UPDATE);
2989
e7850595 2990 now = local64_read(&event->count);
abd50713
PZ
2991 delta = now - hwc->freq_count_stamp;
2992 hwc->freq_count_stamp = now;
60db5e09 2993
e050e3f0
SE
2994 /*
2995 * restart the event
2996 * reload only if value has changed
f39d47ff
SE
2997 * we have stopped the event so tell that
2998 * to perf_adjust_period() to avoid stopping it
2999 * twice.
e050e3f0 3000 */
abd50713 3001 if (delta > 0)
f39d47ff 3002 perf_adjust_period(event, period, delta, false);
e050e3f0
SE
3003
3004 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
44377277
AS
3005 next:
3006 perf_pmu_enable(event->pmu);
60db5e09 3007 }
e050e3f0 3008
f39d47ff 3009 perf_pmu_enable(ctx->pmu);
e050e3f0 3010 raw_spin_unlock(&ctx->lock);
60db5e09
PZ
3011}
3012
235c7fc7 3013/*
cdd6c482 3014 * Round-robin a context's events:
235c7fc7 3015 */
cdd6c482 3016static void rotate_ctx(struct perf_event_context *ctx)
0793a61d 3017{
dddd3379
TG
3018 /*
3019 * Rotate the first entry last of non-pinned groups. Rotation might be
3020 * disabled by the inheritance code.
3021 */
3022 if (!ctx->rotate_disable)
3023 list_rotate_left(&ctx->flexible_groups);
235c7fc7
IM
3024}
3025
9e630205 3026static int perf_rotate_context(struct perf_cpu_context *cpuctx)
235c7fc7 3027{
8dc85d54 3028 struct perf_event_context *ctx = NULL;
2fde4f94 3029 int rotate = 0;
7fc23a53 3030
b5ab4cd5 3031 if (cpuctx->ctx.nr_events) {
b5ab4cd5
PZ
3032 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
3033 rotate = 1;
3034 }
235c7fc7 3035
8dc85d54 3036 ctx = cpuctx->task_ctx;
b5ab4cd5 3037 if (ctx && ctx->nr_events) {
b5ab4cd5
PZ
3038 if (ctx->nr_events != ctx->nr_active)
3039 rotate = 1;
3040 }
9717e6cd 3041
e050e3f0 3042 if (!rotate)
0f5a2601
PZ
3043 goto done;
3044
facc4307 3045 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
1b9a644f 3046 perf_pmu_disable(cpuctx->ctx.pmu);
60db5e09 3047
e050e3f0
SE
3048 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3049 if (ctx)
3050 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
0793a61d 3051
e050e3f0
SE
3052 rotate_ctx(&cpuctx->ctx);
3053 if (ctx)
3054 rotate_ctx(ctx);
235c7fc7 3055
e050e3f0 3056 perf_event_sched_in(cpuctx, ctx, current);
235c7fc7 3057
0f5a2601
PZ
3058 perf_pmu_enable(cpuctx->ctx.pmu);
3059 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
b5ab4cd5 3060done:
9e630205
SE
3061
3062 return rotate;
e9d2b064
PZ
3063}
3064
026249ef
FW
3065#ifdef CONFIG_NO_HZ_FULL
3066bool perf_event_can_stop_tick(void)
3067{
948b26b6 3068 if (atomic_read(&nr_freq_events) ||
d84153d6 3069 __this_cpu_read(perf_throttled_count))
026249ef 3070 return false;
d84153d6
FW
3071 else
3072 return true;
026249ef
FW
3073}
3074#endif
3075
e9d2b064
PZ
3076void perf_event_task_tick(void)
3077{
2fde4f94
MR
3078 struct list_head *head = this_cpu_ptr(&active_ctx_list);
3079 struct perf_event_context *ctx, *tmp;
e050e3f0 3080 int throttled;
b5ab4cd5 3081
e9d2b064
PZ
3082 WARN_ON(!irqs_disabled());
3083
e050e3f0
SE
3084 __this_cpu_inc(perf_throttled_seq);
3085 throttled = __this_cpu_xchg(perf_throttled_count, 0);
3086
2fde4f94 3087 list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
e050e3f0 3088 perf_adjust_freq_unthr_context(ctx, throttled);
0793a61d
TG
3089}
3090
889ff015
FW
3091static int event_enable_on_exec(struct perf_event *event,
3092 struct perf_event_context *ctx)
3093{
3094 if (!event->attr.enable_on_exec)
3095 return 0;
3096
3097 event->attr.enable_on_exec = 0;
3098 if (event->state >= PERF_EVENT_STATE_INACTIVE)
3099 return 0;
3100
1d9b482e 3101 __perf_event_mark_enabled(event);
889ff015
FW
3102
3103 return 1;
3104}
3105
57e7986e 3106/*
cdd6c482 3107 * Enable all of a task's events that have been marked enable-on-exec.
57e7986e
PM
3108 * This expects task == current.
3109 */
8dc85d54 3110static void perf_event_enable_on_exec(struct perf_event_context *ctx)
57e7986e 3111{
211de6eb 3112 struct perf_event_context *clone_ctx = NULL;
cdd6c482 3113 struct perf_event *event;
57e7986e
PM
3114 unsigned long flags;
3115 int enabled = 0;
889ff015 3116 int ret;
57e7986e
PM
3117
3118 local_irq_save(flags);
cdd6c482 3119 if (!ctx || !ctx->nr_events)
57e7986e
PM
3120 goto out;
3121
e566b76e
SE
3122 /*
3123 * We must ctxsw out cgroup events to avoid conflict
3124 * when invoking perf_task_event_sched_in() later on
3125 * in this function. Otherwise we end up trying to
3126 * ctxswin cgroup events which are already scheduled
3127 * in.
3128 */
a8d757ef 3129 perf_cgroup_sched_out(current, NULL);
57e7986e 3130
e625cce1 3131 raw_spin_lock(&ctx->lock);
04dc2dbb 3132 task_ctx_sched_out(ctx);
57e7986e 3133
b79387ef 3134 list_for_each_entry(event, &ctx->event_list, event_entry) {
889ff015
FW
3135 ret = event_enable_on_exec(event, ctx);
3136 if (ret)
3137 enabled = 1;
57e7986e
PM
3138 }
3139
3140 /*
cdd6c482 3141 * Unclone this context if we enabled any event.
57e7986e 3142 */
71a851b4 3143 if (enabled)
211de6eb 3144 clone_ctx = unclone_ctx(ctx);
57e7986e 3145
e625cce1 3146 raw_spin_unlock(&ctx->lock);
57e7986e 3147
e566b76e
SE
3148 /*
3149 * Also calls ctxswin for cgroup events, if any:
3150 */
e5d1367f 3151 perf_event_context_sched_in(ctx, ctx->task);
9ed6060d 3152out:
57e7986e 3153 local_irq_restore(flags);
211de6eb
PZ
3154
3155 if (clone_ctx)
3156 put_ctx(clone_ctx);
57e7986e
PM
3157}
3158
e041e328
PZ
3159void perf_event_exec(void)
3160{
3161 struct perf_event_context *ctx;
3162 int ctxn;
3163
3164 rcu_read_lock();
3165 for_each_task_context_nr(ctxn) {
3166 ctx = current->perf_event_ctxp[ctxn];
3167 if (!ctx)
3168 continue;
3169
3170 perf_event_enable_on_exec(ctx);
3171 }
3172 rcu_read_unlock();
3173}
3174
0793a61d 3175/*
cdd6c482 3176 * Cross CPU call to read the hardware event
0793a61d 3177 */
cdd6c482 3178static void __perf_event_read(void *info)
0793a61d 3179{
cdd6c482
IM
3180 struct perf_event *event = info;
3181 struct perf_event_context *ctx = event->ctx;
108b02cf 3182 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
621a01ea 3183
e1ac3614
PM
3184 /*
3185 * If this is a task context, we need to check whether it is
3186 * the current task context of this cpu. If not, it has been
3187 * scheduled out before the smp call arrived. In that case
cdd6c482
IM
3188 * event->count would have been updated to a recent sample
3189 * when the event was scheduled out.
e1ac3614
PM
3190 */
3191 if (ctx->task && cpuctx->task_ctx != ctx)
3192 return;
3193
e625cce1 3194 raw_spin_lock(&ctx->lock);
e5d1367f 3195 if (ctx->is_active) {
542e72fc 3196 update_context_time(ctx);
e5d1367f
SE
3197 update_cgrp_time_from_event(event);
3198 }
cdd6c482 3199 update_event_times(event);
542e72fc
PZ
3200 if (event->state == PERF_EVENT_STATE_ACTIVE)
3201 event->pmu->read(event);
e625cce1 3202 raw_spin_unlock(&ctx->lock);
0793a61d
TG
3203}
3204
b5e58793
PZ
3205static inline u64 perf_event_count(struct perf_event *event)
3206{
eacd3ecc
MF
3207 if (event->pmu->count)
3208 return event->pmu->count(event);
3209
3210 return __perf_event_count(event);
b5e58793
PZ
3211}
3212
cdd6c482 3213static u64 perf_event_read(struct perf_event *event)
0793a61d
TG
3214{
3215 /*
cdd6c482
IM
3216 * If event is enabled and currently active on a CPU, update the
3217 * value in the event structure:
0793a61d 3218 */
cdd6c482
IM
3219 if (event->state == PERF_EVENT_STATE_ACTIVE) {
3220 smp_call_function_single(event->oncpu,
3221 __perf_event_read, event, 1);
3222 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
2b8988c9
PZ
3223 struct perf_event_context *ctx = event->ctx;
3224 unsigned long flags;
3225
e625cce1 3226 raw_spin_lock_irqsave(&ctx->lock, flags);
c530ccd9
SE
3227 /*
3228 * may read while context is not active
3229 * (e.g., thread is blocked), in that case
3230 * we cannot update context time
3231 */
e5d1367f 3232 if (ctx->is_active) {
c530ccd9 3233 update_context_time(ctx);
e5d1367f
SE
3234 update_cgrp_time_from_event(event);
3235 }
cdd6c482 3236 update_event_times(event);
e625cce1 3237 raw_spin_unlock_irqrestore(&ctx->lock, flags);
0793a61d
TG
3238 }
3239
b5e58793 3240 return perf_event_count(event);
0793a61d
TG
3241}
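
The count returned by perf_event_read(), together with the total_time_enabled/total_time_running bookkeeping maintained by update_event_times(), is what read() on the event file descriptor ultimately reports. A user-space sketch (not part of this file) showing how those two times are commonly used to scale a count from a multiplexed event:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>
#include <stdio.h>

struct read_value {
	uint64_t value;		/* the counter itself                       */
	uint64_t time_enabled;	/* PERF_FORMAT_TOTAL_TIME_ENABLED           */
	uint64_t time_running;	/* PERF_FORMAT_TOTAL_TIME_RUNNING           */
};

int main(void)
{
	struct perf_event_attr attr;
	struct read_value rv;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;
	attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
			   PERF_FORMAT_TOTAL_TIME_RUNNING;

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		return 1;

	for (volatile long i = 0; i < 10000000; i++)
		;	/* something to measure */

	if (read(fd, &rv, sizeof(rv)) == sizeof(rv) && rv.time_running)
		printf("count %llu (scaled %llu)\n",
		       (unsigned long long)rv.value,
		       (unsigned long long)(rv.value * rv.time_enabled / rv.time_running));

	close(fd);
	return 0;
}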
3242
a63eaf34 3243/*
cdd6c482 3244 * Initialize the perf_event context in a task_struct:
a63eaf34 3245 */
eb184479 3246static void __perf_event_init_context(struct perf_event_context *ctx)
a63eaf34 3247{
e625cce1 3248 raw_spin_lock_init(&ctx->lock);
a63eaf34 3249 mutex_init(&ctx->mutex);
2fde4f94 3250 INIT_LIST_HEAD(&ctx->active_ctx_list);
889ff015
FW
3251 INIT_LIST_HEAD(&ctx->pinned_groups);
3252 INIT_LIST_HEAD(&ctx->flexible_groups);
a63eaf34
PM
3253 INIT_LIST_HEAD(&ctx->event_list);
3254 atomic_set(&ctx->refcount, 1);
fadfe7be 3255 INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
eb184479
PZ
3256}
3257
3258static struct perf_event_context *
3259alloc_perf_context(struct pmu *pmu, struct task_struct *task)
3260{
3261 struct perf_event_context *ctx;
3262
3263 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
3264 if (!ctx)
3265 return NULL;
3266
3267 __perf_event_init_context(ctx);
3268 if (task) {
3269 ctx->task = task;
3270 get_task_struct(task);
0793a61d 3271 }
eb184479
PZ
3272 ctx->pmu = pmu;
3273
3274 return ctx;
a63eaf34
PM
3275}
3276
2ebd4ffb
MH
3277static struct task_struct *
3278find_lively_task_by_vpid(pid_t vpid)
3279{
3280 struct task_struct *task;
3281 int err;
0793a61d
TG
3282
3283 rcu_read_lock();
2ebd4ffb 3284 if (!vpid)
0793a61d
TG
3285 task = current;
3286 else
2ebd4ffb 3287 task = find_task_by_vpid(vpid);
0793a61d
TG
3288 if (task)
3289 get_task_struct(task);
3290 rcu_read_unlock();
3291
3292 if (!task)
3293 return ERR_PTR(-ESRCH);
3294
0793a61d 3295 /* Reuse ptrace permission checks for now. */
c93f7669
PM
3296 err = -EACCES;
3297 if (!ptrace_may_access(task, PTRACE_MODE_READ))
3298 goto errout;
3299
2ebd4ffb
MH
3300 return task;
3301errout:
3302 put_task_struct(task);
3303 return ERR_PTR(err);
3304
3305}
3306
fe4b04fa
PZ
3307/*
3308 * Returns a matching context with refcount and pincount.
3309 */
108b02cf 3310static struct perf_event_context *
4af57ef2
YZ
3311find_get_context(struct pmu *pmu, struct task_struct *task,
3312 struct perf_event *event)
0793a61d 3313{
211de6eb 3314 struct perf_event_context *ctx, *clone_ctx = NULL;
22a4f650 3315 struct perf_cpu_context *cpuctx;
4af57ef2 3316 void *task_ctx_data = NULL;
25346b93 3317 unsigned long flags;
8dc85d54 3318 int ctxn, err;
4af57ef2 3319 int cpu = event->cpu;
0793a61d 3320
22a4ec72 3321 if (!task) {
cdd6c482 3322 /* Must be root to operate on a CPU event: */
0764771d 3323 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
0793a61d
TG
3324 return ERR_PTR(-EACCES);
3325
0793a61d 3326 /*
cdd6c482 3327 * We could be clever and allow attaching an event to an
0793a61d
TG
3328 * offline CPU and activate it when the CPU comes up, but
3329 * that's for later.
3330 */
f6325e30 3331 if (!cpu_online(cpu))
0793a61d
TG
3332 return ERR_PTR(-ENODEV);
3333
108b02cf 3334 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
0793a61d 3335 ctx = &cpuctx->ctx;
c93f7669 3336 get_ctx(ctx);
fe4b04fa 3337 ++ctx->pin_count;
0793a61d 3338
0793a61d
TG
3339 return ctx;
3340 }
3341
8dc85d54
PZ
3342 err = -EINVAL;
3343 ctxn = pmu->task_ctx_nr;
3344 if (ctxn < 0)
3345 goto errout;
3346
4af57ef2
YZ
3347 if (event->attach_state & PERF_ATTACH_TASK_DATA) {
3348 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
3349 if (!task_ctx_data) {
3350 err = -ENOMEM;
3351 goto errout;
3352 }
3353 }
3354
9ed6060d 3355retry:
8dc85d54 3356 ctx = perf_lock_task_context(task, ctxn, &flags);
c93f7669 3357 if (ctx) {
211de6eb 3358 clone_ctx = unclone_ctx(ctx);
fe4b04fa 3359 ++ctx->pin_count;
4af57ef2
YZ
3360
3361 if (task_ctx_data && !ctx->task_ctx_data) {
3362 ctx->task_ctx_data = task_ctx_data;
3363 task_ctx_data = NULL;
3364 }
e625cce1 3365 raw_spin_unlock_irqrestore(&ctx->lock, flags);
211de6eb
PZ
3366
3367 if (clone_ctx)
3368 put_ctx(clone_ctx);
9137fb28 3369 } else {
eb184479 3370 ctx = alloc_perf_context(pmu, task);
c93f7669
PM
3371 err = -ENOMEM;
3372 if (!ctx)
3373 goto errout;
eb184479 3374
4af57ef2
YZ
3375 if (task_ctx_data) {
3376 ctx->task_ctx_data = task_ctx_data;
3377 task_ctx_data = NULL;
3378 }
3379
dbe08d82
ON
3380 err = 0;
3381 mutex_lock(&task->perf_event_mutex);
3382 /*
3383 * If it has already passed perf_event_exit_task(),
3384 * we must see PF_EXITING; it takes this mutex too.
3385 */
3386 if (task->flags & PF_EXITING)
3387 err = -ESRCH;
3388 else if (task->perf_event_ctxp[ctxn])
3389 err = -EAGAIN;
fe4b04fa 3390 else {
9137fb28 3391 get_ctx(ctx);
fe4b04fa 3392 ++ctx->pin_count;
dbe08d82 3393 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
fe4b04fa 3394 }
dbe08d82
ON
3395 mutex_unlock(&task->perf_event_mutex);
3396
3397 if (unlikely(err)) {
9137fb28 3398 put_ctx(ctx);
dbe08d82
ON
3399
3400 if (err == -EAGAIN)
3401 goto retry;
3402 goto errout;
a63eaf34
PM
3403 }
3404 }
3405
4af57ef2 3406 kfree(task_ctx_data);
0793a61d 3407 return ctx;
c93f7669 3408
9ed6060d 3409errout:
4af57ef2 3410 kfree(task_ctx_data);
c93f7669 3411 return ERR_PTR(err);
0793a61d
TG
3412}
3413
6fb2915d 3414static void perf_event_free_filter(struct perf_event *event);
2541517c 3415static void perf_event_free_bpf_prog(struct perf_event *event);
6fb2915d 3416
cdd6c482 3417static void free_event_rcu(struct rcu_head *head)
592903cd 3418{
cdd6c482 3419 struct perf_event *event;
592903cd 3420
cdd6c482
IM
3421 event = container_of(head, struct perf_event, rcu_head);
3422 if (event->ns)
3423 put_pid_ns(event->ns);
6fb2915d 3424 perf_event_free_filter(event);
2541517c 3425 perf_event_free_bpf_prog(event);
cdd6c482 3426 kfree(event);
592903cd
PZ
3427}
3428
b69cf536
PZ
3429static void ring_buffer_attach(struct perf_event *event,
3430 struct ring_buffer *rb);
925d519a 3431
4beb31f3 3432static void unaccount_event_cpu(struct perf_event *event, int cpu)
f1600952 3433{
4beb31f3
FW
3434 if (event->parent)
3435 return;
3436
4beb31f3
FW
3437 if (is_cgroup_event(event))
3438 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
3439}
925d519a 3440
4beb31f3
FW
3441static void unaccount_event(struct perf_event *event)
3442{
3443 if (event->parent)
3444 return;
3445
3446 if (event->attach_state & PERF_ATTACH_TASK)
3447 static_key_slow_dec_deferred(&perf_sched_events);
3448 if (event->attr.mmap || event->attr.mmap_data)
3449 atomic_dec(&nr_mmap_events);
3450 if (event->attr.comm)
3451 atomic_dec(&nr_comm_events);
3452 if (event->attr.task)
3453 atomic_dec(&nr_task_events);
948b26b6
FW
3454 if (event->attr.freq)
3455 atomic_dec(&nr_freq_events);
4beb31f3
FW
3456 if (is_cgroup_event(event))
3457 static_key_slow_dec_deferred(&perf_sched_events);
3458 if (has_branch_stack(event))
3459 static_key_slow_dec_deferred(&perf_sched_events);
3460
3461 unaccount_event_cpu(event, event->cpu);
3462}
925d519a 3463
bed5b25a
AS
3464/*
3465 * The following implement mutual exclusion of events on "exclusive" pmus
3466 * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
3467 * at a time, so we disallow creating events that might conflict, namely:
3468 *
3469 * 1) cpu-wide events in the presence of per-task events,
3470 * 2) per-task events in the presence of cpu-wide events,
3471 * 3) two matching events on the same context.
3472 *
3473 * The former two cases are handled in the allocation path (perf_event_alloc(),
3474 * __free_event()), the latter -- before the first perf_install_in_context().
3475 */
3476static int exclusive_event_init(struct perf_event *event)
3477{
3478 struct pmu *pmu = event->pmu;
3479
3480 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3481 return 0;
3482
3483 /*
3484 * Prevent co-existence of per-task and cpu-wide events on the
3485 * same exclusive pmu.
3486 *
3487 * Negative pmu::exclusive_cnt means there are cpu-wide
3488 * events on this "exclusive" pmu, positive means there are
3489 * per-task events.
3490 *
3491 * Since this is called in perf_event_alloc() path, event::ctx
3492 * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
3493 * to mean "per-task event", because unlike other attach states it
3494 * never gets cleared.
3495 */
3496 if (event->attach_state & PERF_ATTACH_TASK) {
3497 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
3498 return -EBUSY;
3499 } else {
3500 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
3501 return -EBUSY;
3502 }
3503
3504 return 0;
3505}
3506
3507static void exclusive_event_destroy(struct perf_event *event)
3508{
3509 struct pmu *pmu = event->pmu;
3510
3511 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3512 return;
3513
3514 /* see comment in exclusive_event_init() */
3515 if (event->attach_state & PERF_ATTACH_TASK)
3516 atomic_dec(&pmu->exclusive_cnt);
3517 else
3518 atomic_inc(&pmu->exclusive_cnt);
3519}
3520
3521static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
3522{
3523 if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) &&
3524 (e1->cpu == e2->cpu ||
3525 e1->cpu == -1 ||
3526 e2->cpu == -1))
3527 return true;
3528 return false;
3529}
3530
3531/* Called under the same ctx::mutex as perf_install_in_context() */
3532static bool exclusive_event_installable(struct perf_event *event,
3533 struct perf_event_context *ctx)
3534{
3535 struct perf_event *iter_event;
3536 struct pmu *pmu = event->pmu;
3537
3538 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3539 return true;
3540
3541 list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
3542 if (exclusive_event_match(iter_event, event))
3543 return false;
3544 }
3545
3546 return true;
3547}
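
The exclusive-pmu accounting above packs two populations into one counter: pmu->exclusive_cnt goes positive while per-task events exist and negative while cpu-wide events exist, and atomic_inc_unless_negative()/atomic_dec_unless_positive() refuse to mix the two. A toy user-space model of that convention using C11 atomics (the helper names here are invented for illustration, not kernel API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool inc_unless_negative(atomic_int *v)
{
	int old = atomic_load(v);

	do {
		if (old < 0)
			return false;	/* cpu-wide events present: refuse */
	} while (!atomic_compare_exchange_weak(v, &old, old + 1));
	return true;
}

static bool dec_unless_positive(atomic_int *v)
{
	int old = atomic_load(v);

	do {
		if (old > 0)
			return false;	/* per-task events present: refuse */
	} while (!atomic_compare_exchange_weak(v, &old, old - 1));
	return true;
}

int main(void)
{
	atomic_int exclusive_cnt = 0;

	printf("per-task #1: %d\n", inc_unless_negative(&exclusive_cnt)); /* 1: ok      */
	printf("per-task #2: %d\n", inc_unless_negative(&exclusive_cnt)); /* 1: ok      */
	printf("cpu-wide:    %d\n", dec_unless_positive(&exclusive_cnt)); /* 0: refused */
	return 0;
}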
3548
766d6c07
FW
3549static void __free_event(struct perf_event *event)
3550{
cdd6c482 3551 if (!event->parent) {
927c7a9e
FW
3552 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
3553 put_callchain_buffers();
f344011c 3554 }
9ee318a7 3555
766d6c07
FW
3556 if (event->destroy)
3557 event->destroy(event);
3558
3559 if (event->ctx)
3560 put_ctx(event->ctx);
3561
bed5b25a
AS
3562 if (event->pmu) {
3563 exclusive_event_destroy(event);
c464c76e 3564 module_put(event->pmu->module);
bed5b25a 3565 }
c464c76e 3566
766d6c07
FW
3567 call_rcu(&event->rcu_head, free_event_rcu);
3568}
683ede43
PZ
3569
3570static void _free_event(struct perf_event *event)
f1600952 3571{
e360adbe 3572 irq_work_sync(&event->pending);
925d519a 3573
4beb31f3 3574 unaccount_event(event);
9ee318a7 3575
76369139 3576 if (event->rb) {
9bb5d40c
PZ
3577 /*
3578 * Can happen when we close an event with re-directed output.
3579 *
3580 * Since we have a 0 refcount, perf_mmap_close() will skip
3581 * over us; possibly making our ring_buffer_put() the last.
3582 */
3583 mutex_lock(&event->mmap_mutex);
b69cf536 3584 ring_buffer_attach(event, NULL);
9bb5d40c 3585 mutex_unlock(&event->mmap_mutex);
a4be7c27
PZ
3586 }
3587
e5d1367f
SE
3588 if (is_cgroup_event(event))
3589 perf_detach_cgroup(event);
3590
766d6c07 3591 __free_event(event);
f1600952
PZ
3592}
3593
683ede43
PZ
3594/*
3595 * Used to free events which have a known refcount of 1, such as in error paths
3596 * where the event isn't exposed yet and inherited events.
3597 */
3598static void free_event(struct perf_event *event)
0793a61d 3599{
683ede43
PZ
3600 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
3601 "unexpected event refcount: %ld; ptr=%p\n",
3602 atomic_long_read(&event->refcount), event)) {
3603 /* leak to avoid use-after-free */
3604 return;
3605 }
0793a61d 3606
683ede43 3607 _free_event(event);
0793a61d
TG
3608}
3609
a66a3052 3610/*
f8697762 3611 * Remove user event from the owner task.
a66a3052 3612 */
f8697762 3613static void perf_remove_from_owner(struct perf_event *event)
fb0459d7 3614{
8882135b 3615 struct task_struct *owner;
fb0459d7 3616
8882135b
PZ
3617 rcu_read_lock();
3618 owner = ACCESS_ONCE(event->owner);
3619 /*
3620 * Matches the smp_wmb() in perf_event_exit_task(). If we observe
3621 * !owner it means the list deletion is complete and we can indeed
3622 * free this event, otherwise we need to serialize on
3623 * owner->perf_event_mutex.
3624 */
3625 smp_read_barrier_depends();
3626 if (owner) {
3627 /*
3628 * Since delayed_put_task_struct() also drops the last
3629 * task reference we can safely take a new reference
3630 * while holding the rcu_read_lock().
3631 */
3632 get_task_struct(owner);
3633 }
3634 rcu_read_unlock();
3635
3636 if (owner) {
f63a8daa
PZ
3637 /*
3638 * If we're here through perf_event_exit_task() we're already
3639 * holding ctx->mutex which would be an inversion wrt. the
3640 * normal lock order.
3641 *
3642 * However we can safely take this lock because it's the child
3643 * ctx->mutex.
3644 */
3645 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
3646
8882135b
PZ
3647 /*
3648 * We have to re-check the event->owner field, if it is cleared
3649 * we raced with perf_event_exit_task(), acquiring the mutex
3650 * ensured they're done, and we can proceed with freeing the
3651 * event.
3652 */
3653 if (event->owner)
3654 list_del_init(&event->owner_entry);
3655 mutex_unlock(&owner->perf_event_mutex);
3656 put_task_struct(owner);
3657 }
f8697762
JO
3658}
3659
3660/*
3661 * Called when the last reference to the file is gone.
3662 */
3663static void put_event(struct perf_event *event)
3664{
a83fe28e 3665 struct perf_event_context *ctx;
f8697762
JO
3666
3667 if (!atomic_long_dec_and_test(&event->refcount))
3668 return;
3669
3670 if (!is_kernel_event(event))
3671 perf_remove_from_owner(event);
8882135b 3672
683ede43
PZ
3673 /*
3674 * There are two ways this annotation is useful:
3675 *
3676 * 1) there is a lock recursion from perf_event_exit_task();
3677 * see the comment there.
3678 *
3679 * 2) there is a lock-inversion with mmap_sem through
3680 * perf_event_read_group(), which takes faults while
3681 * holding ctx->mutex; however, this is called after
3682 * the last filedesc died, so there is no possibility
3683 * of triggering the AB-BA case.
3684 */
a83fe28e
PZ
3685 ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
3686 WARN_ON_ONCE(ctx->parent_ctx);
683ede43 3687 perf_remove_from_context(event, true);
d415a7f1 3688 perf_event_ctx_unlock(event, ctx);
683ede43
PZ
3689
3690 _free_event(event);
a6fa941d
AV
3691}
3692
683ede43
PZ
3693int perf_event_release_kernel(struct perf_event *event)
3694{
3695 put_event(event);
3696 return 0;
3697}
3698EXPORT_SYMBOL_GPL(perf_event_release_kernel);
3699
a6fa941d
AV
3700static int perf_release(struct inode *inode, struct file *file)
3701{
3702 put_event(file->private_data);
3703 return 0;
fb0459d7 3704}
fb0459d7 3705
fadfe7be
JO
3706/*
3707 * Remove all orphaned events from the context.
3708 */
3709static void orphans_remove_work(struct work_struct *work)
3710{
3711 struct perf_event_context *ctx;
3712 struct perf_event *event, *tmp;
3713
3714 ctx = container_of(work, struct perf_event_context,
3715 orphans_remove.work);
3716
3717 mutex_lock(&ctx->mutex);
3718 list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
3719 struct perf_event *parent_event = event->parent;
3720
3721 if (!is_orphaned_child(event))
3722 continue;
3723
3724 perf_remove_from_context(event, true);
3725
3726 mutex_lock(&parent_event->child_mutex);
3727 list_del_init(&event->child_list);
3728 mutex_unlock(&parent_event->child_mutex);
3729
3730 free_event(event);
3731 put_event(parent_event);
3732 }
3733
3734 raw_spin_lock_irq(&ctx->lock);
3735 ctx->orphans_remove_sched = false;
3736 raw_spin_unlock_irq(&ctx->lock);
3737 mutex_unlock(&ctx->mutex);
3738
3739 put_ctx(ctx);
3740}
3741
59ed446f 3742u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
e53c0994 3743{
cdd6c482 3744 struct perf_event *child;
e53c0994
PZ
3745 u64 total = 0;
3746
59ed446f
PZ
3747 *enabled = 0;
3748 *running = 0;
3749
6f10581a 3750 mutex_lock(&event->child_mutex);
cdd6c482 3751 total += perf_event_read(event);
59ed446f
PZ
3752 *enabled += event->total_time_enabled +
3753 atomic64_read(&event->child_total_time_enabled);
3754 *running += event->total_time_running +
3755 atomic64_read(&event->child_total_time_running);
3756
3757 list_for_each_entry(child, &event->child_list, child_list) {
cdd6c482 3758 total += perf_event_read(child);
59ed446f
PZ
3759 *enabled += child->total_time_enabled;
3760 *running += child->total_time_running;
3761 }
6f10581a 3762 mutex_unlock(&event->child_mutex);
e53c0994
PZ
3763
3764 return total;
3765}
fb0459d7 3766EXPORT_SYMBOL_GPL(perf_event_read_value);
e53c0994 3767
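/*
 * Editor's sketch (not part of this file): how an in-kernel user of the
 * exported helper above might report a counter it created earlier with
 * perf_event_create_kernel_counter(); the function name is hypothetical.
 */
#if 0	/* illustrative only */
static void hypothetical_report_counter(struct perf_event *event)
{
	u64 enabled, running, count;

	count = perf_event_read_value(event, &enabled, &running);
	/* enabled != running means the event was multiplexed; scale if needed */
	pr_info("count=%llu enabled=%llu running=%llu\n",
		(unsigned long long)count,
		(unsigned long long)enabled,
		(unsigned long long)running);
}
#endif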
cdd6c482 3768static int perf_event_read_group(struct perf_event *event,
3dab77fb
PZ
3769 u64 read_format, char __user *buf)
3770{
cdd6c482 3771 struct perf_event *leader = event->group_leader, *sub;
6f10581a 3772 struct perf_event_context *ctx = leader->ctx;
f63a8daa 3773 int n = 0, size = 0, ret;
59ed446f 3774 u64 count, enabled, running;
f63a8daa
PZ
3775 u64 values[5];
3776
3777 lockdep_assert_held(&ctx->mutex);
abf4868b 3778
59ed446f 3779 count = perf_event_read_value(leader, &enabled, &running);
3dab77fb
PZ
3780
3781 values[n++] = 1 + leader->nr_siblings;
59ed446f
PZ
3782 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3783 values[n++] = enabled;
3784 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3785 values[n++] = running;
abf4868b
PZ
3786 values[n++] = count;
3787 if (read_format & PERF_FORMAT_ID)
3788 values[n++] = primary_event_id(leader);
3dab77fb
PZ
3789
3790 size = n * sizeof(u64);
3791
3792 if (copy_to_user(buf, values, size))
f63a8daa 3793 return -EFAULT;
3dab77fb 3794
6f10581a 3795 ret = size;
3dab77fb 3796
65abc865 3797 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
abf4868b 3798 n = 0;
3dab77fb 3799
59ed446f 3800 values[n++] = perf_event_read_value(sub, &enabled, &running);
abf4868b
PZ
3801 if (read_format & PERF_FORMAT_ID)
3802 values[n++] = primary_event_id(sub);
3803
3804 size = n * sizeof(u64);
3805
184d3da8 3806 if (copy_to_user(buf + ret, values, size)) {
f63a8daa 3807 return -EFAULT;
6f10581a 3808 }
abf4868b
PZ
3809
3810 ret += size;
3dab77fb
PZ
3811 }
3812
abf4868b 3813 return ret;
3dab77fb
PZ
3814}
3815
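/*
 * Editor's sketch (userspace, not part of this file): how a reader parses the
 * buffer that perf_event_read_group() fills in for
 * read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID (no time fields);
 * "group_fd" is an assumed perf_event_open() group-leader fd.
 */
#if 0	/* illustrative only */
	struct { uint64_t value, id; } *entry;
	uint64_t buf[256];
	ssize_t n = read(group_fd, buf, sizeof(buf));

	if (n >= (ssize_t)sizeof(uint64_t)) {
		uint64_t nr = buf[0];		/* 1 + nr_siblings */
		entry = (void *)&buf[1];	/* leader first, then siblings */
		for (uint64_t i = 0; i < nr; i++)
			printf("id=%llu value=%llu\n",
			       (unsigned long long)entry[i].id,
			       (unsigned long long)entry[i].value);
	}
#endif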
cdd6c482 3816static int perf_event_read_one(struct perf_event *event,
3dab77fb
PZ
3817 u64 read_format, char __user *buf)
3818{
59ed446f 3819 u64 enabled, running;
3dab77fb
PZ
3820 u64 values[4];
3821 int n = 0;
3822
59ed446f
PZ
3823 values[n++] = perf_event_read_value(event, &enabled, &running);
3824 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3825 values[n++] = enabled;
3826 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3827 values[n++] = running;
3dab77fb 3828 if (read_format & PERF_FORMAT_ID)
cdd6c482 3829 values[n++] = primary_event_id(event);
3dab77fb
PZ
3830
3831 if (copy_to_user(buf, values, n * sizeof(u64)))
3832 return -EFAULT;
3833
3834 return n * sizeof(u64);
3835}
3836
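/*
 * Editor's sketch (userspace, not part of this file): the layout produced by
 * perf_event_read_one() above when both TOTAL_TIME flags are requested;
 * "fd" is an assumed perf_event_open() fd.
 */
#if 0	/* illustrative only */
	struct {
		uint64_t value;		/* the count */
		uint64_t time_enabled;	/* PERF_FORMAT_TOTAL_TIME_ENABLED */
		uint64_t time_running;	/* PERF_FORMAT_TOTAL_TIME_RUNNING */
	} rv;

	if (read(fd, &rv, sizeof(rv)) == sizeof(rv) && rv.time_running)
		/* scale for multiplexing, the way "perf stat" reports it */
		printf("%.0f\n",
		       rv.value * (double)rv.time_enabled / rv.time_running);
#endif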
dc633982
JO
3837static bool is_event_hup(struct perf_event *event)
3838{
3839 bool no_children;
3840
3841 if (event->state != PERF_EVENT_STATE_EXIT)
3842 return false;
3843
3844 mutex_lock(&event->child_mutex);
3845 no_children = list_empty(&event->child_list);
3846 mutex_unlock(&event->child_mutex);
3847 return no_children;
3848}
3849
0793a61d 3850/*
cdd6c482 3851 * Read the performance event - simple non blocking version for now
0793a61d
TG
3852 */
3853static ssize_t
cdd6c482 3854perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
0793a61d 3855{
cdd6c482 3856 u64 read_format = event->attr.read_format;
3dab77fb 3857 int ret;
0793a61d 3858
3b6f9e5c 3859 /*
cdd6c482 3860 * Return end-of-file for a read on an event that is in
3b6f9e5c
PM
3861 * error state (i.e. because it was pinned but it couldn't be
3862 * scheduled on to the CPU at some point).
3863 */
cdd6c482 3864 if (event->state == PERF_EVENT_STATE_ERROR)
3b6f9e5c
PM
3865 return 0;
3866
c320c7b7 3867 if (count < event->read_size)
3dab77fb
PZ
3868 return -ENOSPC;
3869
cdd6c482 3870 WARN_ON_ONCE(event->ctx->parent_ctx);
3dab77fb 3871 if (read_format & PERF_FORMAT_GROUP)
cdd6c482 3872 ret = perf_event_read_group(event, read_format, buf);
3dab77fb 3873 else
cdd6c482 3874 ret = perf_event_read_one(event, read_format, buf);
0793a61d 3875
3dab77fb 3876 return ret;
0793a61d
TG
3877}
3878
0793a61d
TG
3879static ssize_t
3880perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3881{
cdd6c482 3882 struct perf_event *event = file->private_data;
f63a8daa
PZ
3883 struct perf_event_context *ctx;
3884 int ret;
0793a61d 3885
f63a8daa
PZ
3886 ctx = perf_event_ctx_lock(event);
3887 ret = perf_read_hw(event, buf, count);
3888 perf_event_ctx_unlock(event, ctx);
3889
3890 return ret;
0793a61d
TG
3891}
3892
3893static unsigned int perf_poll(struct file *file, poll_table *wait)
3894{
cdd6c482 3895 struct perf_event *event = file->private_data;
76369139 3896 struct ring_buffer *rb;
61b67684 3897 unsigned int events = POLLHUP;
c7138f37 3898
e708d7ad 3899 poll_wait(file, &event->waitq, wait);
179033b3 3900
dc633982 3901 if (is_event_hup(event))
179033b3 3902 return events;
c7138f37 3903
10c6db11 3904 /*
9bb5d40c
PZ
3905 * Pin the event->rb by taking event->mmap_mutex; otherwise
3906 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
10c6db11
PZ
3907 */
3908 mutex_lock(&event->mmap_mutex);
9bb5d40c
PZ
3909 rb = event->rb;
3910 if (rb)
76369139 3911 events = atomic_xchg(&rb->poll, 0);
10c6db11 3912 mutex_unlock(&event->mmap_mutex);
0793a61d
TG
3913 return events;
3914}
3915
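/*
 * Editor's sketch (userspace, not part of this file): what the POLLHUP
 * (exited event) and ring-buffer POLLIN bits returned above look like to a
 * profiler's event loop; "perf_fd" and "drain_ring_buffer" are hypothetical.
 */
#if 0	/* illustrative only */
	struct pollfd pfd = { .fd = perf_fd, .events = POLLIN };

	while (poll(&pfd, 1, -1) >= 0) {
		if (pfd.revents & POLLIN)
			drain_ring_buffer();	/* consume data pages */
		if (pfd.revents & POLLHUP)
			break;			/* monitored task exited */
	}
#endif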
f63a8daa 3916static void _perf_event_reset(struct perf_event *event)
6de6a7b9 3917{
cdd6c482 3918 (void)perf_event_read(event);
e7850595 3919 local64_set(&event->count, 0);
cdd6c482 3920 perf_event_update_userpage(event);
3df5edad
PZ
3921}
3922
c93f7669 3923/*
cdd6c482
IM
3924 * Holding the top-level event's child_mutex means that any
3925 * descendant process that has inherited this event will block
3926 * in sync_child_event if it goes to exit, thus satisfying the
3927 * task existence requirements of perf_event_enable/disable.
c93f7669 3928 */
cdd6c482
IM
3929static void perf_event_for_each_child(struct perf_event *event,
3930 void (*func)(struct perf_event *))
3df5edad 3931{
cdd6c482 3932 struct perf_event *child;
3df5edad 3933
cdd6c482 3934 WARN_ON_ONCE(event->ctx->parent_ctx);
f63a8daa 3935
cdd6c482
IM
3936 mutex_lock(&event->child_mutex);
3937 func(event);
3938 list_for_each_entry(child, &event->child_list, child_list)
3df5edad 3939 func(child);
cdd6c482 3940 mutex_unlock(&event->child_mutex);
3df5edad
PZ
3941}
3942
cdd6c482
IM
3943static void perf_event_for_each(struct perf_event *event,
3944 void (*func)(struct perf_event *))
3df5edad 3945{
cdd6c482
IM
3946 struct perf_event_context *ctx = event->ctx;
3947 struct perf_event *sibling;
3df5edad 3948
f63a8daa
PZ
3949 lockdep_assert_held(&ctx->mutex);
3950
cdd6c482 3951 event = event->group_leader;
75f937f2 3952
cdd6c482 3953 perf_event_for_each_child(event, func);
cdd6c482 3954 list_for_each_entry(sibling, &event->sibling_list, group_entry)
724b6daa 3955 perf_event_for_each_child(sibling, func);
6de6a7b9
PZ
3956}
3957
cdd6c482 3958static int perf_event_period(struct perf_event *event, u64 __user *arg)
08247e31 3959{
cdd6c482 3960 struct perf_event_context *ctx = event->ctx;
bad7192b 3961 int ret = 0, active;
08247e31
PZ
3962 u64 value;
3963
6c7e550f 3964 if (!is_sampling_event(event))
08247e31
PZ
3965 return -EINVAL;
3966
ad0cf347 3967 if (copy_from_user(&value, arg, sizeof(value)))
08247e31
PZ
3968 return -EFAULT;
3969
3970 if (!value)
3971 return -EINVAL;
3972
e625cce1 3973 raw_spin_lock_irq(&ctx->lock);
cdd6c482
IM
3974 if (event->attr.freq) {
3975 if (value > sysctl_perf_event_sample_rate) {
08247e31
PZ
3976 ret = -EINVAL;
3977 goto unlock;
3978 }
3979
cdd6c482 3980 event->attr.sample_freq = value;
08247e31 3981 } else {
cdd6c482
IM
3982 event->attr.sample_period = value;
3983 event->hw.sample_period = value;
08247e31 3984 }
bad7192b
PZ
3985
3986 active = (event->state == PERF_EVENT_STATE_ACTIVE);
3987 if (active) {
3988 perf_pmu_disable(ctx->pmu);
3989 event->pmu->stop(event, PERF_EF_UPDATE);
3990 }
3991
3992 local64_set(&event->hw.period_left, 0);
3993
3994 if (active) {
3995 event->pmu->start(event, PERF_EF_RELOAD);
3996 perf_pmu_enable(ctx->pmu);
3997 }
3998
08247e31 3999unlock:
e625cce1 4000 raw_spin_unlock_irq(&ctx->lock);
08247e31
PZ
4001
4002 return ret;
4003}
4004
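/*
 * Editor's sketch (userspace, not part of this file): the ioctl handled by
 * perf_event_period() above; "fd" is an assumed perf_event_open() fd for a
 * sampling event (attr.freq == 0, attr.sample_period != 0).
 */
#if 0	/* illustrative only */
	uint64_t new_period = 100000;

	if (ioctl(fd, PERF_EVENT_IOC_PERIOD, &new_period))
		perror("PERF_EVENT_IOC_PERIOD");
	/* the kernel stops the event, zeroes period_left and restarts it */
#endif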
ac9721f3
PZ
4005static const struct file_operations perf_fops;
4006
2903ff01 4007static inline int perf_fget_light(int fd, struct fd *p)
ac9721f3 4008{
2903ff01
AV
4009 struct fd f = fdget(fd);
4010 if (!f.file)
4011 return -EBADF;
ac9721f3 4012
2903ff01
AV
4013 if (f.file->f_op != &perf_fops) {
4014 fdput(f);
4015 return -EBADF;
ac9721f3 4016 }
2903ff01
AV
4017 *p = f;
4018 return 0;
ac9721f3
PZ
4019}
4020
4021static int perf_event_set_output(struct perf_event *event,
4022 struct perf_event *output_event);
6fb2915d 4023static int perf_event_set_filter(struct perf_event *event, void __user *arg);
2541517c 4024static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
a4be7c27 4025
f63a8daa 4026static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
d859e29f 4027{
cdd6c482 4028 void (*func)(struct perf_event *);
3df5edad 4029 u32 flags = arg;
d859e29f
PM
4030
4031 switch (cmd) {
cdd6c482 4032 case PERF_EVENT_IOC_ENABLE:
f63a8daa 4033 func = _perf_event_enable;
d859e29f 4034 break;
cdd6c482 4035 case PERF_EVENT_IOC_DISABLE:
f63a8daa 4036 func = _perf_event_disable;
79f14641 4037 break;
cdd6c482 4038 case PERF_EVENT_IOC_RESET:
f63a8daa 4039 func = _perf_event_reset;
6de6a7b9 4040 break;
3df5edad 4041
cdd6c482 4042 case PERF_EVENT_IOC_REFRESH:
f63a8daa 4043 return _perf_event_refresh(event, arg);
08247e31 4044
cdd6c482
IM
4045 case PERF_EVENT_IOC_PERIOD:
4046 return perf_event_period(event, (u64 __user *)arg);
08247e31 4047
cf4957f1
JO
4048 case PERF_EVENT_IOC_ID:
4049 {
4050 u64 id = primary_event_id(event);
4051
4052 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
4053 return -EFAULT;
4054 return 0;
4055 }
4056
cdd6c482 4057 case PERF_EVENT_IOC_SET_OUTPUT:
ac9721f3 4058 {
ac9721f3 4059 int ret;
ac9721f3 4060 if (arg != -1) {
2903ff01
AV
4061 struct perf_event *output_event;
4062 struct fd output;
4063 ret = perf_fget_light(arg, &output);
4064 if (ret)
4065 return ret;
4066 output_event = output.file->private_data;
4067 ret = perf_event_set_output(event, output_event);
4068 fdput(output);
4069 } else {
4070 ret = perf_event_set_output(event, NULL);
ac9721f3 4071 }
ac9721f3
PZ
4072 return ret;
4073 }
a4be7c27 4074
6fb2915d
LZ
4075 case PERF_EVENT_IOC_SET_FILTER:
4076 return perf_event_set_filter(event, (void __user *)arg);
4077
2541517c
AS
4078 case PERF_EVENT_IOC_SET_BPF:
4079 return perf_event_set_bpf_prog(event, arg);
4080
d859e29f 4081 default:
3df5edad 4082 return -ENOTTY;
d859e29f 4083 }
3df5edad
PZ
4084
4085 if (flags & PERF_IOC_FLAG_GROUP)
cdd6c482 4086 perf_event_for_each(event, func);
3df5edad 4087 else
cdd6c482 4088 perf_event_for_each_child(event, func);
3df5edad
PZ
4089
4090 return 0;
d859e29f
PM
4091}
4092
f63a8daa
PZ
4093static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
4094{
4095 struct perf_event *event = file->private_data;
4096 struct perf_event_context *ctx;
4097 long ret;
4098
4099 ctx = perf_event_ctx_lock(event);
4100 ret = _perf_ioctl(event, cmd, arg);
4101 perf_event_ctx_unlock(event, ctx);
4102
4103 return ret;
4104}
4105
b3f20785
PM
4106#ifdef CONFIG_COMPAT
4107static long perf_compat_ioctl(struct file *file, unsigned int cmd,
4108 unsigned long arg)
4109{
4110 switch (_IOC_NR(cmd)) {
4111 case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
4112 case _IOC_NR(PERF_EVENT_IOC_ID):
4113 /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case) */
4114 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
4115 cmd &= ~IOCSIZE_MASK;
4116 cmd |= sizeof(void *) << IOCSIZE_SHIFT;
4117 }
4118 break;
4119 }
4120 return perf_ioctl(file, cmd, arg);
4121}
4122#else
4123# define perf_compat_ioctl NULL
4124#endif
4125
cdd6c482 4126int perf_event_task_enable(void)
771d7cde 4127{
f63a8daa 4128 struct perf_event_context *ctx;
cdd6c482 4129 struct perf_event *event;
771d7cde 4130
cdd6c482 4131 mutex_lock(&current->perf_event_mutex);
f63a8daa
PZ
4132 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4133 ctx = perf_event_ctx_lock(event);
4134 perf_event_for_each_child(event, _perf_event_enable);
4135 perf_event_ctx_unlock(event, ctx);
4136 }
cdd6c482 4137 mutex_unlock(&current->perf_event_mutex);
771d7cde
PZ
4138
4139 return 0;
4140}
4141
cdd6c482 4142int perf_event_task_disable(void)
771d7cde 4143{
f63a8daa 4144 struct perf_event_context *ctx;
cdd6c482 4145 struct perf_event *event;
771d7cde 4146
cdd6c482 4147 mutex_lock(&current->perf_event_mutex);
f63a8daa
PZ
4148 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4149 ctx = perf_event_ctx_lock(event);
4150 perf_event_for_each_child(event, _perf_event_disable);
4151 perf_event_ctx_unlock(event, ctx);
4152 }
cdd6c482 4153 mutex_unlock(&current->perf_event_mutex);
771d7cde
PZ
4154
4155 return 0;
4156}
4157
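/*
 * Editor's sketch (userspace, not part of this file): perf_event_task_enable()
 * and perf_event_task_disable() are normally reached via prctl(); this assumes
 * the PR_TASK_PERF_EVENTS_{DISABLE,ENABLE} wiring in kernel/sys.c, and
 * "run_uninstrumented_section" is a hypothetical helper.
 */
#if 0	/* illustrative only */
	prctl(PR_TASK_PERF_EVENTS_DISABLE);	/* pause every event we own */
	run_uninstrumented_section();
	prctl(PR_TASK_PERF_EVENTS_ENABLE);	/* resume them */
#endif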
cdd6c482 4158static int perf_event_index(struct perf_event *event)
194002b2 4159{
a4eaf7f1
PZ
4160 if (event->hw.state & PERF_HES_STOPPED)
4161 return 0;
4162
cdd6c482 4163 if (event->state != PERF_EVENT_STATE_ACTIVE)
194002b2
PZ
4164 return 0;
4165
35edc2a5 4166 return event->pmu->event_idx(event);
194002b2
PZ
4167}
4168
c4794295 4169static void calc_timer_values(struct perf_event *event,
e3f3541c 4170 u64 *now,
7f310a5d
EM
4171 u64 *enabled,
4172 u64 *running)
c4794295 4173{
e3f3541c 4174 u64 ctx_time;
c4794295 4175
e3f3541c
PZ
4176 *now = perf_clock();
4177 ctx_time = event->shadow_ctx_time + *now;
c4794295
EM
4178 *enabled = ctx_time - event->tstamp_enabled;
4179 *running = ctx_time - event->tstamp_running;
4180}
4181
fa731587
PZ
4182static void perf_event_init_userpage(struct perf_event *event)
4183{
4184 struct perf_event_mmap_page *userpg;
4185 struct ring_buffer *rb;
4186
4187 rcu_read_lock();
4188 rb = rcu_dereference(event->rb);
4189 if (!rb)
4190 goto unlock;
4191
4192 userpg = rb->user_page;
4193
4194 /* Allow new userspace to detect that bit 0 is deprecated */
4195 userpg->cap_bit0_is_deprecated = 1;
4196 userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
e8c6deac
AS
4197 userpg->data_offset = PAGE_SIZE;
4198 userpg->data_size = perf_data_size(rb);
fa731587
PZ
4199
4200unlock:
4201 rcu_read_unlock();
4202}
4203
c1317ec2
AL
4204void __weak arch_perf_update_userpage(
4205 struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
e3f3541c
PZ
4206{
4207}
4208
38ff667b
PZ
4209/*
4210 * Callers need to ensure there can be no nesting of this function, otherwise
4211 * the seqlock logic goes bad. We cannot serialize this because the arch
4212 * code calls this from NMI context.
4213 */
cdd6c482 4214void perf_event_update_userpage(struct perf_event *event)
37d81828 4215{
cdd6c482 4216 struct perf_event_mmap_page *userpg;
76369139 4217 struct ring_buffer *rb;
e3f3541c 4218 u64 enabled, running, now;
38ff667b
PZ
4219
4220 rcu_read_lock();
5ec4c599
PZ
4221 rb = rcu_dereference(event->rb);
4222 if (!rb)
4223 goto unlock;
4224
0d641208
EM
4225 /*
4226 * compute total_time_enabled, total_time_running
4227 * based on snapshot values taken when the event
4228 * was last scheduled in.
4229 *
4230 * we cannot simply call update_context_time()
4231 * because of a locking issue, as we can be called in
4232 * NMI context
4233 */
e3f3541c 4234 calc_timer_values(event, &now, &enabled, &running);
38ff667b 4235
76369139 4236 userpg = rb->user_page;
7b732a75
PZ
4237 /*
4238 * Disable preemption so as to not let the corresponding user-space
4239 * spin too long if we get preempted.
4240 */
4241 preempt_disable();
37d81828 4242 ++userpg->lock;
92f22a38 4243 barrier();
cdd6c482 4244 userpg->index = perf_event_index(event);
b5e58793 4245 userpg->offset = perf_event_count(event);
365a4038 4246 if (userpg->index)
e7850595 4247 userpg->offset -= local64_read(&event->hw.prev_count);
7b732a75 4248
0d641208 4249 userpg->time_enabled = enabled +
cdd6c482 4250 atomic64_read(&event->child_total_time_enabled);
7f8b4e4e 4251
0d641208 4252 userpg->time_running = running +
cdd6c482 4253 atomic64_read(&event->child_total_time_running);
7f8b4e4e 4254
c1317ec2 4255 arch_perf_update_userpage(event, userpg, now);
e3f3541c 4256
92f22a38 4257 barrier();
37d81828 4258 ++userpg->lock;
7b732a75 4259 preempt_enable();
38ff667b 4260unlock:
7b732a75 4261 rcu_read_unlock();
37d81828
PM
4262}
4263
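/*
 * Editor's sketch (userspace, not part of this file): the ->lock increments
 * and barriers above form a seqcount; a reader of the mmap()ed control page
 * retries until it sees an even, unchanged sequence. "pc" is an assumed
 * pointer to the mapped struct perf_event_mmap_page.
 */
#if 0	/* illustrative only */
	uint64_t enabled, running;
	int64_t offset;
	uint32_t seq;

	do {
		seq = pc->lock;
		__sync_synchronize();		/* pairs with barrier() above */
		offset  = pc->offset;
		enabled = pc->time_enabled;
		running = pc->time_running;
		__sync_synchronize();
	} while (pc->lock != seq || (seq & 1));
#endif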
906010b2
PZ
4264static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
4265{
4266 struct perf_event *event = vma->vm_file->private_data;
76369139 4267 struct ring_buffer *rb;
906010b2
PZ
4268 int ret = VM_FAULT_SIGBUS;
4269
4270 if (vmf->flags & FAULT_FLAG_MKWRITE) {
4271 if (vmf->pgoff == 0)
4272 ret = 0;
4273 return ret;
4274 }
4275
4276 rcu_read_lock();
76369139
FW
4277 rb = rcu_dereference(event->rb);
4278 if (!rb)
906010b2
PZ
4279 goto unlock;
4280
4281 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
4282 goto unlock;
4283
76369139 4284 vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
906010b2
PZ
4285 if (!vmf->page)
4286 goto unlock;
4287
4288 get_page(vmf->page);
4289 vmf->page->mapping = vma->vm_file->f_mapping;
4290 vmf->page->index = vmf->pgoff;
4291
4292 ret = 0;
4293unlock:
4294 rcu_read_unlock();
4295
4296 return ret;
4297}
4298
10c6db11
PZ
4299static void ring_buffer_attach(struct perf_event *event,
4300 struct ring_buffer *rb)
4301{
b69cf536 4302 struct ring_buffer *old_rb = NULL;
10c6db11
PZ
4303 unsigned long flags;
4304
b69cf536
PZ
4305 if (event->rb) {
4306 /*
4307 * Should be impossible, we set this when removing
4308 * event->rb_entry and wait/clear when adding event->rb_entry.
4309 */
4310 WARN_ON_ONCE(event->rcu_pending);
10c6db11 4311
b69cf536
PZ
4312 old_rb = event->rb;
4313 event->rcu_batches = get_state_synchronize_rcu();
4314 event->rcu_pending = 1;
10c6db11 4315
b69cf536
PZ
4316 spin_lock_irqsave(&old_rb->event_lock, flags);
4317 list_del_rcu(&event->rb_entry);
4318 spin_unlock_irqrestore(&old_rb->event_lock, flags);
4319 }
10c6db11 4320
b69cf536
PZ
4321 if (event->rcu_pending && rb) {
4322 cond_synchronize_rcu(event->rcu_batches);
4323 event->rcu_pending = 0;
4324 }
10c6db11 4325
b69cf536
PZ
4326 if (rb) {
4327 spin_lock_irqsave(&rb->event_lock, flags);
4328 list_add_rcu(&event->rb_entry, &rb->event_list);
4329 spin_unlock_irqrestore(&rb->event_lock, flags);
4330 }
4331
4332 rcu_assign_pointer(event->rb, rb);
4333
4334 if (old_rb) {
4335 ring_buffer_put(old_rb);
4336 /*
4337 * Since we detached the old rb before attaching the new one,
4338 * we could have missed a wakeup. Provide it now.
4339 * Provide it now.
4340 */
4341 wake_up_all(&event->waitq);
4342 }
10c6db11
PZ
4343}
4344
4345static void ring_buffer_wakeup(struct perf_event *event)
4346{
4347 struct ring_buffer *rb;
4348
4349 rcu_read_lock();
4350 rb = rcu_dereference(event->rb);
9bb5d40c
PZ
4351 if (rb) {
4352 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
4353 wake_up_all(&event->waitq);
4354 }
10c6db11
PZ
4355 rcu_read_unlock();
4356}
4357
76369139 4358static void rb_free_rcu(struct rcu_head *rcu_head)
906010b2 4359{
76369139 4360 struct ring_buffer *rb;
906010b2 4361
76369139
FW
4362 rb = container_of(rcu_head, struct ring_buffer, rcu_head);
4363 rb_free(rb);
7b732a75
PZ
4364}
4365
fdc26706 4366struct ring_buffer *ring_buffer_get(struct perf_event *event)
7b732a75 4367{
76369139 4368 struct ring_buffer *rb;
7b732a75 4369
ac9721f3 4370 rcu_read_lock();
76369139
FW
4371 rb = rcu_dereference(event->rb);
4372 if (rb) {
4373 if (!atomic_inc_not_zero(&rb->refcount))
4374 rb = NULL;
ac9721f3
PZ
4375 }
4376 rcu_read_unlock();
4377
76369139 4378 return rb;
ac9721f3
PZ
4379}
4380
fdc26706 4381void ring_buffer_put(struct ring_buffer *rb)
ac9721f3 4382{
76369139 4383 if (!atomic_dec_and_test(&rb->refcount))
ac9721f3 4384 return;
7b732a75 4385
9bb5d40c 4386 WARN_ON_ONCE(!list_empty(&rb->event_list));
10c6db11 4387
76369139 4388 call_rcu(&rb->rcu_head, rb_free_rcu);
7b732a75
PZ
4389}
4390
4391static void perf_mmap_open(struct vm_area_struct *vma)
4392{
cdd6c482 4393 struct perf_event *event = vma->vm_file->private_data;
7b732a75 4394
cdd6c482 4395 atomic_inc(&event->mmap_count);
9bb5d40c 4396 atomic_inc(&event->rb->mmap_count);
1e0fb9ec 4397
45bfb2e5
PZ
4398 if (vma->vm_pgoff)
4399 atomic_inc(&event->rb->aux_mmap_count);
4400
1e0fb9ec
AL
4401 if (event->pmu->event_mapped)
4402 event->pmu->event_mapped(event);
7b732a75
PZ
4403}
4404
9bb5d40c
PZ
4405/*
4406 * A buffer can be mmap()ed multiple times; either directly through the same
4407 * event, or through other events by use of perf_event_set_output().
4408 *
4409 * In order to undo the VM accounting done by perf_mmap() we need to destroy
4410 * the buffer here, where we still have a VM context. This means we need
4411 * to detach all events redirecting to us.
4412 */
7b732a75
PZ
4413static void perf_mmap_close(struct vm_area_struct *vma)
4414{
cdd6c482 4415 struct perf_event *event = vma->vm_file->private_data;
7b732a75 4416
b69cf536 4417 struct ring_buffer *rb = ring_buffer_get(event);
9bb5d40c
PZ
4418 struct user_struct *mmap_user = rb->mmap_user;
4419 int mmap_locked = rb->mmap_locked;
4420 unsigned long size = perf_data_size(rb);
789f90fc 4421
1e0fb9ec
AL
4422 if (event->pmu->event_unmapped)
4423 event->pmu->event_unmapped(event);
4424
45bfb2e5
PZ
4425 /*
4426 * rb->aux_mmap_count will always drop before rb->mmap_count and
4427 * event->mmap_count, so it is ok to use event->mmap_mutex to
4428 * serialize with perf_mmap here.
4429 */
4430 if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
4431 atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
4432 atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
4433 vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
4434
4435 rb_free_aux(rb);
4436 mutex_unlock(&event->mmap_mutex);
4437 }
4438
9bb5d40c
PZ
4439 atomic_dec(&rb->mmap_count);
4440
4441 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
b69cf536 4442 goto out_put;
9bb5d40c 4443
b69cf536 4444 ring_buffer_attach(event, NULL);
9bb5d40c
PZ
4445 mutex_unlock(&event->mmap_mutex);
4446
4447 /* If there's still other mmap()s of this buffer, we're done. */
b69cf536
PZ
4448 if (atomic_read(&rb->mmap_count))
4449 goto out_put;
ac9721f3 4450
9bb5d40c
PZ
4451 /*
4452 * No other mmap()s, detach from all other events that might redirect
4453 * into the now unreachable buffer. Somewhat complicated by the
4454 * fact that rb::event_lock otherwise nests inside mmap_mutex.
4455 */
4456again:
4457 rcu_read_lock();
4458 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
4459 if (!atomic_long_inc_not_zero(&event->refcount)) {
4460 /*
4461 * This event is en-route to free_event() which will
4462 * detach it and remove it from the list.
4463 */
4464 continue;
4465 }
4466 rcu_read_unlock();
789f90fc 4467
9bb5d40c
PZ
4468 mutex_lock(&event->mmap_mutex);
4469 /*
4470 * Check we didn't race with perf_event_set_output() which can
4471 * swizzle the rb from under us while we were waiting to
4472 * acquire mmap_mutex.
4473 *
4474 * If we find a different rb, ignore this event; a later
4475 * iteration will no longer find it on the list. We have to
4476 * still restart the iteration to make sure we're not now
4477 * iterating the wrong list.
4478 */
b69cf536
PZ
4479 if (event->rb == rb)
4480 ring_buffer_attach(event, NULL);
4481
cdd6c482 4482 mutex_unlock(&event->mmap_mutex);
9bb5d40c 4483 put_event(event);
ac9721f3 4484
9bb5d40c
PZ
4485 /*
4486 * Restart the iteration; either we're on the wrong list or
4487 * destroyed its integrity by doing a deletion.
4488 */
4489 goto again;
7b732a75 4490 }
9bb5d40c
PZ
4491 rcu_read_unlock();
4492
4493 /*
4494 * It could be there are still a few 0-ref events on the list; they'll
4495 * get cleaned up by free_event() -- they'll also still have their
4496 * ref on the rb and will free it whenever they are done with it.
4497 *
4498 * Aside from that, this buffer is 'fully' detached and unmapped,
4499 * undo the VM accounting.
4500 */
4501
4502 atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
4503 vma->vm_mm->pinned_vm -= mmap_locked;
4504 free_uid(mmap_user);
4505
b69cf536 4506out_put:
9bb5d40c 4507 ring_buffer_put(rb); /* could be last */
37d81828
PM
4508}
4509
f0f37e2f 4510static const struct vm_operations_struct perf_mmap_vmops = {
43a21ea8 4511 .open = perf_mmap_open,
45bfb2e5 4512 .close = perf_mmap_close, /* non mergable */
43a21ea8
PZ
4513 .fault = perf_mmap_fault,
4514 .page_mkwrite = perf_mmap_fault,
37d81828
PM
4515};
4516
4517static int perf_mmap(struct file *file, struct vm_area_struct *vma)
4518{
cdd6c482 4519 struct perf_event *event = file->private_data;
22a4f650 4520 unsigned long user_locked, user_lock_limit;
789f90fc 4521 struct user_struct *user = current_user();
22a4f650 4522 unsigned long locked, lock_limit;
45bfb2e5 4523 struct ring_buffer *rb = NULL;
7b732a75
PZ
4524 unsigned long vma_size;
4525 unsigned long nr_pages;
45bfb2e5 4526 long user_extra = 0, extra = 0;
d57e34fd 4527 int ret = 0, flags = 0;
37d81828 4528
c7920614
PZ
4529 /*
4530 * Don't allow mmap() of inherited per-task counters. This would
4531 * create a performance issue due to all children writing to the
76369139 4532 * same rb.
c7920614
PZ
4533 */
4534 if (event->cpu == -1 && event->attr.inherit)
4535 return -EINVAL;
4536
43a21ea8 4537 if (!(vma->vm_flags & VM_SHARED))
37d81828 4538 return -EINVAL;
7b732a75
PZ
4539
4540 vma_size = vma->vm_end - vma->vm_start;
45bfb2e5
PZ
4541
4542 if (vma->vm_pgoff == 0) {
4543 nr_pages = (vma_size / PAGE_SIZE) - 1;
4544 } else {
4545 /*
4546 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
4547 * mapped; all subsequent mappings must have the same size
4548 * and offset, and must be above the normal perf buffer.
4549 */
4550 u64 aux_offset, aux_size;
4551
4552 if (!event->rb)
4553 return -EINVAL;
4554
4555 nr_pages = vma_size / PAGE_SIZE;
4556
4557 mutex_lock(&event->mmap_mutex);
4558 ret = -EINVAL;
4559
4560 rb = event->rb;
4561 if (!rb)
4562 goto aux_unlock;
4563
4564 aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
4565 aux_size = ACCESS_ONCE(rb->user_page->aux_size);
4566
4567 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
4568 goto aux_unlock;
4569
4570 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
4571 goto aux_unlock;
4572
4573 /* already mapped with a different offset */
4574 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
4575 goto aux_unlock;
4576
4577 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
4578 goto aux_unlock;
4579
4580 /* already mapped with a different size */
4581 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
4582 goto aux_unlock;
4583
4584 if (!is_power_of_2(nr_pages))
4585 goto aux_unlock;
4586
4587 if (!atomic_inc_not_zero(&rb->mmap_count))
4588 goto aux_unlock;
4589
4590 if (rb_has_aux(rb)) {
4591 atomic_inc(&rb->aux_mmap_count);
4592 ret = 0;
4593 goto unlock;
4594 }
4595
4596 atomic_set(&rb->aux_mmap_count, 1);
4597 user_extra = nr_pages;
4598
4599 goto accounting;
4600 }
7b732a75 4601
7730d865 4602 /*
76369139 4603 * If we have rb pages ensure they're a power-of-two number, so we
7730d865
PZ
4604 * can do bitmasks instead of modulo.
4605 */
2ed11312 4606 if (nr_pages != 0 && !is_power_of_2(nr_pages))
37d81828
PM
4607 return -EINVAL;
4608
7b732a75 4609 if (vma_size != PAGE_SIZE * (1 + nr_pages))
37d81828
PM
4610 return -EINVAL;
4611
cdd6c482 4612 WARN_ON_ONCE(event->ctx->parent_ctx);
9bb5d40c 4613again:
cdd6c482 4614 mutex_lock(&event->mmap_mutex);
76369139 4615 if (event->rb) {
9bb5d40c 4616 if (event->rb->nr_pages != nr_pages) {
ebb3c4c4 4617 ret = -EINVAL;
9bb5d40c
PZ
4618 goto unlock;
4619 }
4620
4621 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
4622 /*
4623 * Raced against perf_mmap_close() through
4624 * perf_event_set_output(). Try again, hope for better
4625 * luck.
4626 */
4627 mutex_unlock(&event->mmap_mutex);
4628 goto again;
4629 }
4630
ebb3c4c4
PZ
4631 goto unlock;
4632 }
4633
789f90fc 4634 user_extra = nr_pages + 1;
45bfb2e5
PZ
4635
4636accounting:
cdd6c482 4637 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
a3862d3f
IM
4638
4639 /*
4640 * Increase the limit linearly with more CPUs:
4641 */
4642 user_lock_limit *= num_online_cpus();
4643
789f90fc 4644 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
c5078f78 4645
789f90fc
PZ
4646 if (user_locked > user_lock_limit)
4647 extra = user_locked - user_lock_limit;
7b732a75 4648
78d7d407 4649 lock_limit = rlimit(RLIMIT_MEMLOCK);
7b732a75 4650 lock_limit >>= PAGE_SHIFT;
bc3e53f6 4651 locked = vma->vm_mm->pinned_vm + extra;
7b732a75 4652
459ec28a
IM
4653 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
4654 !capable(CAP_IPC_LOCK)) {
ebb3c4c4
PZ
4655 ret = -EPERM;
4656 goto unlock;
4657 }
7b732a75 4658
45bfb2e5 4659 WARN_ON(!rb && event->rb);
906010b2 4660
d57e34fd 4661 if (vma->vm_flags & VM_WRITE)
76369139 4662 flags |= RING_BUFFER_WRITABLE;
d57e34fd 4663
76369139 4664 if (!rb) {
45bfb2e5
PZ
4665 rb = rb_alloc(nr_pages,
4666 event->attr.watermark ? event->attr.wakeup_watermark : 0,
4667 event->cpu, flags);
26cb63ad 4668
45bfb2e5
PZ
4669 if (!rb) {
4670 ret = -ENOMEM;
4671 goto unlock;
4672 }
43a21ea8 4673
45bfb2e5
PZ
4674 atomic_set(&rb->mmap_count, 1);
4675 rb->mmap_user = get_current_user();
4676 rb->mmap_locked = extra;
26cb63ad 4677
45bfb2e5 4678 ring_buffer_attach(event, rb);
ac9721f3 4679
45bfb2e5
PZ
4680 perf_event_init_userpage(event);
4681 perf_event_update_userpage(event);
4682 } else {
1a594131
AS
4683 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
4684 event->attr.aux_watermark, flags);
45bfb2e5
PZ
4685 if (!ret)
4686 rb->aux_mmap_locked = extra;
4687 }
9a0f05cb 4688
ebb3c4c4 4689unlock:
45bfb2e5
PZ
4690 if (!ret) {
4691 atomic_long_add(user_extra, &user->locked_vm);
4692 vma->vm_mm->pinned_vm += extra;
4693
ac9721f3 4694 atomic_inc(&event->mmap_count);
45bfb2e5
PZ
4695 } else if (rb) {
4696 atomic_dec(&rb->mmap_count);
4697 }
4698aux_unlock:
cdd6c482 4699 mutex_unlock(&event->mmap_mutex);
37d81828 4700
9bb5d40c
PZ
4701 /*
4702 * Since pinned accounting is per vm we cannot allow fork() to copy our
4703 * vma.
4704 */
26cb63ad 4705 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
37d81828 4706 vma->vm_ops = &perf_mmap_vmops;
7b732a75 4707
1e0fb9ec
AL
4708 if (event->pmu->event_mapped)
4709 event->pmu->event_mapped(event);
4710
7b732a75 4711 return ret;
37d81828
PM
4712}
4713
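/*
 * Editor's sketch (userspace, not part of this file): the size checks in
 * perf_mmap() above require one control page plus a power-of-two number of
 * data pages, and an optional AUX mapping whose offset/size match what was
 * written into the control page. "fd" is an assumed perf_event_open() fd;
 * whether an AUX area exists at all depends on the PMU (e.g. intel_pt).
 */
#if 0	/* illustrative only */
	size_t page = sysconf(_SC_PAGESIZE);
	size_t data_pages = 64;				/* power of two */
	struct perf_event_mmap_page *pc;

	pc = mmap(NULL, (1 + data_pages) * page, PROT_READ | PROT_WRITE,
		  MAP_SHARED, fd, 0);

	/* optional AUX area: advertise it first, then map at that offset */
	pc->aux_offset = (1 + data_pages) * page;
	pc->aux_size   = 128 * page;			/* power of two */
	void *aux = mmap(NULL, pc->aux_size, PROT_READ | PROT_WRITE,
			 MAP_SHARED, fd, pc->aux_offset);
#endif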
3c446b3d
PZ
4714static int perf_fasync(int fd, struct file *filp, int on)
4715{
496ad9aa 4716 struct inode *inode = file_inode(filp);
cdd6c482 4717 struct perf_event *event = filp->private_data;
3c446b3d
PZ
4718 int retval;
4719
4720 mutex_lock(&inode->i_mutex);
cdd6c482 4721 retval = fasync_helper(fd, filp, on, &event->fasync);
3c446b3d
PZ
4722 mutex_unlock(&inode->i_mutex);
4723
4724 if (retval < 0)
4725 return retval;
4726
4727 return 0;
4728}
4729
0793a61d 4730static const struct file_operations perf_fops = {
3326c1ce 4731 .llseek = no_llseek,
0793a61d
TG
4732 .release = perf_release,
4733 .read = perf_read,
4734 .poll = perf_poll,
d859e29f 4735 .unlocked_ioctl = perf_ioctl,
b3f20785 4736 .compat_ioctl = perf_compat_ioctl,
37d81828 4737 .mmap = perf_mmap,
3c446b3d 4738 .fasync = perf_fasync,
0793a61d
TG
4739};
4740
925d519a 4741/*
cdd6c482 4742 * Perf event wakeup
925d519a
PZ
4743 *
4744 * If there's data, ensure we set the poll() state and publish everything
4745 * to user-space before waking everybody up.
4746 */
4747
cdd6c482 4748void perf_event_wakeup(struct perf_event *event)
925d519a 4749{
10c6db11 4750 ring_buffer_wakeup(event);
4c9e2542 4751
cdd6c482
IM
4752 if (event->pending_kill) {
4753 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
4754 event->pending_kill = 0;
4c9e2542 4755 }
925d519a
PZ
4756}
4757
e360adbe 4758static void perf_pending_event(struct irq_work *entry)
79f14641 4759{
cdd6c482
IM
4760 struct perf_event *event = container_of(entry,
4761 struct perf_event, pending);
d525211f
PZ
4762 int rctx;
4763
4764 rctx = perf_swevent_get_recursion_context();
4765 /*
4766 * If we 'fail' here, that's OK, it means recursion is already disabled
4767 * and we won't recurse 'further'.
4768 */
79f14641 4769
cdd6c482
IM
4770 if (event->pending_disable) {
4771 event->pending_disable = 0;
4772 __perf_event_disable(event);
79f14641
PZ
4773 }
4774
cdd6c482
IM
4775 if (event->pending_wakeup) {
4776 event->pending_wakeup = 0;
4777 perf_event_wakeup(event);
79f14641 4778 }
d525211f
PZ
4779
4780 if (rctx >= 0)
4781 perf_swevent_put_recursion_context(rctx);
79f14641
PZ
4782}
4783
39447b38
ZY
4784/*
4785 * We assume there is only KVM supporting the callbacks.
4786 * Later on, we might change it to a list if there is
4787 * another virtualization implementation supporting the callbacks.
4788 */
4789struct perf_guest_info_callbacks *perf_guest_cbs;
4790
4791int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
4792{
4793 perf_guest_cbs = cbs;
4794 return 0;
4795}
4796EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
4797
4798int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
4799{
4800 perf_guest_cbs = NULL;
4801 return 0;
4802}
4803EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
4804
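/*
 * Editor's sketch (not part of this file): how a KVM-like module would hook
 * into the callbacks registered above; the three members assumed here
 * (is_in_guest, is_user_mode, get_guest_ip) are taken to match
 * struct perf_guest_info_callbacks of this era, and the hyp_* names are made up.
 */
#if 0	/* illustrative only */
static int hyp_is_in_guest(void)		{ return 0; }
static int hyp_is_user_mode(void)		{ return 0; }
static unsigned long hyp_get_guest_ip(void)	{ return 0; }

static struct perf_guest_info_callbacks hyp_guest_cbs = {
	.is_in_guest	= hyp_is_in_guest,
	.is_user_mode	= hyp_is_user_mode,
	.get_guest_ip	= hyp_get_guest_ip,
};
/* module init: perf_register_guest_info_callbacks(&hyp_guest_cbs); */
/* module exit: perf_unregister_guest_info_callbacks(&hyp_guest_cbs); */
#endif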
4018994f
JO
4805static void
4806perf_output_sample_regs(struct perf_output_handle *handle,
4807 struct pt_regs *regs, u64 mask)
4808{
4809 int bit;
4810
4811 for_each_set_bit(bit, (const unsigned long *) &mask,
4812 sizeof(mask) * BITS_PER_BYTE) {
4813 u64 val;
4814
4815 val = perf_reg_value(regs, bit);
4816 perf_output_put(handle, val);
4817 }
4818}
4819
60e2364e 4820static void perf_sample_regs_user(struct perf_regs *regs_user,
88a7c26a
AL
4821 struct pt_regs *regs,
4822 struct pt_regs *regs_user_copy)
4018994f 4823{
88a7c26a
AL
4824 if (user_mode(regs)) {
4825 regs_user->abi = perf_reg_abi(current);
2565711f 4826 regs_user->regs = regs;
88a7c26a
AL
4827 } else if (current->mm) {
4828 perf_get_regs_user(regs_user, regs, regs_user_copy);
2565711f
PZ
4829 } else {
4830 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
4831 regs_user->regs = NULL;
4018994f
JO
4832 }
4833}
4834
60e2364e
SE
4835static void perf_sample_regs_intr(struct perf_regs *regs_intr,
4836 struct pt_regs *regs)
4837{
4838 regs_intr->regs = regs;
4839 regs_intr->abi = perf_reg_abi(current);
4840}
4841
4842
c5ebcedb
JO
4843/*
4844 * Get remaining task size from user stack pointer.
4845 *
4846 * It'd be better to look up the stack VMA and limit this more
4847 * precisely, but there's no way to do that safely under interrupt,
4848 * so use TASK_SIZE as the limit.
4849 */
4850static u64 perf_ustack_task_size(struct pt_regs *regs)
4851{
4852 unsigned long addr = perf_user_stack_pointer(regs);
4853
4854 if (!addr || addr >= TASK_SIZE)
4855 return 0;
4856
4857 return TASK_SIZE - addr;
4858}
4859
4860static u16
4861perf_sample_ustack_size(u16 stack_size, u16 header_size,
4862 struct pt_regs *regs)
4863{
4864 u64 task_size;
4865
4866 /* No regs, no stack pointer, no dump. */
4867 if (!regs)
4868 return 0;
4869
4870 /*
4871 * Check whether the requested stack size fits into:
4872 * - TASK_SIZE
4873 * If it doesn't, we limit the size to TASK_SIZE.
4874 *
4875 * - the remaining sample size
4876 * If it doesn't, we shrink the stack size to
4877 * fit into the remaining sample size.
4878 */
4879
4880 task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
4881 stack_size = min(stack_size, (u16) task_size);
4882
4883 /* Current header size plus static size and dynamic size. */
4884 header_size += 2 * sizeof(u64);
4885
4886 /* Do we fit in with the current stack dump size? */
4887 if ((u16) (header_size + stack_size) < header_size) {
4888 /*
4889 * If we overflow the maximum size for the sample,
4890 * we customize the stack dump size to fit in.
4891 */
4892 stack_size = USHRT_MAX - header_size - sizeof(u64);
4893 stack_size = round_up(stack_size, sizeof(u64));
4894 }
4895
4896 return stack_size;
4897}
4898
4899static void
4900perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
4901 struct pt_regs *regs)
4902{
4903 /* Case of a kernel thread, nothing to dump */
4904 if (!regs) {
4905 u64 size = 0;
4906 perf_output_put(handle, size);
4907 } else {
4908 unsigned long sp;
4909 unsigned int rem;
4910 u64 dyn_size;
4911
4912 /*
4913 * We dump:
4914 * static size
4915 * - the size requested by the user, or the best one we
4916 * can fit into the sample max size
4917 * data
4918 * - user stack dump data
4919 * dynamic size
4920 * - the actual dumped size
4921 */
4922
4923 /* Static size. */
4924 perf_output_put(handle, dump_size);
4925
4926 /* Data. */
4927 sp = perf_user_stack_pointer(regs);
4928 rem = __output_copy_user(handle, (void *) sp, dump_size);
4929 dyn_size = dump_size - rem;
4930
4931 perf_output_skip(handle, rem);
4932
4933 /* Dynamic size. */
4934 perf_output_put(handle, dyn_size);
4935 }
4936}
4937
c980d109
ACM
4938static void __perf_event_header__init_id(struct perf_event_header *header,
4939 struct perf_sample_data *data,
4940 struct perf_event *event)
6844c09d
ACM
4941{
4942 u64 sample_type = event->attr.sample_type;
4943
4944 data->type = sample_type;
4945 header->size += event->id_header_size;
4946
4947 if (sample_type & PERF_SAMPLE_TID) {
4948 /* namespace issues */
4949 data->tid_entry.pid = perf_event_pid(event, current);
4950 data->tid_entry.tid = perf_event_tid(event, current);
4951 }
4952
4953 if (sample_type & PERF_SAMPLE_TIME)
34f43927 4954 data->time = perf_event_clock(event);
6844c09d 4955
ff3d527c 4956 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
6844c09d
ACM
4957 data->id = primary_event_id(event);
4958
4959 if (sample_type & PERF_SAMPLE_STREAM_ID)
4960 data->stream_id = event->id;
4961
4962 if (sample_type & PERF_SAMPLE_CPU) {
4963 data->cpu_entry.cpu = raw_smp_processor_id();
4964 data->cpu_entry.reserved = 0;
4965 }
4966}
4967
76369139
FW
4968void perf_event_header__init_id(struct perf_event_header *header,
4969 struct perf_sample_data *data,
4970 struct perf_event *event)
c980d109
ACM
4971{
4972 if (event->attr.sample_id_all)
4973 __perf_event_header__init_id(header, data, event);
4974}
4975
4976static void __perf_event__output_id_sample(struct perf_output_handle *handle,
4977 struct perf_sample_data *data)
4978{
4979 u64 sample_type = data->type;
4980
4981 if (sample_type & PERF_SAMPLE_TID)
4982 perf_output_put(handle, data->tid_entry);
4983
4984 if (sample_type & PERF_SAMPLE_TIME)
4985 perf_output_put(handle, data->time);
4986
4987 if (sample_type & PERF_SAMPLE_ID)
4988 perf_output_put(handle, data->id);
4989
4990 if (sample_type & PERF_SAMPLE_STREAM_ID)
4991 perf_output_put(handle, data->stream_id);
4992
4993 if (sample_type & PERF_SAMPLE_CPU)
4994 perf_output_put(handle, data->cpu_entry);
ff3d527c
AH
4995
4996 if (sample_type & PERF_SAMPLE_IDENTIFIER)
4997 perf_output_put(handle, data->id);
c980d109
ACM
4998}
4999
76369139
FW
5000void perf_event__output_id_sample(struct perf_event *event,
5001 struct perf_output_handle *handle,
5002 struct perf_sample_data *sample)
c980d109
ACM
5003{
5004 if (event->attr.sample_id_all)
5005 __perf_event__output_id_sample(handle, sample);
5006}
5007
3dab77fb 5008static void perf_output_read_one(struct perf_output_handle *handle,
eed01528
SE
5009 struct perf_event *event,
5010 u64 enabled, u64 running)
3dab77fb 5011{
cdd6c482 5012 u64 read_format = event->attr.read_format;
3dab77fb
PZ
5013 u64 values[4];
5014 int n = 0;
5015
b5e58793 5016 values[n++] = perf_event_count(event);
3dab77fb 5017 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
eed01528 5018 values[n++] = enabled +
cdd6c482 5019 atomic64_read(&event->child_total_time_enabled);
3dab77fb
PZ
5020 }
5021 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
eed01528 5022 values[n++] = running +
cdd6c482 5023 atomic64_read(&event->child_total_time_running);
3dab77fb
PZ
5024 }
5025 if (read_format & PERF_FORMAT_ID)
cdd6c482 5026 values[n++] = primary_event_id(event);
3dab77fb 5027
76369139 5028 __output_copy(handle, values, n * sizeof(u64));
3dab77fb
PZ
5029}
5030
5031/*
cdd6c482 5032 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
3dab77fb
PZ
5033 */
5034static void perf_output_read_group(struct perf_output_handle *handle,
eed01528
SE
5035 struct perf_event *event,
5036 u64 enabled, u64 running)
3dab77fb 5037{
cdd6c482
IM
5038 struct perf_event *leader = event->group_leader, *sub;
5039 u64 read_format = event->attr.read_format;
3dab77fb
PZ
5040 u64 values[5];
5041 int n = 0;
5042
5043 values[n++] = 1 + leader->nr_siblings;
5044
5045 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
eed01528 5046 values[n++] = enabled;
3dab77fb
PZ
5047
5048 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
eed01528 5049 values[n++] = running;
3dab77fb 5050
cdd6c482 5051 if (leader != event)
3dab77fb
PZ
5052 leader->pmu->read(leader);
5053
b5e58793 5054 values[n++] = perf_event_count(leader);
3dab77fb 5055 if (read_format & PERF_FORMAT_ID)
cdd6c482 5056 values[n++] = primary_event_id(leader);
3dab77fb 5057
76369139 5058 __output_copy(handle, values, n * sizeof(u64));
3dab77fb 5059
65abc865 5060 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3dab77fb
PZ
5061 n = 0;
5062
6f5ab001
JO
5063 if ((sub != event) &&
5064 (sub->state == PERF_EVENT_STATE_ACTIVE))
3dab77fb
PZ
5065 sub->pmu->read(sub);
5066
b5e58793 5067 values[n++] = perf_event_count(sub);
3dab77fb 5068 if (read_format & PERF_FORMAT_ID)
cdd6c482 5069 values[n++] = primary_event_id(sub);
3dab77fb 5070
76369139 5071 __output_copy(handle, values, n * sizeof(u64));
3dab77fb
PZ
5072 }
5073}
5074
eed01528
SE
5075#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
5076 PERF_FORMAT_TOTAL_TIME_RUNNING)
5077
3dab77fb 5078static void perf_output_read(struct perf_output_handle *handle,
cdd6c482 5079 struct perf_event *event)
3dab77fb 5080{
e3f3541c 5081 u64 enabled = 0, running = 0, now;
eed01528
SE
5082 u64 read_format = event->attr.read_format;
5083
5084 /*
5085 * compute total_time_enabled, total_time_running
5086 * based on snapshot values taken when the event
5087 * was last scheduled in.
5088 *
5089 * we cannot simply call update_context_time()
5090 * because of a locking issue, as we are called in
5091 * NMI context
5092 */
c4794295 5093 if (read_format & PERF_FORMAT_TOTAL_TIMES)
e3f3541c 5094 calc_timer_values(event, &now, &enabled, &running);
eed01528 5095
cdd6c482 5096 if (event->attr.read_format & PERF_FORMAT_GROUP)
eed01528 5097 perf_output_read_group(handle, event, enabled, running);
3dab77fb 5098 else
eed01528 5099 perf_output_read_one(handle, event, enabled, running);
3dab77fb
PZ
5100}
5101
5622f295
MM
5102void perf_output_sample(struct perf_output_handle *handle,
5103 struct perf_event_header *header,
5104 struct perf_sample_data *data,
cdd6c482 5105 struct perf_event *event)
5622f295
MM
5106{
5107 u64 sample_type = data->type;
5108
5109 perf_output_put(handle, *header);
5110
ff3d527c
AH
5111 if (sample_type & PERF_SAMPLE_IDENTIFIER)
5112 perf_output_put(handle, data->id);
5113
5622f295
MM
5114 if (sample_type & PERF_SAMPLE_IP)
5115 perf_output_put(handle, data->ip);
5116
5117 if (sample_type & PERF_SAMPLE_TID)
5118 perf_output_put(handle, data->tid_entry);
5119
5120 if (sample_type & PERF_SAMPLE_TIME)
5121 perf_output_put(handle, data->time);
5122
5123 if (sample_type & PERF_SAMPLE_ADDR)
5124 perf_output_put(handle, data->addr);
5125
5126 if (sample_type & PERF_SAMPLE_ID)
5127 perf_output_put(handle, data->id);
5128
5129 if (sample_type & PERF_SAMPLE_STREAM_ID)
5130 perf_output_put(handle, data->stream_id);
5131
5132 if (sample_type & PERF_SAMPLE_CPU)
5133 perf_output_put(handle, data->cpu_entry);
5134
5135 if (sample_type & PERF_SAMPLE_PERIOD)
5136 perf_output_put(handle, data->period);
5137
5138 if (sample_type & PERF_SAMPLE_READ)
cdd6c482 5139 perf_output_read(handle, event);
5622f295
MM
5140
5141 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5142 if (data->callchain) {
5143 int size = 1;
5144
5145 if (data->callchain)
5146 size += data->callchain->nr;
5147
5148 size *= sizeof(u64);
5149
76369139 5150 __output_copy(handle, data->callchain, size);
5622f295
MM
5151 } else {
5152 u64 nr = 0;
5153 perf_output_put(handle, nr);
5154 }
5155 }
5156
5157 if (sample_type & PERF_SAMPLE_RAW) {
5158 if (data->raw) {
5159 perf_output_put(handle, data->raw->size);
76369139
FW
5160 __output_copy(handle, data->raw->data,
5161 data->raw->size);
5622f295
MM
5162 } else {
5163 struct {
5164 u32 size;
5165 u32 data;
5166 } raw = {
5167 .size = sizeof(u32),
5168 .data = 0,
5169 };
5170 perf_output_put(handle, raw);
5171 }
5172 }
a7ac67ea 5173
bce38cd5
SE
5174 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5175 if (data->br_stack) {
5176 size_t size;
5177
5178 size = data->br_stack->nr
5179 * sizeof(struct perf_branch_entry);
5180
5181 perf_output_put(handle, data->br_stack->nr);
5182 perf_output_copy(handle, data->br_stack->entries, size);
5183 } else {
5184 /*
5185 * we always store at least the value of nr
5186 */
5187 u64 nr = 0;
5188 perf_output_put(handle, nr);
5189 }
5190 }
4018994f
JO
5191
5192 if (sample_type & PERF_SAMPLE_REGS_USER) {
5193 u64 abi = data->regs_user.abi;
5194
5195 /*
5196 * If there are no regs to dump, notice it through
5197 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
5198 */
5199 perf_output_put(handle, abi);
5200
5201 if (abi) {
5202 u64 mask = event->attr.sample_regs_user;
5203 perf_output_sample_regs(handle,
5204 data->regs_user.regs,
5205 mask);
5206 }
5207 }
c5ebcedb 5208
a5cdd40c 5209 if (sample_type & PERF_SAMPLE_STACK_USER) {
c5ebcedb
JO
5210 perf_output_sample_ustack(handle,
5211 data->stack_user_size,
5212 data->regs_user.regs);
a5cdd40c 5213 }
c3feedf2
AK
5214
5215 if (sample_type & PERF_SAMPLE_WEIGHT)
5216 perf_output_put(handle, data->weight);
d6be9ad6
SE
5217
5218 if (sample_type & PERF_SAMPLE_DATA_SRC)
5219 perf_output_put(handle, data->data_src.val);
a5cdd40c 5220
fdfbbd07
AK
5221 if (sample_type & PERF_SAMPLE_TRANSACTION)
5222 perf_output_put(handle, data->txn);
5223
60e2364e
SE
5224 if (sample_type & PERF_SAMPLE_REGS_INTR) {
5225 u64 abi = data->regs_intr.abi;
5226 /*
5227 * If there are no regs to dump, notice it through
5228 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
5229 */
5230 perf_output_put(handle, abi);
5231
5232 if (abi) {
5233 u64 mask = event->attr.sample_regs_intr;
5234
5235 perf_output_sample_regs(handle,
5236 data->regs_intr.regs,
5237 mask);
5238 }
5239 }
5240
a5cdd40c
PZ
5241 if (!event->attr.watermark) {
5242 int wakeup_events = event->attr.wakeup_events;
5243
5244 if (wakeup_events) {
5245 struct ring_buffer *rb = handle->rb;
5246 int events = local_inc_return(&rb->events);
5247
5248 if (events >= wakeup_events) {
5249 local_sub(wakeup_events, &rb->events);
5250 local_inc(&rb->wakeup);
5251 }
5252 }
5253 }
5622f295
MM
5254}
5255
5256void perf_prepare_sample(struct perf_event_header *header,
5257 struct perf_sample_data *data,
cdd6c482 5258 struct perf_event *event,
5622f295 5259 struct pt_regs *regs)
7b732a75 5260{
cdd6c482 5261 u64 sample_type = event->attr.sample_type;
7b732a75 5262
cdd6c482 5263 header->type = PERF_RECORD_SAMPLE;
c320c7b7 5264 header->size = sizeof(*header) + event->header_size;
5622f295
MM
5265
5266 header->misc = 0;
5267 header->misc |= perf_misc_flags(regs);
6fab0192 5268
c980d109 5269 __perf_event_header__init_id(header, data, event);
6844c09d 5270
c320c7b7 5271 if (sample_type & PERF_SAMPLE_IP)
5622f295
MM
5272 data->ip = perf_instruction_pointer(regs);
5273
b23f3325 5274 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5622f295 5275 int size = 1;
394ee076 5276
e6dab5ff 5277 data->callchain = perf_callchain(event, regs);
5622f295
MM
5278
5279 if (data->callchain)
5280 size += data->callchain->nr;
5281
5282 header->size += size * sizeof(u64);
394ee076
PZ
5283 }
5284
3a43ce68 5285 if (sample_type & PERF_SAMPLE_RAW) {
a044560c
PZ
5286 int size = sizeof(u32);
5287
5288 if (data->raw)
5289 size += data->raw->size;
5290 else
5291 size += sizeof(u32);
5292
5293 WARN_ON_ONCE(size & (sizeof(u64)-1));
5622f295 5294 header->size += size;
7f453c24 5295 }
bce38cd5
SE
5296
5297 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5298 int size = sizeof(u64); /* nr */
5299 if (data->br_stack) {
5300 size += data->br_stack->nr
5301 * sizeof(struct perf_branch_entry);
5302 }
5303 header->size += size;
5304 }
4018994f 5305
2565711f 5306 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
88a7c26a
AL
5307 perf_sample_regs_user(&data->regs_user, regs,
5308 &data->regs_user_copy);
2565711f 5309
4018994f
JO
5310 if (sample_type & PERF_SAMPLE_REGS_USER) {
5311 /* regs dump ABI info */
5312 int size = sizeof(u64);
5313
4018994f
JO
5314 if (data->regs_user.regs) {
5315 u64 mask = event->attr.sample_regs_user;
5316 size += hweight64(mask) * sizeof(u64);
5317 }
5318
5319 header->size += size;
5320 }
c5ebcedb
JO
5321
5322 if (sample_type & PERF_SAMPLE_STACK_USER) {
5323 /*
5324 * Either we need PERF_SAMPLE_STACK_USER bit to be allways
5325 * processed as the last one or have additional check added
5326 * in case new sample type is added, because we could eat
5327 * up the rest of the sample size.
5328 */
c5ebcedb
JO
5329 u16 stack_size = event->attr.sample_stack_user;
5330 u16 size = sizeof(u64);
5331
c5ebcedb 5332 stack_size = perf_sample_ustack_size(stack_size, header->size,
2565711f 5333 data->regs_user.regs);
c5ebcedb
JO
5334
5335 /*
5336 * If there is something to dump, add space for the dump
5337 * itself and for the field that tells the dynamic size,
5338 * which is how many have been actually dumped.
5339 */
5340 if (stack_size)
5341 size += sizeof(u64) + stack_size;
5342
5343 data->stack_user_size = stack_size;
5344 header->size += size;
5345 }
60e2364e
SE
5346
5347 if (sample_type & PERF_SAMPLE_REGS_INTR) {
5348 /* regs dump ABI info */
5349 int size = sizeof(u64);
5350
5351 perf_sample_regs_intr(&data->regs_intr, regs);
5352
5353 if (data->regs_intr.regs) {
5354 u64 mask = event->attr.sample_regs_intr;
5355
5356 size += hweight64(mask) * sizeof(u64);
5357 }
5358
5359 header->size += size;
5360 }
5622f295 5361}
7f453c24 5362
a8b0ca17 5363static void perf_event_output(struct perf_event *event,
5622f295
MM
5364 struct perf_sample_data *data,
5365 struct pt_regs *regs)
5366{
5367 struct perf_output_handle handle;
5368 struct perf_event_header header;
689802b2 5369
927c7a9e
FW
5370 /* protect the callchain buffers */
5371 rcu_read_lock();
5372
cdd6c482 5373 perf_prepare_sample(&header, data, event, regs);
5c148194 5374
a7ac67ea 5375 if (perf_output_begin(&handle, event, header.size))
927c7a9e 5376 goto exit;
0322cd6e 5377
cdd6c482 5378 perf_output_sample(&handle, &header, data, event);
f413cdb8 5379
8a057d84 5380 perf_output_end(&handle);
927c7a9e
FW
5381
5382exit:
5383 rcu_read_unlock();
0322cd6e
PZ
5384}
5385
38b200d6 5386/*
cdd6c482 5387 * read event_id
38b200d6
PZ
5388 */
5389
5390struct perf_read_event {
5391 struct perf_event_header header;
5392
5393 u32 pid;
5394 u32 tid;
38b200d6
PZ
5395};
5396
5397static void
cdd6c482 5398perf_event_read_event(struct perf_event *event,
38b200d6
PZ
5399 struct task_struct *task)
5400{
5401 struct perf_output_handle handle;
c980d109 5402 struct perf_sample_data sample;
dfc65094 5403 struct perf_read_event read_event = {
38b200d6 5404 .header = {
cdd6c482 5405 .type = PERF_RECORD_READ,
38b200d6 5406 .misc = 0,
c320c7b7 5407 .size = sizeof(read_event) + event->read_size,
38b200d6 5408 },
cdd6c482
IM
5409 .pid = perf_event_pid(event, task),
5410 .tid = perf_event_tid(event, task),
38b200d6 5411 };
3dab77fb 5412 int ret;
38b200d6 5413
c980d109 5414 perf_event_header__init_id(&read_event.header, &sample, event);
a7ac67ea 5415 ret = perf_output_begin(&handle, event, read_event.header.size);
38b200d6
PZ
5416 if (ret)
5417 return;
5418
dfc65094 5419 perf_output_put(&handle, read_event);
cdd6c482 5420 perf_output_read(&handle, event);
c980d109 5421 perf_event__output_id_sample(event, &handle, &sample);
3dab77fb 5422
38b200d6
PZ
5423 perf_output_end(&handle);
5424}
5425
52d857a8
JO
5426typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
5427
5428static void
5429perf_event_aux_ctx(struct perf_event_context *ctx,
52d857a8
JO
5430 perf_event_aux_output_cb output,
5431 void *data)
5432{
5433 struct perf_event *event;
5434
5435 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
5436 if (event->state < PERF_EVENT_STATE_INACTIVE)
5437 continue;
5438 if (!event_filter_match(event))
5439 continue;
67516844 5440 output(event, data);
52d857a8
JO
5441 }
5442}
5443
5444static void
67516844 5445perf_event_aux(perf_event_aux_output_cb output, void *data,
52d857a8
JO
5446 struct perf_event_context *task_ctx)
5447{
5448 struct perf_cpu_context *cpuctx;
5449 struct perf_event_context *ctx;
5450 struct pmu *pmu;
5451 int ctxn;
5452
5453 rcu_read_lock();
5454 list_for_each_entry_rcu(pmu, &pmus, entry) {
5455 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
5456 if (cpuctx->unique_pmu != pmu)
5457 goto next;
67516844 5458 perf_event_aux_ctx(&cpuctx->ctx, output, data);
52d857a8
JO
5459 if (task_ctx)
5460 goto next;
5461 ctxn = pmu->task_ctx_nr;
5462 if (ctxn < 0)
5463 goto next;
5464 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
5465 if (ctx)
67516844 5466 perf_event_aux_ctx(ctx, output, data);
52d857a8
JO
5467next:
5468 put_cpu_ptr(pmu->pmu_cpu_context);
5469 }
5470
5471 if (task_ctx) {
5472 preempt_disable();
67516844 5473 perf_event_aux_ctx(task_ctx, output, data);
52d857a8
JO
5474 preempt_enable();
5475 }
5476 rcu_read_unlock();
5477}
5478
60313ebe 5479/*
9f498cc5
PZ
5480 * task tracking -- fork/exit
5481 *
13d7a241 5482 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
60313ebe
PZ
5483 */
5484
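/*
 * Editorial sketch (not part of core.c): a minimal userspace counterpart to
 * the side-band tracking above, assuming a Linux system with perf_event_open()
 * available.  It opens a dummy software event with the attribute bits listed
 * in the comment set, so the kernel emits PERF_RECORD_FORK/EXIT/COMM/MMAP
 * records into the event's ring buffer (mapping and reading the ring buffer
 * is omitted here).
 */
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int open_task_tracking(pid_t pid)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_DUMMY;	/* side-band records only */
	attr.task = 1;				/* fork/exit */
	attr.comm = 1;				/* comm changes, exec */
	attr.mmap = 1;				/* executable mmaps */
	attr.sample_id_all = 1;

	/* no glibc wrapper: go through syscall(2) */
	return syscall(__NR_perf_event_open, &attr, pid, -1 /* any cpu */, -1, 0);
}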
9f498cc5 5485struct perf_task_event {
3a80b4a3 5486 struct task_struct *task;
cdd6c482 5487 struct perf_event_context *task_ctx;
60313ebe
PZ
5488
5489 struct {
5490 struct perf_event_header header;
5491
5492 u32 pid;
5493 u32 ppid;
9f498cc5
PZ
5494 u32 tid;
5495 u32 ptid;
393b2ad8 5496 u64 time;
cdd6c482 5497 } event_id;
60313ebe
PZ
5498};
5499
67516844
JO
5500static int perf_event_task_match(struct perf_event *event)
5501{
13d7a241
SE
5502 return event->attr.comm || event->attr.mmap ||
5503 event->attr.mmap2 || event->attr.mmap_data ||
5504 event->attr.task;
67516844
JO
5505}
5506
cdd6c482 5507static void perf_event_task_output(struct perf_event *event,
52d857a8 5508 void *data)
60313ebe 5509{
52d857a8 5510 struct perf_task_event *task_event = data;
60313ebe 5511 struct perf_output_handle handle;
c980d109 5512 struct perf_sample_data sample;
9f498cc5 5513 struct task_struct *task = task_event->task;
c980d109 5514 int ret, size = task_event->event_id.header.size;
8bb39f9a 5515
67516844
JO
5516 if (!perf_event_task_match(event))
5517 return;
5518
c980d109 5519 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
60313ebe 5520
c980d109 5521 ret = perf_output_begin(&handle, event,
a7ac67ea 5522 task_event->event_id.header.size);
ef60777c 5523 if (ret)
c980d109 5524 goto out;
60313ebe 5525
cdd6c482
IM
5526 task_event->event_id.pid = perf_event_pid(event, task);
5527 task_event->event_id.ppid = perf_event_pid(event, current);
60313ebe 5528
cdd6c482
IM
5529 task_event->event_id.tid = perf_event_tid(event, task);
5530 task_event->event_id.ptid = perf_event_tid(event, current);
9f498cc5 5531
34f43927
PZ
5532 task_event->event_id.time = perf_event_clock(event);
5533
cdd6c482 5534 perf_output_put(&handle, task_event->event_id);
393b2ad8 5535
c980d109
ACM
5536 perf_event__output_id_sample(event, &handle, &sample);
5537
60313ebe 5538 perf_output_end(&handle);
c980d109
ACM
5539out:
5540 task_event->event_id.header.size = size;
60313ebe
PZ
5541}
5542
cdd6c482
IM
5543static void perf_event_task(struct task_struct *task,
5544 struct perf_event_context *task_ctx,
3a80b4a3 5545 int new)
60313ebe 5546{
9f498cc5 5547 struct perf_task_event task_event;
60313ebe 5548
cdd6c482
IM
5549 if (!atomic_read(&nr_comm_events) &&
5550 !atomic_read(&nr_mmap_events) &&
5551 !atomic_read(&nr_task_events))
60313ebe
PZ
5552 return;
5553
9f498cc5 5554 task_event = (struct perf_task_event){
3a80b4a3
PZ
5555 .task = task,
5556 .task_ctx = task_ctx,
cdd6c482 5557 .event_id = {
60313ebe 5558 .header = {
cdd6c482 5559 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
573402db 5560 .misc = 0,
cdd6c482 5561 .size = sizeof(task_event.event_id),
60313ebe 5562 },
573402db
PZ
5563 /* .pid */
5564 /* .ppid */
9f498cc5
PZ
5565 /* .tid */
5566 /* .ptid */
34f43927 5567 /* .time */
60313ebe
PZ
5568 },
5569 };
5570
67516844 5571 perf_event_aux(perf_event_task_output,
52d857a8
JO
5572 &task_event,
5573 task_ctx);
9f498cc5
PZ
5574}
5575
cdd6c482 5576void perf_event_fork(struct task_struct *task)
9f498cc5 5577{
cdd6c482 5578 perf_event_task(task, NULL, 1);
60313ebe
PZ
5579}
5580
8d1b2d93
PZ
5581/*
5582 * comm tracking
5583 */
5584
5585struct perf_comm_event {
22a4f650
IM
5586 struct task_struct *task;
5587 char *comm;
8d1b2d93
PZ
5588 int comm_size;
5589
5590 struct {
5591 struct perf_event_header header;
5592
5593 u32 pid;
5594 u32 tid;
cdd6c482 5595 } event_id;
8d1b2d93
PZ
5596};
5597
67516844
JO
5598static int perf_event_comm_match(struct perf_event *event)
5599{
5600 return event->attr.comm;
5601}
5602
cdd6c482 5603static void perf_event_comm_output(struct perf_event *event,
52d857a8 5604 void *data)
8d1b2d93 5605{
52d857a8 5606 struct perf_comm_event *comm_event = data;
8d1b2d93 5607 struct perf_output_handle handle;
c980d109 5608 struct perf_sample_data sample;
cdd6c482 5609 int size = comm_event->event_id.header.size;
c980d109
ACM
5610 int ret;
5611
67516844
JO
5612 if (!perf_event_comm_match(event))
5613 return;
5614
c980d109
ACM
5615 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
5616 ret = perf_output_begin(&handle, event,
a7ac67ea 5617 comm_event->event_id.header.size);
8d1b2d93
PZ
5618
5619 if (ret)
c980d109 5620 goto out;
8d1b2d93 5621
cdd6c482
IM
5622 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
5623 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
709e50cf 5624
cdd6c482 5625 perf_output_put(&handle, comm_event->event_id);
76369139 5626 __output_copy(&handle, comm_event->comm,
8d1b2d93 5627 comm_event->comm_size);
c980d109
ACM
5628
5629 perf_event__output_id_sample(event, &handle, &sample);
5630
8d1b2d93 5631 perf_output_end(&handle);
c980d109
ACM
5632out:
5633 comm_event->event_id.header.size = size;
8d1b2d93
PZ
5634}
5635
cdd6c482 5636static void perf_event_comm_event(struct perf_comm_event *comm_event)
8d1b2d93 5637{
413ee3b4 5638 char comm[TASK_COMM_LEN];
8d1b2d93 5639 unsigned int size;
8d1b2d93 5640
413ee3b4 5641 memset(comm, 0, sizeof(comm));
96b02d78 5642 strlcpy(comm, comm_event->task->comm, sizeof(comm));
888fcee0 5643 size = ALIGN(strlen(comm)+1, sizeof(u64));
8d1b2d93
PZ
5644
5645 comm_event->comm = comm;
5646 comm_event->comm_size = size;
5647
cdd6c482 5648 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
8dc85d54 5649
67516844 5650 perf_event_aux(perf_event_comm_output,
52d857a8
JO
5651 comm_event,
5652 NULL);
8d1b2d93
PZ
5653}
5654
82b89778 5655void perf_event_comm(struct task_struct *task, bool exec)
8d1b2d93 5656{
9ee318a7
PZ
5657 struct perf_comm_event comm_event;
5658
cdd6c482 5659 if (!atomic_read(&nr_comm_events))
9ee318a7 5660 return;
a63eaf34 5661
9ee318a7 5662 comm_event = (struct perf_comm_event){
8d1b2d93 5663 .task = task,
573402db
PZ
5664 /* .comm */
5665 /* .comm_size */
cdd6c482 5666 .event_id = {
573402db 5667 .header = {
cdd6c482 5668 .type = PERF_RECORD_COMM,
82b89778 5669 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
573402db
PZ
5670 /* .size */
5671 },
5672 /* .pid */
5673 /* .tid */
8d1b2d93
PZ
5674 },
5675 };
5676
cdd6c482 5677 perf_event_comm_event(&comm_event);
8d1b2d93
PZ
5678}
5679
0a4a9391
PZ
5680/*
5681 * mmap tracking
5682 */
5683
5684struct perf_mmap_event {
089dd79d
PZ
5685 struct vm_area_struct *vma;
5686
5687 const char *file_name;
5688 int file_size;
13d7a241
SE
5689 int maj, min;
5690 u64 ino;
5691 u64 ino_generation;
f972eb63 5692 u32 prot, flags;
0a4a9391
PZ
5693
5694 struct {
5695 struct perf_event_header header;
5696
5697 u32 pid;
5698 u32 tid;
5699 u64 start;
5700 u64 len;
5701 u64 pgoff;
cdd6c482 5702 } event_id;
0a4a9391
PZ
5703};
5704
67516844
JO
5705static int perf_event_mmap_match(struct perf_event *event,
5706 void *data)
5707{
5708 struct perf_mmap_event *mmap_event = data;
5709 struct vm_area_struct *vma = mmap_event->vma;
5710 int executable = vma->vm_flags & VM_EXEC;
5711
5712 return (!executable && event->attr.mmap_data) ||
13d7a241 5713 (executable && (event->attr.mmap || event->attr.mmap2));
67516844
JO
5714}
5715
cdd6c482 5716static void perf_event_mmap_output(struct perf_event *event,
52d857a8 5717 void *data)
0a4a9391 5718{
52d857a8 5719 struct perf_mmap_event *mmap_event = data;
0a4a9391 5720 struct perf_output_handle handle;
c980d109 5721 struct perf_sample_data sample;
cdd6c482 5722 int size = mmap_event->event_id.header.size;
c980d109 5723 int ret;
0a4a9391 5724
67516844
JO
5725 if (!perf_event_mmap_match(event, data))
5726 return;
5727
13d7a241
SE
5728 if (event->attr.mmap2) {
5729 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
5730 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
5731 mmap_event->event_id.header.size += sizeof(mmap_event->min);
5732 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
d008d525 5733 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
f972eb63
PZ
5734 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
5735 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
13d7a241
SE
5736 }
5737
c980d109
ACM
5738 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
5739 ret = perf_output_begin(&handle, event,
a7ac67ea 5740 mmap_event->event_id.header.size);
0a4a9391 5741 if (ret)
c980d109 5742 goto out;
0a4a9391 5743
cdd6c482
IM
5744 mmap_event->event_id.pid = perf_event_pid(event, current);
5745 mmap_event->event_id.tid = perf_event_tid(event, current);
709e50cf 5746
cdd6c482 5747 perf_output_put(&handle, mmap_event->event_id);
13d7a241
SE
5748
5749 if (event->attr.mmap2) {
5750 perf_output_put(&handle, mmap_event->maj);
5751 perf_output_put(&handle, mmap_event->min);
5752 perf_output_put(&handle, mmap_event->ino);
5753 perf_output_put(&handle, mmap_event->ino_generation);
f972eb63
PZ
5754 perf_output_put(&handle, mmap_event->prot);
5755 perf_output_put(&handle, mmap_event->flags);
13d7a241
SE
5756 }
5757
76369139 5758 __output_copy(&handle, mmap_event->file_name,
0a4a9391 5759 mmap_event->file_size);
c980d109
ACM
5760
5761 perf_event__output_id_sample(event, &handle, &sample);
5762
78d613eb 5763 perf_output_end(&handle);
c980d109
ACM
5764out:
5765 mmap_event->event_id.header.size = size;
0a4a9391
PZ
5766}
5767
cdd6c482 5768static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
0a4a9391 5769{
089dd79d
PZ
5770 struct vm_area_struct *vma = mmap_event->vma;
5771 struct file *file = vma->vm_file;
13d7a241
SE
5772 int maj = 0, min = 0;
5773 u64 ino = 0, gen = 0;
f972eb63 5774 u32 prot = 0, flags = 0;
0a4a9391
PZ
5775 unsigned int size;
5776 char tmp[16];
5777 char *buf = NULL;
2c42cfbf 5778 char *name;
413ee3b4 5779
0a4a9391 5780 if (file) {
13d7a241
SE
5781 struct inode *inode;
5782 dev_t dev;
3ea2f2b9 5783
2c42cfbf 5784 buf = kmalloc(PATH_MAX, GFP_KERNEL);
0a4a9391 5785 if (!buf) {
c7e548b4
ON
5786 name = "//enomem";
5787 goto cpy_name;
0a4a9391 5788 }
413ee3b4 5789 /*
 3ea2f2b9 5790 * d_path() works from the end of the buffer backwards, so we
413ee3b4
AB
5791 * need to add enough zero bytes after the string to handle
5792 * the 64bit alignment we do later.
5793 */
3ea2f2b9 5794 name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
0a4a9391 5795 if (IS_ERR(name)) {
c7e548b4
ON
5796 name = "//toolong";
5797 goto cpy_name;
0a4a9391 5798 }
13d7a241
SE
5799 inode = file_inode(vma->vm_file);
5800 dev = inode->i_sb->s_dev;
5801 ino = inode->i_ino;
5802 gen = inode->i_generation;
5803 maj = MAJOR(dev);
5804 min = MINOR(dev);
f972eb63
PZ
5805
5806 if (vma->vm_flags & VM_READ)
5807 prot |= PROT_READ;
5808 if (vma->vm_flags & VM_WRITE)
5809 prot |= PROT_WRITE;
5810 if (vma->vm_flags & VM_EXEC)
5811 prot |= PROT_EXEC;
5812
5813 if (vma->vm_flags & VM_MAYSHARE)
5814 flags = MAP_SHARED;
5815 else
5816 flags = MAP_PRIVATE;
5817
5818 if (vma->vm_flags & VM_DENYWRITE)
5819 flags |= MAP_DENYWRITE;
5820 if (vma->vm_flags & VM_MAYEXEC)
5821 flags |= MAP_EXECUTABLE;
5822 if (vma->vm_flags & VM_LOCKED)
5823 flags |= MAP_LOCKED;
5824 if (vma->vm_flags & VM_HUGETLB)
5825 flags |= MAP_HUGETLB;
5826
c7e548b4 5827 goto got_name;
0a4a9391 5828 } else {
fbe26abe
JO
5829 if (vma->vm_ops && vma->vm_ops->name) {
5830 name = (char *) vma->vm_ops->name(vma);
5831 if (name)
5832 goto cpy_name;
5833 }
5834
2c42cfbf 5835 name = (char *)arch_vma_name(vma);
c7e548b4
ON
5836 if (name)
5837 goto cpy_name;
089dd79d 5838
32c5fb7e 5839 if (vma->vm_start <= vma->vm_mm->start_brk &&
3af9e859 5840 vma->vm_end >= vma->vm_mm->brk) {
c7e548b4
ON
5841 name = "[heap]";
5842 goto cpy_name;
32c5fb7e
ON
5843 }
5844 if (vma->vm_start <= vma->vm_mm->start_stack &&
3af9e859 5845 vma->vm_end >= vma->vm_mm->start_stack) {
c7e548b4
ON
5846 name = "[stack]";
5847 goto cpy_name;
089dd79d
PZ
5848 }
5849
c7e548b4
ON
5850 name = "//anon";
5851 goto cpy_name;
0a4a9391
PZ
5852 }
5853
c7e548b4
ON
5854cpy_name:
5855 strlcpy(tmp, name, sizeof(tmp));
5856 name = tmp;
0a4a9391 5857got_name:
2c42cfbf
PZ
5858 /*
5859 * Since our buffer works in 8 byte units we need to align our string
5860 * size to a multiple of 8. However, we must guarantee the tail end is
5861 * zero'd out to avoid leaking random bits to userspace.
5862 */
5863 size = strlen(name)+1;
5864 while (!IS_ALIGNED(size, sizeof(u64)))
5865 name[size++] = '\0';
0a4a9391
PZ
5866
5867 mmap_event->file_name = name;
5868 mmap_event->file_size = size;
13d7a241
SE
5869 mmap_event->maj = maj;
5870 mmap_event->min = min;
5871 mmap_event->ino = ino;
5872 mmap_event->ino_generation = gen;
f972eb63
PZ
5873 mmap_event->prot = prot;
5874 mmap_event->flags = flags;
0a4a9391 5875
2fe85427
SE
5876 if (!(vma->vm_flags & VM_EXEC))
5877 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
5878
cdd6c482 5879 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
0a4a9391 5880
67516844 5881 perf_event_aux(perf_event_mmap_output,
52d857a8
JO
5882 mmap_event,
5883 NULL);
665c2142 5884
0a4a9391
PZ
5885 kfree(buf);
5886}
5887
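/*
 * Editorial sketch (not part of core.c): how a userspace consumer might
 * interpret the prot/flags fields of a PERF_RECORD_MMAP2 record, mirroring
 * the vm_flags translation above.  The struct here is hand-rolled and
 * truncated for illustration; real tools take the full layout from the perf
 * ABI headers.  The trailing file name is zero-padded to a multiple of 8
 * bytes, as done in got_name above.
 */
#include <stdio.h>
#include <stdint.h>
#include <sys/mman.h>

struct example_mmap2_tail {
	/* ...header, pid, tid, start, len, pgoff, maj, min, ino, gen... */
	uint32_t prot;
	uint32_t flags;
	/* char filename[]; zero-padded to an 8-byte multiple */
};

static void example_print_mapping(const struct example_mmap2_tail *rec)
{
	printf("%c%c%c %s\n",
	       (rec->prot & PROT_READ)  ? 'r' : '-',
	       (rec->prot & PROT_WRITE) ? 'w' : '-',
	       (rec->prot & PROT_EXEC)  ? 'x' : '-',
	       (rec->flags & MAP_SHARED) ? "shared" : "private");
}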
3af9e859 5888void perf_event_mmap(struct vm_area_struct *vma)
0a4a9391 5889{
9ee318a7
PZ
5890 struct perf_mmap_event mmap_event;
5891
cdd6c482 5892 if (!atomic_read(&nr_mmap_events))
9ee318a7
PZ
5893 return;
5894
5895 mmap_event = (struct perf_mmap_event){
089dd79d 5896 .vma = vma,
573402db
PZ
5897 /* .file_name */
5898 /* .file_size */
cdd6c482 5899 .event_id = {
573402db 5900 .header = {
cdd6c482 5901 .type = PERF_RECORD_MMAP,
39447b38 5902 .misc = PERF_RECORD_MISC_USER,
573402db
PZ
5903 /* .size */
5904 },
5905 /* .pid */
5906 /* .tid */
089dd79d
PZ
5907 .start = vma->vm_start,
5908 .len = vma->vm_end - vma->vm_start,
3a0304e9 5909 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
0a4a9391 5910 },
13d7a241
SE
5911 /* .maj (attr_mmap2 only) */
5912 /* .min (attr_mmap2 only) */
5913 /* .ino (attr_mmap2 only) */
5914 /* .ino_generation (attr_mmap2 only) */
f972eb63
PZ
5915 /* .prot (attr_mmap2 only) */
5916 /* .flags (attr_mmap2 only) */
0a4a9391
PZ
5917 };
5918
cdd6c482 5919 perf_event_mmap_event(&mmap_event);
0a4a9391
PZ
5920}
5921
68db7e98
AS
5922void perf_event_aux_event(struct perf_event *event, unsigned long head,
5923 unsigned long size, u64 flags)
5924{
5925 struct perf_output_handle handle;
5926 struct perf_sample_data sample;
5927 struct perf_aux_event {
5928 struct perf_event_header header;
5929 u64 offset;
5930 u64 size;
5931 u64 flags;
5932 } rec = {
5933 .header = {
5934 .type = PERF_RECORD_AUX,
5935 .misc = 0,
5936 .size = sizeof(rec),
5937 },
5938 .offset = head,
5939 .size = size,
5940 .flags = flags,
5941 };
5942 int ret;
5943
5944 perf_event_header__init_id(&rec.header, &sample, event);
5945 ret = perf_output_begin(&handle, event, rec.header.size);
5946
5947 if (ret)
5948 return;
5949
5950 perf_output_put(&handle, rec);
5951 perf_event__output_id_sample(event, &handle, &sample);
5952
5953 perf_output_end(&handle);
5954}
5955
a78ac325
PZ
5956/*
5957 * IRQ throttle logging
5958 */
5959
cdd6c482 5960static void perf_log_throttle(struct perf_event *event, int enable)
a78ac325
PZ
5961{
5962 struct perf_output_handle handle;
c980d109 5963 struct perf_sample_data sample;
a78ac325
PZ
5964 int ret;
5965
5966 struct {
5967 struct perf_event_header header;
5968 u64 time;
cca3f454 5969 u64 id;
7f453c24 5970 u64 stream_id;
a78ac325
PZ
5971 } throttle_event = {
5972 .header = {
cdd6c482 5973 .type = PERF_RECORD_THROTTLE,
a78ac325
PZ
5974 .misc = 0,
5975 .size = sizeof(throttle_event),
5976 },
34f43927 5977 .time = perf_event_clock(event),
cdd6c482
IM
5978 .id = primary_event_id(event),
5979 .stream_id = event->id,
a78ac325
PZ
5980 };
5981
966ee4d6 5982 if (enable)
cdd6c482 5983 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
966ee4d6 5984
c980d109
ACM
5985 perf_event_header__init_id(&throttle_event.header, &sample, event);
5986
5987 ret = perf_output_begin(&handle, event,
a7ac67ea 5988 throttle_event.header.size);
a78ac325
PZ
5989 if (ret)
5990 return;
5991
5992 perf_output_put(&handle, throttle_event);
c980d109 5993 perf_event__output_id_sample(event, &handle, &sample);
a78ac325
PZ
5994 perf_output_end(&handle);
5995}
5996
ec0d7729
AS
5997static void perf_log_itrace_start(struct perf_event *event)
5998{
5999 struct perf_output_handle handle;
6000 struct perf_sample_data sample;
6001 struct perf_aux_event {
6002 struct perf_event_header header;
6003 u32 pid;
6004 u32 tid;
6005 } rec;
6006 int ret;
6007
6008 if (event->parent)
6009 event = event->parent;
6010
6011 if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
6012 event->hw.itrace_started)
6013 return;
6014
6015 event->hw.itrace_started = 1;
6016
6017 rec.header.type = PERF_RECORD_ITRACE_START;
6018 rec.header.misc = 0;
6019 rec.header.size = sizeof(rec);
6020 rec.pid = perf_event_pid(event, current);
6021 rec.tid = perf_event_tid(event, current);
6022
6023 perf_event_header__init_id(&rec.header, &sample, event);
6024 ret = perf_output_begin(&handle, event, rec.header.size);
6025
6026 if (ret)
6027 return;
6028
6029 perf_output_put(&handle, rec);
6030 perf_event__output_id_sample(event, &handle, &sample);
6031
6032 perf_output_end(&handle);
6033}
6034
f6c7d5fe 6035/*
cdd6c482 6036 * Generic event overflow handling, sampling.
f6c7d5fe
PZ
6037 */
6038
a8b0ca17 6039static int __perf_event_overflow(struct perf_event *event,
5622f295
MM
6040 int throttle, struct perf_sample_data *data,
6041 struct pt_regs *regs)
f6c7d5fe 6042{
cdd6c482
IM
6043 int events = atomic_read(&event->event_limit);
6044 struct hw_perf_event *hwc = &event->hw;
e050e3f0 6045 u64 seq;
79f14641
PZ
6046 int ret = 0;
6047
96398826
PZ
6048 /*
6049 * Non-sampling counters might still use the PMI to fold short
6050 * hardware counters, ignore those.
6051 */
6052 if (unlikely(!is_sampling_event(event)))
6053 return 0;
6054
e050e3f0
SE
6055 seq = __this_cpu_read(perf_throttled_seq);
6056 if (seq != hwc->interrupts_seq) {
6057 hwc->interrupts_seq = seq;
6058 hwc->interrupts = 1;
6059 } else {
6060 hwc->interrupts++;
6061 if (unlikely(throttle
6062 && hwc->interrupts >= max_samples_per_tick)) {
6063 __this_cpu_inc(perf_throttled_count);
163ec435
PZ
6064 hwc->interrupts = MAX_INTERRUPTS;
6065 perf_log_throttle(event, 0);
d84153d6 6066 tick_nohz_full_kick();
a78ac325
PZ
6067 ret = 1;
6068 }
e050e3f0 6069 }
60db5e09 6070
cdd6c482 6071 if (event->attr.freq) {
def0a9b2 6072 u64 now = perf_clock();
abd50713 6073 s64 delta = now - hwc->freq_time_stamp;
bd2b5b12 6074
abd50713 6075 hwc->freq_time_stamp = now;
bd2b5b12 6076
abd50713 6077 if (delta > 0 && delta < 2*TICK_NSEC)
f39d47ff 6078 perf_adjust_period(event, delta, hwc->last_period, true);
bd2b5b12
PZ
6079 }
6080
2023b359
PZ
6081 /*
6082 * XXX event_limit might not quite work as expected on inherited
cdd6c482 6083 * events
2023b359
PZ
6084 */
6085
cdd6c482
IM
6086 event->pending_kill = POLL_IN;
6087 if (events && atomic_dec_and_test(&event->event_limit)) {
79f14641 6088 ret = 1;
cdd6c482 6089 event->pending_kill = POLL_HUP;
a8b0ca17
PZ
6090 event->pending_disable = 1;
6091 irq_work_queue(&event->pending);
79f14641
PZ
6092 }
6093
453f19ee 6094 if (event->overflow_handler)
a8b0ca17 6095 event->overflow_handler(event, data, regs);
453f19ee 6096 else
a8b0ca17 6097 perf_event_output(event, data, regs);
453f19ee 6098
f506b3dc 6099 if (event->fasync && event->pending_kill) {
a8b0ca17
PZ
6100 event->pending_wakeup = 1;
6101 irq_work_queue(&event->pending);
f506b3dc
PZ
6102 }
6103
79f14641 6104 return ret;
f6c7d5fe
PZ
6105}
6106
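/*
 * Editorial sketch (not part of core.c): the throttling budget used above.
 * The kernel derives max_samples_per_tick from the perf_event_max_sample_rate
 * sysctl, roughly samples-per-second divided by timer ticks per second; once
 * an event fires more often than that within one tick it is throttled until
 * the next tick and a PERF_RECORD_THROTTLE record is emitted.  The exact
 * rounding is an assumption here, shown only to illustrate the relationship.
 */
#include <stdint.h>

/* e.g. a 100000 samples/sec limit with HZ=250 allows ~400 samples per tick */
static uint64_t example_samples_per_tick(uint64_t max_sample_rate, uint64_t hz)
{
	return (max_sample_rate + hz - 1) / hz;	/* DIV_ROUND_UP */
}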
a8b0ca17 6107int perf_event_overflow(struct perf_event *event,
5622f295
MM
6108 struct perf_sample_data *data,
6109 struct pt_regs *regs)
850bc73f 6110{
a8b0ca17 6111 return __perf_event_overflow(event, 1, data, regs);
850bc73f
PZ
6112}
6113
15dbf27c 6114/*
cdd6c482 6115 * Generic software event infrastructure
15dbf27c
PZ
6116 */
6117
b28ab83c
PZ
6118struct swevent_htable {
6119 struct swevent_hlist *swevent_hlist;
6120 struct mutex hlist_mutex;
6121 int hlist_refcount;
6122
6123 /* Recursion avoidance in each contexts */
6124 int recursion[PERF_NR_CONTEXTS];
39af6b16
JO
6125
6126 /* Keeps track of cpu being initialized/exited */
6127 bool online;
b28ab83c
PZ
6128};
6129
6130static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
6131
7b4b6658 6132/*
cdd6c482
IM
6133 * We directly increment event->count and keep a second value in
6134 * event->hw.period_left to count intervals. This period event
7b4b6658
PZ
6135 * is kept in the range [-sample_period, 0] so that we can use the
6136 * sign as trigger.
6137 */
6138
ab573844 6139u64 perf_swevent_set_period(struct perf_event *event)
15dbf27c 6140{
cdd6c482 6141 struct hw_perf_event *hwc = &event->hw;
7b4b6658
PZ
6142 u64 period = hwc->last_period;
6143 u64 nr, offset;
6144 s64 old, val;
6145
6146 hwc->last_period = hwc->sample_period;
15dbf27c
PZ
6147
6148again:
e7850595 6149 old = val = local64_read(&hwc->period_left);
7b4b6658
PZ
6150 if (val < 0)
6151 return 0;
15dbf27c 6152
7b4b6658
PZ
6153 nr = div64_u64(period + val, period);
6154 offset = nr * period;
6155 val -= offset;
e7850595 6156 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
7b4b6658 6157 goto again;
15dbf27c 6158
7b4b6658 6159 return nr;
15dbf27c
PZ
6160}
6161
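/*
 * Editorial sketch (not part of core.c): the period bookkeeping above with
 * concrete numbers.  period_left is kept in [-sample_period, 0); software
 * events add their count to it, and once it reaches zero or more the code
 * above folds it back into range and reports how many whole periods elapsed.
 */
#include <stdint.h>

/* e.g. period = 100 and period_left has climbed to +30:
 * nr = (100 + 30) / 100 = 1 overflow, period_left is folded back to -70. */
static uint64_t example_fold_period(int64_t *period_left, uint64_t period)
{
	int64_t val = *period_left;
	uint64_t nr;

	if (val < 0)		/* nothing pending, someone already folded */
		return 0;

	nr = (period + (uint64_t)val) / period;
	*period_left = val - (int64_t)(nr * period);
	return nr;
}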
0cff784a 6162static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
a8b0ca17 6163 struct perf_sample_data *data,
5622f295 6164 struct pt_regs *regs)
15dbf27c 6165{
cdd6c482 6166 struct hw_perf_event *hwc = &event->hw;
850bc73f 6167 int throttle = 0;
15dbf27c 6168
0cff784a
PZ
6169 if (!overflow)
6170 overflow = perf_swevent_set_period(event);
15dbf27c 6171
7b4b6658
PZ
6172 if (hwc->interrupts == MAX_INTERRUPTS)
6173 return;
15dbf27c 6174
7b4b6658 6175 for (; overflow; overflow--) {
a8b0ca17 6176 if (__perf_event_overflow(event, throttle,
5622f295 6177 data, regs)) {
7b4b6658
PZ
6178 /*
6179 * We inhibit the overflow from happening when
6180 * hwc->interrupts == MAX_INTERRUPTS.
6181 */
6182 break;
6183 }
cf450a73 6184 throttle = 1;
7b4b6658 6185 }
15dbf27c
PZ
6186}
6187
a4eaf7f1 6188static void perf_swevent_event(struct perf_event *event, u64 nr,
a8b0ca17 6189 struct perf_sample_data *data,
5622f295 6190 struct pt_regs *regs)
7b4b6658 6191{
cdd6c482 6192 struct hw_perf_event *hwc = &event->hw;
d6d020e9 6193
e7850595 6194 local64_add(nr, &event->count);
d6d020e9 6195
0cff784a
PZ
6196 if (!regs)
6197 return;
6198
6c7e550f 6199 if (!is_sampling_event(event))
7b4b6658 6200 return;
d6d020e9 6201
5d81e5cf
AV
6202 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
6203 data->period = nr;
6204 return perf_swevent_overflow(event, 1, data, regs);
6205 } else
6206 data->period = event->hw.last_period;
6207
0cff784a 6208 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
a8b0ca17 6209 return perf_swevent_overflow(event, 1, data, regs);
0cff784a 6210
e7850595 6211 if (local64_add_negative(nr, &hwc->period_left))
7b4b6658 6212 return;
df1a132b 6213
a8b0ca17 6214 perf_swevent_overflow(event, 0, data, regs);
d6d020e9
PZ
6215}
6216
f5ffe02e
FW
6217static int perf_exclude_event(struct perf_event *event,
6218 struct pt_regs *regs)
6219{
a4eaf7f1 6220 if (event->hw.state & PERF_HES_STOPPED)
91b2f482 6221 return 1;
a4eaf7f1 6222
f5ffe02e
FW
6223 if (regs) {
6224 if (event->attr.exclude_user && user_mode(regs))
6225 return 1;
6226
6227 if (event->attr.exclude_kernel && !user_mode(regs))
6228 return 1;
6229 }
6230
6231 return 0;
6232}
6233
cdd6c482 6234static int perf_swevent_match(struct perf_event *event,
1c432d89 6235 enum perf_type_id type,
6fb2915d
LZ
6236 u32 event_id,
6237 struct perf_sample_data *data,
6238 struct pt_regs *regs)
15dbf27c 6239{
cdd6c482 6240 if (event->attr.type != type)
a21ca2ca 6241 return 0;
f5ffe02e 6242
cdd6c482 6243 if (event->attr.config != event_id)
15dbf27c
PZ
6244 return 0;
6245
f5ffe02e
FW
6246 if (perf_exclude_event(event, regs))
6247 return 0;
15dbf27c
PZ
6248
6249 return 1;
6250}
6251
76e1d904
FW
6252static inline u64 swevent_hash(u64 type, u32 event_id)
6253{
6254 u64 val = event_id | (type << 32);
6255
6256 return hash_64(val, SWEVENT_HLIST_BITS);
6257}
6258
49f135ed
FW
6259static inline struct hlist_head *
6260__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
76e1d904 6261{
49f135ed
FW
6262 u64 hash = swevent_hash(type, event_id);
6263
6264 return &hlist->heads[hash];
6265}
76e1d904 6266
49f135ed
FW
6267/* For the read side: events when they trigger */
6268static inline struct hlist_head *
b28ab83c 6269find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
49f135ed
FW
6270{
6271 struct swevent_hlist *hlist;
76e1d904 6272
b28ab83c 6273 hlist = rcu_dereference(swhash->swevent_hlist);
76e1d904
FW
6274 if (!hlist)
6275 return NULL;
6276
49f135ed
FW
6277 return __find_swevent_head(hlist, type, event_id);
6278}
6279
6280/* For the event head insertion and removal in the hlist */
6281static inline struct hlist_head *
b28ab83c 6282find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
49f135ed
FW
6283{
6284 struct swevent_hlist *hlist;
6285 u32 event_id = event->attr.config;
6286 u64 type = event->attr.type;
6287
6288 /*
6289 * Event scheduling is always serialized against hlist allocation
6290 * and release. Which makes the protected version suitable here.
6291 * The context lock guarantees that.
6292 */
b28ab83c 6293 hlist = rcu_dereference_protected(swhash->swevent_hlist,
49f135ed
FW
6294 lockdep_is_held(&event->ctx->lock));
6295 if (!hlist)
6296 return NULL;
6297
6298 return __find_swevent_head(hlist, type, event_id);
76e1d904
FW
6299}
6300
6301static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
a8b0ca17 6302 u64 nr,
76e1d904
FW
6303 struct perf_sample_data *data,
6304 struct pt_regs *regs)
15dbf27c 6305{
4a32fea9 6306 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
cdd6c482 6307 struct perf_event *event;
76e1d904 6308 struct hlist_head *head;
15dbf27c 6309
76e1d904 6310 rcu_read_lock();
b28ab83c 6311 head = find_swevent_head_rcu(swhash, type, event_id);
76e1d904
FW
6312 if (!head)
6313 goto end;
6314
b67bfe0d 6315 hlist_for_each_entry_rcu(event, head, hlist_entry) {
6fb2915d 6316 if (perf_swevent_match(event, type, event_id, data, regs))
a8b0ca17 6317 perf_swevent_event(event, nr, data, regs);
15dbf27c 6318 }
76e1d904
FW
6319end:
6320 rcu_read_unlock();
15dbf27c
PZ
6321}
6322
86038c5e
PZI
6323DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
6324
4ed7c92d 6325int perf_swevent_get_recursion_context(void)
96f6d444 6326{
4a32fea9 6327 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
96f6d444 6328
b28ab83c 6329 return get_recursion_context(swhash->recursion);
96f6d444 6330}
645e8cc0 6331EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
96f6d444 6332
fa9f90be 6333inline void perf_swevent_put_recursion_context(int rctx)
15dbf27c 6334{
4a32fea9 6335 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
927c7a9e 6336
b28ab83c 6337 put_recursion_context(swhash->recursion, rctx);
ce71b9df 6338}
15dbf27c 6339
86038c5e 6340void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
b8e83514 6341{
a4234bfc 6342 struct perf_sample_data data;
4ed7c92d 6343
86038c5e 6344 if (WARN_ON_ONCE(!regs))
4ed7c92d 6345 return;
a4234bfc 6346
fd0d000b 6347 perf_sample_data_init(&data, addr, 0);
a8b0ca17 6348 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
86038c5e
PZI
6349}
6350
6351void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
6352{
6353 int rctx;
6354
6355 preempt_disable_notrace();
6356 rctx = perf_swevent_get_recursion_context();
6357 if (unlikely(rctx < 0))
6358 goto fail;
6359
6360 ___perf_sw_event(event_id, nr, regs, addr);
4ed7c92d
PZ
6361
6362 perf_swevent_put_recursion_context(rctx);
86038c5e 6363fail:
1c024eca 6364 preempt_enable_notrace();
b8e83514
PZ
6365}
6366
cdd6c482 6367static void perf_swevent_read(struct perf_event *event)
15dbf27c 6368{
15dbf27c
PZ
6369}
6370
a4eaf7f1 6371static int perf_swevent_add(struct perf_event *event, int flags)
15dbf27c 6372{
4a32fea9 6373 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
cdd6c482 6374 struct hw_perf_event *hwc = &event->hw;
76e1d904
FW
6375 struct hlist_head *head;
6376
6c7e550f 6377 if (is_sampling_event(event)) {
7b4b6658 6378 hwc->last_period = hwc->sample_period;
cdd6c482 6379 perf_swevent_set_period(event);
7b4b6658 6380 }
76e1d904 6381
a4eaf7f1
PZ
6382 hwc->state = !(flags & PERF_EF_START);
6383
b28ab83c 6384 head = find_swevent_head(swhash, event);
39af6b16
JO
6385 if (!head) {
6386 /*
6387 * We can race with cpu hotplug code. Do not
6388 * WARN if the cpu just got unplugged.
6389 */
6390 WARN_ON_ONCE(swhash->online);
76e1d904 6391 return -EINVAL;
39af6b16 6392 }
76e1d904
FW
6393
6394 hlist_add_head_rcu(&event->hlist_entry, head);
6a694a60 6395 perf_event_update_userpage(event);
76e1d904 6396
15dbf27c
PZ
6397 return 0;
6398}
6399
a4eaf7f1 6400static void perf_swevent_del(struct perf_event *event, int flags)
15dbf27c 6401{
76e1d904 6402 hlist_del_rcu(&event->hlist_entry);
15dbf27c
PZ
6403}
6404
a4eaf7f1 6405static void perf_swevent_start(struct perf_event *event, int flags)
5c92d124 6406{
a4eaf7f1 6407 event->hw.state = 0;
d6d020e9 6408}
aa9c4c0f 6409
a4eaf7f1 6410static void perf_swevent_stop(struct perf_event *event, int flags)
d6d020e9 6411{
a4eaf7f1 6412 event->hw.state = PERF_HES_STOPPED;
bae43c99
IM
6413}
6414
49f135ed
FW
6415/* Deref the hlist from the update side */
6416static inline struct swevent_hlist *
b28ab83c 6417swevent_hlist_deref(struct swevent_htable *swhash)
49f135ed 6418{
b28ab83c
PZ
6419 return rcu_dereference_protected(swhash->swevent_hlist,
6420 lockdep_is_held(&swhash->hlist_mutex));
49f135ed
FW
6421}
6422
b28ab83c 6423static void swevent_hlist_release(struct swevent_htable *swhash)
76e1d904 6424{
b28ab83c 6425 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
76e1d904 6426
49f135ed 6427 if (!hlist)
76e1d904
FW
6428 return;
6429
70691d4a 6430 RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
fa4bbc4c 6431 kfree_rcu(hlist, rcu_head);
76e1d904
FW
6432}
6433
6434static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
6435{
b28ab83c 6436 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
76e1d904 6437
b28ab83c 6438 mutex_lock(&swhash->hlist_mutex);
76e1d904 6439
b28ab83c
PZ
6440 if (!--swhash->hlist_refcount)
6441 swevent_hlist_release(swhash);
76e1d904 6442
b28ab83c 6443 mutex_unlock(&swhash->hlist_mutex);
76e1d904
FW
6444}
6445
6446static void swevent_hlist_put(struct perf_event *event)
6447{
6448 int cpu;
6449
76e1d904
FW
6450 for_each_possible_cpu(cpu)
6451 swevent_hlist_put_cpu(event, cpu);
6452}
6453
6454static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
6455{
b28ab83c 6456 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
76e1d904
FW
6457 int err = 0;
6458
b28ab83c 6459 mutex_lock(&swhash->hlist_mutex);
76e1d904 6460
b28ab83c 6461 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
76e1d904
FW
6462 struct swevent_hlist *hlist;
6463
6464 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
6465 if (!hlist) {
6466 err = -ENOMEM;
6467 goto exit;
6468 }
b28ab83c 6469 rcu_assign_pointer(swhash->swevent_hlist, hlist);
76e1d904 6470 }
b28ab83c 6471 swhash->hlist_refcount++;
9ed6060d 6472exit:
b28ab83c 6473 mutex_unlock(&swhash->hlist_mutex);
76e1d904
FW
6474
6475 return err;
6476}
6477
6478static int swevent_hlist_get(struct perf_event *event)
6479{
6480 int err;
6481 int cpu, failed_cpu;
6482
76e1d904
FW
6483 get_online_cpus();
6484 for_each_possible_cpu(cpu) {
6485 err = swevent_hlist_get_cpu(event, cpu);
6486 if (err) {
6487 failed_cpu = cpu;
6488 goto fail;
6489 }
6490 }
6491 put_online_cpus();
6492
6493 return 0;
9ed6060d 6494fail:
76e1d904
FW
6495 for_each_possible_cpu(cpu) {
6496 if (cpu == failed_cpu)
6497 break;
6498 swevent_hlist_put_cpu(event, cpu);
6499 }
6500
6501 put_online_cpus();
6502 return err;
6503}
6504
c5905afb 6505struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
95476b64 6506
b0a873eb
PZ
6507static void sw_perf_event_destroy(struct perf_event *event)
6508{
6509 u64 event_id = event->attr.config;
95476b64 6510
b0a873eb
PZ
6511 WARN_ON(event->parent);
6512
c5905afb 6513 static_key_slow_dec(&perf_swevent_enabled[event_id]);
b0a873eb
PZ
6514 swevent_hlist_put(event);
6515}
6516
6517static int perf_swevent_init(struct perf_event *event)
6518{
8176cced 6519 u64 event_id = event->attr.config;
b0a873eb
PZ
6520
6521 if (event->attr.type != PERF_TYPE_SOFTWARE)
6522 return -ENOENT;
6523
2481c5fa
SE
6524 /*
6525 * no branch sampling for software events
6526 */
6527 if (has_branch_stack(event))
6528 return -EOPNOTSUPP;
6529
b0a873eb
PZ
6530 switch (event_id) {
6531 case PERF_COUNT_SW_CPU_CLOCK:
6532 case PERF_COUNT_SW_TASK_CLOCK:
6533 return -ENOENT;
6534
6535 default:
6536 break;
6537 }
6538
ce677831 6539 if (event_id >= PERF_COUNT_SW_MAX)
b0a873eb
PZ
6540 return -ENOENT;
6541
6542 if (!event->parent) {
6543 int err;
6544
6545 err = swevent_hlist_get(event);
6546 if (err)
6547 return err;
6548
c5905afb 6549 static_key_slow_inc(&perf_swevent_enabled[event_id]);
b0a873eb
PZ
6550 event->destroy = sw_perf_event_destroy;
6551 }
6552
6553 return 0;
6554}
6555
6556static struct pmu perf_swevent = {
89a1e187 6557 .task_ctx_nr = perf_sw_context,
95476b64 6558
34f43927
PZ
6559 .capabilities = PERF_PMU_CAP_NO_NMI,
6560
b0a873eb 6561 .event_init = perf_swevent_init,
a4eaf7f1
PZ
6562 .add = perf_swevent_add,
6563 .del = perf_swevent_del,
6564 .start = perf_swevent_start,
6565 .stop = perf_swevent_stop,
1c024eca 6566 .read = perf_swevent_read,
1c024eca
PZ
6567};
6568
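/*
 * Editorial sketch (not part of core.c): counting one of the software events
 * served by the pmu above from userspace.  Assumes Linux with perf_event_open()
 * available; error handling is trimmed for brevity.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_PAGE_FAULTS;

	fd = syscall(__NR_perf_event_open, &attr, 0 /* self */, -1, -1, 0);
	if (fd < 0)
		return 1;

	/* ... run the workload to be measured ... */

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("page faults: %llu\n", (unsigned long long)count);
	close(fd);
	return 0;
}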
b0a873eb
PZ
6569#ifdef CONFIG_EVENT_TRACING
6570
1c024eca
PZ
6571static int perf_tp_filter_match(struct perf_event *event,
6572 struct perf_sample_data *data)
6573{
6574 void *record = data->raw->data;
6575
6576 if (likely(!event->filter) || filter_match_preds(event->filter, record))
6577 return 1;
6578 return 0;
6579}
6580
6581static int perf_tp_event_match(struct perf_event *event,
6582 struct perf_sample_data *data,
6583 struct pt_regs *regs)
6584{
a0f7d0f7
FW
6585 if (event->hw.state & PERF_HES_STOPPED)
6586 return 0;
580d607c
PZ
6587 /*
6588 * All tracepoints are from kernel-space.
6589 */
6590 if (event->attr.exclude_kernel)
1c024eca
PZ
6591 return 0;
6592
6593 if (!perf_tp_filter_match(event, data))
6594 return 0;
6595
6596 return 1;
6597}
6598
6599void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
e6dab5ff
AV
6600 struct pt_regs *regs, struct hlist_head *head, int rctx,
6601 struct task_struct *task)
95476b64
FW
6602{
6603 struct perf_sample_data data;
1c024eca 6604 struct perf_event *event;
1c024eca 6605
95476b64
FW
6606 struct perf_raw_record raw = {
6607 .size = entry_size,
6608 .data = record,
6609 };
6610
fd0d000b 6611 perf_sample_data_init(&data, addr, 0);
95476b64
FW
6612 data.raw = &raw;
6613
b67bfe0d 6614 hlist_for_each_entry_rcu(event, head, hlist_entry) {
1c024eca 6615 if (perf_tp_event_match(event, &data, regs))
a8b0ca17 6616 perf_swevent_event(event, count, &data, regs);
4f41c013 6617 }
ecc55f84 6618
e6dab5ff
AV
6619 /*
 6620 * If we were given a target task, also iterate its context and
6621 * deliver this event there too.
6622 */
6623 if (task && task != current) {
6624 struct perf_event_context *ctx;
6625 struct trace_entry *entry = record;
6626
6627 rcu_read_lock();
6628 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
6629 if (!ctx)
6630 goto unlock;
6631
6632 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
6633 if (event->attr.type != PERF_TYPE_TRACEPOINT)
6634 continue;
6635 if (event->attr.config != entry->type)
6636 continue;
6637 if (perf_tp_event_match(event, &data, regs))
6638 perf_swevent_event(event, count, &data, regs);
6639 }
6640unlock:
6641 rcu_read_unlock();
6642 }
6643
ecc55f84 6644 perf_swevent_put_recursion_context(rctx);
95476b64
FW
6645}
6646EXPORT_SYMBOL_GPL(perf_tp_event);
6647
cdd6c482 6648static void tp_perf_event_destroy(struct perf_event *event)
e077df4f 6649{
1c024eca 6650 perf_trace_destroy(event);
e077df4f
PZ
6651}
6652
b0a873eb 6653static int perf_tp_event_init(struct perf_event *event)
e077df4f 6654{
76e1d904
FW
6655 int err;
6656
b0a873eb
PZ
6657 if (event->attr.type != PERF_TYPE_TRACEPOINT)
6658 return -ENOENT;
6659
2481c5fa
SE
6660 /*
6661 * no branch sampling for tracepoint events
6662 */
6663 if (has_branch_stack(event))
6664 return -EOPNOTSUPP;
6665
1c024eca
PZ
6666 err = perf_trace_init(event);
6667 if (err)
b0a873eb 6668 return err;
e077df4f 6669
cdd6c482 6670 event->destroy = tp_perf_event_destroy;
e077df4f 6671
b0a873eb
PZ
6672 return 0;
6673}
6674
6675static struct pmu perf_tracepoint = {
89a1e187
PZ
6676 .task_ctx_nr = perf_sw_context,
6677
b0a873eb 6678 .event_init = perf_tp_event_init,
a4eaf7f1
PZ
6679 .add = perf_trace_add,
6680 .del = perf_trace_del,
6681 .start = perf_swevent_start,
6682 .stop = perf_swevent_stop,
b0a873eb 6683 .read = perf_swevent_read,
b0a873eb
PZ
6684};
6685
6686static inline void perf_tp_register(void)
6687{
2e80a82a 6688 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
e077df4f 6689}
6fb2915d
LZ
6690
6691static int perf_event_set_filter(struct perf_event *event, void __user *arg)
6692{
6693 char *filter_str;
6694 int ret;
6695
6696 if (event->attr.type != PERF_TYPE_TRACEPOINT)
6697 return -EINVAL;
6698
6699 filter_str = strndup_user(arg, PAGE_SIZE);
6700 if (IS_ERR(filter_str))
6701 return PTR_ERR(filter_str);
6702
6703 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
6704
6705 kfree(filter_str);
6706 return ret;
6707}
6708
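/*
 * Editorial sketch (not part of core.c): opening a tracepoint event and
 * installing a filter string through the ioctl handled by
 * perf_event_set_filter() above.  The tracefs path, the chosen tracepoint and
 * the filter expression are only examples.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int open_filtered_tracepoint(void)
{
	struct perf_event_attr attr;
	unsigned long long id = 0;
	FILE *f;
	int fd;

	f = fopen("/sys/kernel/debug/tracing/events/syscalls/sys_enter_openat/id", "r");
	if (!f)
		return -1;
	if (fscanf(f, "%llu", &id) != 1) {
		fclose(f);
		return -1;
	}
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_TRACEPOINT;
	attr.config = id;
	attr.sample_period = 1;

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		return -1;

	/* only events whose fields match the expression are delivered */
	if (ioctl(fd, PERF_EVENT_IOC_SET_FILTER, "flags == 0") < 0)
		perror("PERF_EVENT_IOC_SET_FILTER");

	return fd;
}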
6709static void perf_event_free_filter(struct perf_event *event)
6710{
6711 ftrace_profile_free_filter(event);
6712}
6713
2541517c
AS
6714static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
6715{
6716 struct bpf_prog *prog;
6717
6718 if (event->attr.type != PERF_TYPE_TRACEPOINT)
6719 return -EINVAL;
6720
6721 if (event->tp_event->prog)
6722 return -EEXIST;
6723
6724 if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
6725 /* bpf programs can only be attached to kprobes */
6726 return -EINVAL;
6727
6728 prog = bpf_prog_get(prog_fd);
6729 if (IS_ERR(prog))
6730 return PTR_ERR(prog);
6731
6c373ca8 6732 if (prog->type != BPF_PROG_TYPE_KPROBE) {
2541517c
AS
6733 /* valid fd, but invalid bpf program type */
6734 bpf_prog_put(prog);
6735 return -EINVAL;
6736 }
6737
6738 event->tp_event->prog = prog;
6739
6740 return 0;
6741}
6742
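/*
 * Editorial sketch (not part of core.c): the userspace side of the hook
 * above.  It assumes a kprobe-backed tracepoint event has already been opened
 * (event_fd) and a BPF_PROG_TYPE_KPROBE program has already been loaded via
 * the bpf() syscall (prog_fd); attaching is then a single ioctl.
 */
#include <sys/ioctl.h>
#include <linux/perf_event.h>

static int example_attach_bpf(int event_fd, int prog_fd)
{
	/* rejected with -EINVAL unless event_fd is a kprobe tracepoint */
	return ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
}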
6743static void perf_event_free_bpf_prog(struct perf_event *event)
6744{
6745 struct bpf_prog *prog;
6746
6747 if (!event->tp_event)
6748 return;
6749
6750 prog = event->tp_event->prog;
6751 if (prog) {
6752 event->tp_event->prog = NULL;
6753 bpf_prog_put(prog);
6754 }
6755}
6756
e077df4f 6757#else
6fb2915d 6758
b0a873eb 6759static inline void perf_tp_register(void)
e077df4f 6760{
e077df4f 6761}
6fb2915d
LZ
6762
6763static int perf_event_set_filter(struct perf_event *event, void __user *arg)
6764{
6765 return -ENOENT;
6766}
6767
6768static void perf_event_free_filter(struct perf_event *event)
6769{
6770}
6771
2541517c
AS
6772static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
6773{
6774 return -ENOENT;
6775}
6776
6777static void perf_event_free_bpf_prog(struct perf_event *event)
6778{
6779}
07b139c8 6780#endif /* CONFIG_EVENT_TRACING */
e077df4f 6781
24f1e32c 6782#ifdef CONFIG_HAVE_HW_BREAKPOINT
f5ffe02e 6783void perf_bp_event(struct perf_event *bp, void *data)
24f1e32c 6784{
f5ffe02e
FW
6785 struct perf_sample_data sample;
6786 struct pt_regs *regs = data;
6787
fd0d000b 6788 perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
f5ffe02e 6789
a4eaf7f1 6790 if (!bp->hw.state && !perf_exclude_event(bp, regs))
a8b0ca17 6791 perf_swevent_event(bp, 1, &sample, regs);
24f1e32c
FW
6792}
6793#endif
6794
b0a873eb
PZ
6795/*
6796 * hrtimer based swevent callback
6797 */
f29ac756 6798
b0a873eb 6799static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
f29ac756 6800{
b0a873eb
PZ
6801 enum hrtimer_restart ret = HRTIMER_RESTART;
6802 struct perf_sample_data data;
6803 struct pt_regs *regs;
6804 struct perf_event *event;
6805 u64 period;
f29ac756 6806
b0a873eb 6807 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
ba3dd36c
PZ
6808
6809 if (event->state != PERF_EVENT_STATE_ACTIVE)
6810 return HRTIMER_NORESTART;
6811
b0a873eb 6812 event->pmu->read(event);
f344011c 6813
fd0d000b 6814 perf_sample_data_init(&data, 0, event->hw.last_period);
b0a873eb
PZ
6815 regs = get_irq_regs();
6816
6817 if (regs && !perf_exclude_event(event, regs)) {
77aeeebd 6818 if (!(event->attr.exclude_idle && is_idle_task(current)))
33b07b8b 6819 if (__perf_event_overflow(event, 1, &data, regs))
b0a873eb
PZ
6820 ret = HRTIMER_NORESTART;
6821 }
24f1e32c 6822
b0a873eb
PZ
6823 period = max_t(u64, 10000, event->hw.sample_period);
6824 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
24f1e32c 6825
b0a873eb 6826 return ret;
f29ac756
PZ
6827}
6828
b0a873eb 6829static void perf_swevent_start_hrtimer(struct perf_event *event)
5c92d124 6830{
b0a873eb 6831 struct hw_perf_event *hwc = &event->hw;
5d508e82
FBH
6832 s64 period;
6833
6834 if (!is_sampling_event(event))
6835 return;
f5ffe02e 6836
5d508e82
FBH
6837 period = local64_read(&hwc->period_left);
6838 if (period) {
6839 if (period < 0)
6840 period = 10000;
fa407f35 6841
5d508e82
FBH
6842 local64_set(&hwc->period_left, 0);
6843 } else {
6844 period = max_t(u64, 10000, hwc->sample_period);
6845 }
6846 __hrtimer_start_range_ns(&hwc->hrtimer,
b0a873eb 6847 ns_to_ktime(period), 0,
b5ab4cd5 6848 HRTIMER_MODE_REL_PINNED, 0);
24f1e32c 6849}
b0a873eb
PZ
6850
6851static void perf_swevent_cancel_hrtimer(struct perf_event *event)
24f1e32c 6852{
b0a873eb
PZ
6853 struct hw_perf_event *hwc = &event->hw;
6854
6c7e550f 6855 if (is_sampling_event(event)) {
b0a873eb 6856 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
fa407f35 6857 local64_set(&hwc->period_left, ktime_to_ns(remaining));
b0a873eb
PZ
6858
6859 hrtimer_cancel(&hwc->hrtimer);
6860 }
24f1e32c
FW
6861}
6862
ba3dd36c
PZ
6863static void perf_swevent_init_hrtimer(struct perf_event *event)
6864{
6865 struct hw_perf_event *hwc = &event->hw;
6866
6867 if (!is_sampling_event(event))
6868 return;
6869
6870 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
6871 hwc->hrtimer.function = perf_swevent_hrtimer;
6872
6873 /*
6874 * Since hrtimers have a fixed rate, we can do a static freq->period
6875 * mapping and avoid the whole period adjust feedback stuff.
6876 */
6877 if (event->attr.freq) {
6878 long freq = event->attr.sample_freq;
6879
6880 event->attr.sample_period = NSEC_PER_SEC / freq;
6881 hwc->sample_period = event->attr.sample_period;
6882 local64_set(&hwc->period_left, hwc->sample_period);
778141e3 6883 hwc->last_period = hwc->sample_period;
ba3dd36c
PZ
6884 event->attr.freq = 0;
6885 }
6886}
6887
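/*
 * Editorial sketch (not part of core.c): the static freq->period mapping done
 * above, with a concrete number.  A request for 4000 samples/sec becomes a
 * fixed hrtimer period of NSEC_PER_SEC / 4000 = 250000 ns.
 */
#include <stdint.h>

#define EXAMPLE_NSEC_PER_SEC 1000000000ULL

static uint64_t example_freq_to_period_ns(uint64_t sample_freq)
{
	return sample_freq ? EXAMPLE_NSEC_PER_SEC / sample_freq : 0;
}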
b0a873eb
PZ
6888/*
6889 * Software event: cpu wall time clock
6890 */
6891
6892static void cpu_clock_event_update(struct perf_event *event)
24f1e32c 6893{
b0a873eb
PZ
6894 s64 prev;
6895 u64 now;
6896
a4eaf7f1 6897 now = local_clock();
b0a873eb
PZ
6898 prev = local64_xchg(&event->hw.prev_count, now);
6899 local64_add(now - prev, &event->count);
24f1e32c 6900}
24f1e32c 6901
a4eaf7f1 6902static void cpu_clock_event_start(struct perf_event *event, int flags)
b0a873eb 6903{
a4eaf7f1 6904 local64_set(&event->hw.prev_count, local_clock());
b0a873eb 6905 perf_swevent_start_hrtimer(event);
b0a873eb
PZ
6906}
6907
a4eaf7f1 6908static void cpu_clock_event_stop(struct perf_event *event, int flags)
f29ac756 6909{
b0a873eb
PZ
6910 perf_swevent_cancel_hrtimer(event);
6911 cpu_clock_event_update(event);
6912}
f29ac756 6913
a4eaf7f1
PZ
6914static int cpu_clock_event_add(struct perf_event *event, int flags)
6915{
6916 if (flags & PERF_EF_START)
6917 cpu_clock_event_start(event, flags);
6a694a60 6918 perf_event_update_userpage(event);
a4eaf7f1
PZ
6919
6920 return 0;
6921}
6922
6923static void cpu_clock_event_del(struct perf_event *event, int flags)
6924{
6925 cpu_clock_event_stop(event, flags);
6926}
6927
b0a873eb
PZ
6928static void cpu_clock_event_read(struct perf_event *event)
6929{
6930 cpu_clock_event_update(event);
6931}
f344011c 6932
b0a873eb
PZ
6933static int cpu_clock_event_init(struct perf_event *event)
6934{
6935 if (event->attr.type != PERF_TYPE_SOFTWARE)
6936 return -ENOENT;
6937
6938 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
6939 return -ENOENT;
6940
2481c5fa
SE
6941 /*
6942 * no branch sampling for software events
6943 */
6944 if (has_branch_stack(event))
6945 return -EOPNOTSUPP;
6946
ba3dd36c
PZ
6947 perf_swevent_init_hrtimer(event);
6948
b0a873eb 6949 return 0;
f29ac756
PZ
6950}
6951
b0a873eb 6952static struct pmu perf_cpu_clock = {
89a1e187
PZ
6953 .task_ctx_nr = perf_sw_context,
6954
34f43927
PZ
6955 .capabilities = PERF_PMU_CAP_NO_NMI,
6956
b0a873eb 6957 .event_init = cpu_clock_event_init,
a4eaf7f1
PZ
6958 .add = cpu_clock_event_add,
6959 .del = cpu_clock_event_del,
6960 .start = cpu_clock_event_start,
6961 .stop = cpu_clock_event_stop,
b0a873eb
PZ
6962 .read = cpu_clock_event_read,
6963};
6964
6965/*
6966 * Software event: task time clock
6967 */
6968
6969static void task_clock_event_update(struct perf_event *event, u64 now)
5c92d124 6970{
b0a873eb
PZ
6971 u64 prev;
6972 s64 delta;
5c92d124 6973
b0a873eb
PZ
6974 prev = local64_xchg(&event->hw.prev_count, now);
6975 delta = now - prev;
6976 local64_add(delta, &event->count);
6977}
5c92d124 6978
a4eaf7f1 6979static void task_clock_event_start(struct perf_event *event, int flags)
b0a873eb 6980{
a4eaf7f1 6981 local64_set(&event->hw.prev_count, event->ctx->time);
b0a873eb 6982 perf_swevent_start_hrtimer(event);
b0a873eb
PZ
6983}
6984
a4eaf7f1 6985static void task_clock_event_stop(struct perf_event *event, int flags)
b0a873eb
PZ
6986{
6987 perf_swevent_cancel_hrtimer(event);
6988 task_clock_event_update(event, event->ctx->time);
a4eaf7f1
PZ
6989}
6990
6991static int task_clock_event_add(struct perf_event *event, int flags)
6992{
6993 if (flags & PERF_EF_START)
6994 task_clock_event_start(event, flags);
6a694a60 6995 perf_event_update_userpage(event);
b0a873eb 6996
a4eaf7f1
PZ
6997 return 0;
6998}
6999
7000static void task_clock_event_del(struct perf_event *event, int flags)
7001{
7002 task_clock_event_stop(event, PERF_EF_UPDATE);
b0a873eb
PZ
7003}
7004
7005static void task_clock_event_read(struct perf_event *event)
7006{
768a06e2
PZ
7007 u64 now = perf_clock();
7008 u64 delta = now - event->ctx->timestamp;
7009 u64 time = event->ctx->time + delta;
b0a873eb
PZ
7010
7011 task_clock_event_update(event, time);
7012}
7013
7014static int task_clock_event_init(struct perf_event *event)
6fb2915d 7015{
b0a873eb
PZ
7016 if (event->attr.type != PERF_TYPE_SOFTWARE)
7017 return -ENOENT;
7018
7019 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
7020 return -ENOENT;
7021
2481c5fa
SE
7022 /*
7023 * no branch sampling for software events
7024 */
7025 if (has_branch_stack(event))
7026 return -EOPNOTSUPP;
7027
ba3dd36c
PZ
7028 perf_swevent_init_hrtimer(event);
7029
b0a873eb 7030 return 0;
6fb2915d
LZ
7031}
7032
b0a873eb 7033static struct pmu perf_task_clock = {
89a1e187
PZ
7034 .task_ctx_nr = perf_sw_context,
7035
34f43927
PZ
7036 .capabilities = PERF_PMU_CAP_NO_NMI,
7037
b0a873eb 7038 .event_init = task_clock_event_init,
a4eaf7f1
PZ
7039 .add = task_clock_event_add,
7040 .del = task_clock_event_del,
7041 .start = task_clock_event_start,
7042 .stop = task_clock_event_stop,
b0a873eb
PZ
7043 .read = task_clock_event_read,
7044};
6fb2915d 7045
ad5133b7 7046static void perf_pmu_nop_void(struct pmu *pmu)
e077df4f 7047{
e077df4f 7048}
6fb2915d 7049
ad5133b7 7050static int perf_pmu_nop_int(struct pmu *pmu)
6fb2915d 7051{
ad5133b7 7052 return 0;
6fb2915d
LZ
7053}
7054
ad5133b7 7055static void perf_pmu_start_txn(struct pmu *pmu)
6fb2915d 7056{
ad5133b7 7057 perf_pmu_disable(pmu);
6fb2915d
LZ
7058}
7059
ad5133b7
PZ
7060static int perf_pmu_commit_txn(struct pmu *pmu)
7061{
7062 perf_pmu_enable(pmu);
7063 return 0;
7064}
e077df4f 7065
ad5133b7 7066static void perf_pmu_cancel_txn(struct pmu *pmu)
24f1e32c 7067{
ad5133b7 7068 perf_pmu_enable(pmu);
24f1e32c
FW
7069}
7070
35edc2a5
PZ
7071static int perf_event_idx_default(struct perf_event *event)
7072{
c719f560 7073 return 0;
35edc2a5
PZ
7074}
7075
8dc85d54
PZ
7076/*
7077 * Ensures all contexts with the same task_ctx_nr have the same
7078 * pmu_cpu_context too.
7079 */
9e317041 7080static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
24f1e32c 7081{
8dc85d54 7082 struct pmu *pmu;
b326e956 7083
8dc85d54
PZ
7084 if (ctxn < 0)
7085 return NULL;
24f1e32c 7086
8dc85d54
PZ
7087 list_for_each_entry(pmu, &pmus, entry) {
7088 if (pmu->task_ctx_nr == ctxn)
7089 return pmu->pmu_cpu_context;
7090 }
24f1e32c 7091
8dc85d54 7092 return NULL;
24f1e32c
FW
7093}
7094
51676957 7095static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
24f1e32c 7096{
51676957
PZ
7097 int cpu;
7098
7099 for_each_possible_cpu(cpu) {
7100 struct perf_cpu_context *cpuctx;
7101
7102 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
7103
3f1f3320
PZ
7104 if (cpuctx->unique_pmu == old_pmu)
7105 cpuctx->unique_pmu = pmu;
51676957
PZ
7106 }
7107}
7108
7109static void free_pmu_context(struct pmu *pmu)
7110{
7111 struct pmu *i;
f5ffe02e 7112
8dc85d54 7113 mutex_lock(&pmus_lock);
0475f9ea 7114 /*
8dc85d54 7115 * Like a real lame refcount.
0475f9ea 7116 */
51676957
PZ
7117 list_for_each_entry(i, &pmus, entry) {
7118 if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
7119 update_pmu_context(i, pmu);
8dc85d54 7120 goto out;
51676957 7121 }
8dc85d54 7122 }
d6d020e9 7123
51676957 7124 free_percpu(pmu->pmu_cpu_context);
8dc85d54
PZ
7125out:
7126 mutex_unlock(&pmus_lock);
24f1e32c 7127}
2e80a82a 7128static struct idr pmu_idr;
d6d020e9 7129
abe43400
PZ
7130static ssize_t
7131type_show(struct device *dev, struct device_attribute *attr, char *page)
7132{
7133 struct pmu *pmu = dev_get_drvdata(dev);
7134
7135 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
7136}
90826ca7 7137static DEVICE_ATTR_RO(type);
abe43400 7138
62b85639
SE
7139static ssize_t
7140perf_event_mux_interval_ms_show(struct device *dev,
7141 struct device_attribute *attr,
7142 char *page)
7143{
7144 struct pmu *pmu = dev_get_drvdata(dev);
7145
7146 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
7147}
7148
7149static ssize_t
7150perf_event_mux_interval_ms_store(struct device *dev,
7151 struct device_attribute *attr,
7152 const char *buf, size_t count)
7153{
7154 struct pmu *pmu = dev_get_drvdata(dev);
7155 int timer, cpu, ret;
7156
7157 ret = kstrtoint(buf, 0, &timer);
7158 if (ret)
7159 return ret;
7160
7161 if (timer < 1)
7162 return -EINVAL;
7163
 7164 /* same value, nothing to do */
7165 if (timer == pmu->hrtimer_interval_ms)
7166 return count;
7167
7168 pmu->hrtimer_interval_ms = timer;
7169
7170 /* update all cpuctx for this PMU */
7171 for_each_possible_cpu(cpu) {
7172 struct perf_cpu_context *cpuctx;
7173 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
7174 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
7175
7176 if (hrtimer_active(&cpuctx->hrtimer))
7177 hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
7178 }
7179
7180 return count;
7181}
90826ca7 7182static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
62b85639 7183
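/*
 * Editorial sketch (not part of core.c): reading the two attributes exported
 * above from userspace.  PMUs registered with a name show up under
 * /sys/bus/event_source/devices/<name>/; "cpu" is used here only as an
 * example PMU name.
 */
#include <stdio.h>

static int example_read_pmu_attr(const char *pmu, const char *attr, int *val)
{
	char path[256];
	FILE *f;
	int ret;

	snprintf(path, sizeof(path),
		 "/sys/bus/event_source/devices/%s/%s", pmu, attr);
	f = fopen(path, "r");
	if (!f)
		return -1;
	ret = (fscanf(f, "%d", val) == 1) ? 0 : -1;
	fclose(f);
	return ret;
}

/* e.g. example_read_pmu_attr("cpu", "type", &type);
 *      example_read_pmu_attr("cpu", "perf_event_mux_interval_ms", &ms); */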
90826ca7
GKH
7184static struct attribute *pmu_dev_attrs[] = {
7185 &dev_attr_type.attr,
7186 &dev_attr_perf_event_mux_interval_ms.attr,
7187 NULL,
abe43400 7188};
90826ca7 7189ATTRIBUTE_GROUPS(pmu_dev);
abe43400
PZ
7190
7191static int pmu_bus_running;
7192static struct bus_type pmu_bus = {
7193 .name = "event_source",
90826ca7 7194 .dev_groups = pmu_dev_groups,
abe43400
PZ
7195};
7196
7197static void pmu_dev_release(struct device *dev)
7198{
7199 kfree(dev);
7200}
7201
7202static int pmu_dev_alloc(struct pmu *pmu)
7203{
7204 int ret = -ENOMEM;
7205
7206 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
7207 if (!pmu->dev)
7208 goto out;
7209
0c9d42ed 7210 pmu->dev->groups = pmu->attr_groups;
abe43400
PZ
7211 device_initialize(pmu->dev);
7212 ret = dev_set_name(pmu->dev, "%s", pmu->name);
7213 if (ret)
7214 goto free_dev;
7215
7216 dev_set_drvdata(pmu->dev, pmu);
7217 pmu->dev->bus = &pmu_bus;
7218 pmu->dev->release = pmu_dev_release;
7219 ret = device_add(pmu->dev);
7220 if (ret)
7221 goto free_dev;
7222
7223out:
7224 return ret;
7225
7226free_dev:
7227 put_device(pmu->dev);
7228 goto out;
7229}
7230
547e9fd7 7231static struct lock_class_key cpuctx_mutex;
facc4307 7232static struct lock_class_key cpuctx_lock;
547e9fd7 7233
03d8e80b 7234int perf_pmu_register(struct pmu *pmu, const char *name, int type)
24f1e32c 7235{
108b02cf 7236 int cpu, ret;
24f1e32c 7237
b0a873eb 7238 mutex_lock(&pmus_lock);
33696fc0
PZ
7239 ret = -ENOMEM;
7240 pmu->pmu_disable_count = alloc_percpu(int);
7241 if (!pmu->pmu_disable_count)
7242 goto unlock;
f29ac756 7243
2e80a82a
PZ
7244 pmu->type = -1;
7245 if (!name)
7246 goto skip_type;
7247 pmu->name = name;
7248
7249 if (type < 0) {
0e9c3be2
TH
7250 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
7251 if (type < 0) {
7252 ret = type;
2e80a82a
PZ
7253 goto free_pdc;
7254 }
7255 }
7256 pmu->type = type;
7257
abe43400
PZ
7258 if (pmu_bus_running) {
7259 ret = pmu_dev_alloc(pmu);
7260 if (ret)
7261 goto free_idr;
7262 }
7263
2e80a82a 7264skip_type:
8dc85d54
PZ
7265 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
7266 if (pmu->pmu_cpu_context)
7267 goto got_cpu_context;
f29ac756 7268
c4814202 7269 ret = -ENOMEM;
108b02cf
PZ
7270 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
7271 if (!pmu->pmu_cpu_context)
abe43400 7272 goto free_dev;
f344011c 7273
108b02cf
PZ
7274 for_each_possible_cpu(cpu) {
7275 struct perf_cpu_context *cpuctx;
7276
7277 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
eb184479 7278 __perf_event_init_context(&cpuctx->ctx);
547e9fd7 7279 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
facc4307 7280 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
108b02cf 7281 cpuctx->ctx.pmu = pmu;
9e630205
SE
7282
7283 __perf_cpu_hrtimer_init(cpuctx, cpu);
7284
3f1f3320 7285 cpuctx->unique_pmu = pmu;
108b02cf 7286 }
76e1d904 7287
8dc85d54 7288got_cpu_context:
ad5133b7
PZ
7289 if (!pmu->start_txn) {
7290 if (pmu->pmu_enable) {
7291 /*
7292 * If we have pmu_enable/pmu_disable calls, install
7293 * transaction stubs that use that to try and batch
7294 * hardware accesses.
7295 */
7296 pmu->start_txn = perf_pmu_start_txn;
7297 pmu->commit_txn = perf_pmu_commit_txn;
7298 pmu->cancel_txn = perf_pmu_cancel_txn;
7299 } else {
7300 pmu->start_txn = perf_pmu_nop_void;
7301 pmu->commit_txn = perf_pmu_nop_int;
7302 pmu->cancel_txn = perf_pmu_nop_void;
f344011c 7303 }
5c92d124 7304 }
15dbf27c 7305
ad5133b7
PZ
7306 if (!pmu->pmu_enable) {
7307 pmu->pmu_enable = perf_pmu_nop_void;
7308 pmu->pmu_disable = perf_pmu_nop_void;
7309 }
7310
35edc2a5
PZ
7311 if (!pmu->event_idx)
7312 pmu->event_idx = perf_event_idx_default;
7313
b0a873eb 7314 list_add_rcu(&pmu->entry, &pmus);
bed5b25a 7315 atomic_set(&pmu->exclusive_cnt, 0);
33696fc0
PZ
7316 ret = 0;
7317unlock:
b0a873eb
PZ
7318 mutex_unlock(&pmus_lock);
7319
33696fc0 7320 return ret;
108b02cf 7321
abe43400
PZ
7322free_dev:
7323 device_del(pmu->dev);
7324 put_device(pmu->dev);
7325
2e80a82a
PZ
7326free_idr:
7327 if (pmu->type >= PERF_TYPE_MAX)
7328 idr_remove(&pmu_idr, pmu->type);
7329
108b02cf
PZ
7330free_pdc:
7331 free_percpu(pmu->pmu_disable_count);
7332 goto unlock;
f29ac756 7333}
c464c76e 7334EXPORT_SYMBOL_GPL(perf_pmu_register);
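/*
 * Illustrative sketch (not part of this file): the minimal set of callbacks a
 * driver would wire up before calling perf_pmu_register().  The name "my_pmu"
 * and the my_*() callbacks are hypothetical; a real driver programs and reads
 * actual counter hardware in add/del/start/stop/read.
 */
#if 0
#include <linux/perf_event.h>

static int my_event_init(struct perf_event *event)
{
        /* Only claim events explicitly aimed at our dynamically allocated type. */
        if (event->attr.type != event->pmu->type)
                return -ENOENT;
        return 0;
}

static int  my_add(struct perf_event *event, int flags)        { return 0; }
static void my_del(struct perf_event *event, int flags)        { }
static void my_start(struct perf_event *event, int flags)      { }
static void my_stop(struct perf_event *event, int flags)       { }
static void my_read(struct perf_event *event)                  { }

static struct pmu my_pmu = {
        .task_ctx_nr    = perf_sw_context,
        .event_init     = my_event_init,
        .add            = my_add,
        .del            = my_del,
        .start          = my_start,
        .stop           = my_stop,
        .read           = my_read,
};

static int __init my_pmu_init(void)
{
        /* type == -1 requests a dynamic id from pmu_idr, as handled above. */
        return perf_pmu_register(&my_pmu, "my_pmu", -1);
}
#endif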
f29ac756 7335
b0a873eb 7336void perf_pmu_unregister(struct pmu *pmu)
5c92d124 7337{
b0a873eb
PZ
7338 mutex_lock(&pmus_lock);
7339 list_del_rcu(&pmu->entry);
7340 mutex_unlock(&pmus_lock);
5c92d124 7341
0475f9ea 7342 /*
cde8e884
PZ
7343 * We dereference the pmu list under both SRCU and regular RCU, so
7344 * synchronize against both of those.
0475f9ea 7345 */
b0a873eb 7346 synchronize_srcu(&pmus_srcu);
cde8e884 7347 synchronize_rcu();
d6d020e9 7348
33696fc0 7349 free_percpu(pmu->pmu_disable_count);
2e80a82a
PZ
7350 if (pmu->type >= PERF_TYPE_MAX)
7351 idr_remove(&pmu_idr, pmu->type);
abe43400
PZ
7352 device_del(pmu->dev);
7353 put_device(pmu->dev);
51676957 7354 free_pmu_context(pmu);
b0a873eb 7355}
c464c76e 7356EXPORT_SYMBOL_GPL(perf_pmu_unregister);
d6d020e9 7357
cc34b98b
MR
7358static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
7359{
ccd41c86 7360 struct perf_event_context *ctx = NULL;
cc34b98b
MR
7361 int ret;
7362
7363 if (!try_module_get(pmu->module))
7364 return -ENODEV;
ccd41c86
PZ
7365
7366 if (event->group_leader != event) {
7367 ctx = perf_event_ctx_lock(event->group_leader);
7368 BUG_ON(!ctx);
7369 }
7370
cc34b98b
MR
7371 event->pmu = pmu;
7372 ret = pmu->event_init(event);
ccd41c86
PZ
7373
7374 if (ctx)
7375 perf_event_ctx_unlock(event->group_leader, ctx);
7376
cc34b98b
MR
7377 if (ret)
7378 module_put(pmu->module);
7379
7380 return ret;
7381}
7382
b0a873eb
PZ
7383struct pmu *perf_init_event(struct perf_event *event)
7384{
7385 struct pmu *pmu = NULL;
7386 int idx;
940c5b29 7387 int ret;
b0a873eb
PZ
7388
7389 idx = srcu_read_lock(&pmus_srcu);
2e80a82a
PZ
7390
7391 rcu_read_lock();
7392 pmu = idr_find(&pmu_idr, event->attr.type);
7393 rcu_read_unlock();
940c5b29 7394 if (pmu) {
cc34b98b 7395 ret = perf_try_init_event(pmu, event);
940c5b29
LM
7396 if (ret)
7397 pmu = ERR_PTR(ret);
2e80a82a 7398 goto unlock;
940c5b29 7399 }
2e80a82a 7400
b0a873eb 7401 list_for_each_entry_rcu(pmu, &pmus, entry) {
cc34b98b 7402 ret = perf_try_init_event(pmu, event);
b0a873eb 7403 if (!ret)
e5f4d339 7404 goto unlock;
76e1d904 7405
b0a873eb
PZ
7406 if (ret != -ENOENT) {
7407 pmu = ERR_PTR(ret);
e5f4d339 7408 goto unlock;
f344011c 7409 }
5c92d124 7410 }
e5f4d339
PZ
7411 pmu = ERR_PTR(-ENOENT);
7412unlock:
b0a873eb 7413 srcu_read_unlock(&pmus_srcu, idx);
15dbf27c 7414
4aeb0b42 7415 return pmu;
5c92d124
IM
7416}
7417
4beb31f3
FW
7418static void account_event_cpu(struct perf_event *event, int cpu)
7419{
7420 if (event->parent)
7421 return;
7422
4beb31f3
FW
7423 if (is_cgroup_event(event))
7424 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
7425}
7426
766d6c07
FW
7427static void account_event(struct perf_event *event)
7428{
4beb31f3
FW
7429 if (event->parent)
7430 return;
7431
766d6c07
FW
7432 if (event->attach_state & PERF_ATTACH_TASK)
7433 static_key_slow_inc(&perf_sched_events.key);
7434 if (event->attr.mmap || event->attr.mmap_data)
7435 atomic_inc(&nr_mmap_events);
7436 if (event->attr.comm)
7437 atomic_inc(&nr_comm_events);
7438 if (event->attr.task)
7439 atomic_inc(&nr_task_events);
948b26b6
FW
7440 if (event->attr.freq) {
7441 if (atomic_inc_return(&nr_freq_events) == 1)
7442 tick_nohz_full_kick_all();
7443 }
4beb31f3 7444 if (has_branch_stack(event))
766d6c07 7445 static_key_slow_inc(&perf_sched_events.key);
4beb31f3 7446 if (is_cgroup_event(event))
766d6c07 7447 static_key_slow_inc(&perf_sched_events.key);
4beb31f3
FW
7448
7449 account_event_cpu(event, event->cpu);
766d6c07
FW
7450}
7451
0793a61d 7452/*
cdd6c482 7453 * Allocate and initialize an event structure
0793a61d 7454 */
cdd6c482 7455static struct perf_event *
c3f00c70 7456perf_event_alloc(struct perf_event_attr *attr, int cpu,
d580ff86
PZ
7457 struct task_struct *task,
7458 struct perf_event *group_leader,
7459 struct perf_event *parent_event,
4dc0da86 7460 perf_overflow_handler_t overflow_handler,
79dff51e 7461 void *context, int cgroup_fd)
0793a61d 7462{
51b0fe39 7463 struct pmu *pmu;
cdd6c482
IM
7464 struct perf_event *event;
7465 struct hw_perf_event *hwc;
90983b16 7466 long err = -EINVAL;
0793a61d 7467
66832eb4
ON
7468 if ((unsigned)cpu >= nr_cpu_ids) {
7469 if (!task || cpu != -1)
7470 return ERR_PTR(-EINVAL);
7471 }
7472
c3f00c70 7473 event = kzalloc(sizeof(*event), GFP_KERNEL);
cdd6c482 7474 if (!event)
d5d2bc0d 7475 return ERR_PTR(-ENOMEM);
0793a61d 7476
04289bb9 7477 /*
cdd6c482 7478 * Single events are their own group leaders, with an
04289bb9
IM
7479 * empty sibling list:
7480 */
7481 if (!group_leader)
cdd6c482 7482 group_leader = event;
04289bb9 7483
cdd6c482
IM
7484 mutex_init(&event->child_mutex);
7485 INIT_LIST_HEAD(&event->child_list);
fccc714b 7486
cdd6c482
IM
7487 INIT_LIST_HEAD(&event->group_entry);
7488 INIT_LIST_HEAD(&event->event_entry);
7489 INIT_LIST_HEAD(&event->sibling_list);
10c6db11 7490 INIT_LIST_HEAD(&event->rb_entry);
71ad88ef 7491 INIT_LIST_HEAD(&event->active_entry);
f3ae75de
SE
7492 INIT_HLIST_NODE(&event->hlist_entry);
7493
10c6db11 7494
cdd6c482 7495 init_waitqueue_head(&event->waitq);
e360adbe 7496 init_irq_work(&event->pending, perf_pending_event);
0793a61d 7497
cdd6c482 7498 mutex_init(&event->mmap_mutex);
7b732a75 7499
a6fa941d 7500 atomic_long_set(&event->refcount, 1);
cdd6c482
IM
7501 event->cpu = cpu;
7502 event->attr = *attr;
7503 event->group_leader = group_leader;
7504 event->pmu = NULL;
cdd6c482 7505 event->oncpu = -1;
a96bbc16 7506
cdd6c482 7507 event->parent = parent_event;
b84fbc9f 7508
17cf22c3 7509 event->ns = get_pid_ns(task_active_pid_ns(current));
cdd6c482 7510 event->id = atomic64_inc_return(&perf_event_id);
a96bbc16 7511
cdd6c482 7512 event->state = PERF_EVENT_STATE_INACTIVE;
329d876d 7513
d580ff86
PZ
7514 if (task) {
7515 event->attach_state = PERF_ATTACH_TASK;
d580ff86 7516 /*
50f16a8b
PZ
7517 * XXX pmu::event_init needs to know what task to account to
7518 * and we cannot use the ctx information because we need the
7519 * pmu before we get a ctx.
d580ff86 7520 */
50f16a8b 7521 event->hw.target = task;
d580ff86
PZ
7522 }
7523
34f43927
PZ
7524 event->clock = &local_clock;
7525 if (parent_event)
7526 event->clock = parent_event->clock;
7527
4dc0da86 7528 if (!overflow_handler && parent_event) {
b326e956 7529 overflow_handler = parent_event->overflow_handler;
4dc0da86
AK
7530 context = parent_event->overflow_handler_context;
7531 }
66832eb4 7532
b326e956 7533 event->overflow_handler = overflow_handler;
4dc0da86 7534 event->overflow_handler_context = context;
97eaf530 7535
0231bb53 7536 perf_event__state_init(event);
a86ed508 7537
4aeb0b42 7538 pmu = NULL;
b8e83514 7539
cdd6c482 7540 hwc = &event->hw;
bd2b5b12 7541 hwc->sample_period = attr->sample_period;
0d48696f 7542 if (attr->freq && attr->sample_freq)
bd2b5b12 7543 hwc->sample_period = 1;
eced1dfc 7544 hwc->last_period = hwc->sample_period;
bd2b5b12 7545
e7850595 7546 local64_set(&hwc->period_left, hwc->sample_period);
60db5e09 7547
2023b359 7548 /*
cdd6c482 7549 * we currently do not support PERF_FORMAT_GROUP on inherited events
2023b359 7550 */
3dab77fb 7551 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
90983b16 7552 goto err_ns;
a46a2300
YZ
7553
7554 if (!has_branch_stack(event))
7555 event->attr.branch_sample_type = 0;
2023b359 7556
79dff51e
MF
7557 if (cgroup_fd != -1) {
7558 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
7559 if (err)
7560 goto err_ns;
7561 }
7562
b0a873eb 7563 pmu = perf_init_event(event);
4aeb0b42 7564 if (!pmu)
90983b16
FW
7565 goto err_ns;
7566 else if (IS_ERR(pmu)) {
4aeb0b42 7567 err = PTR_ERR(pmu);
90983b16 7568 goto err_ns;
621a01ea 7569 }
d5d2bc0d 7570
bed5b25a
AS
7571 err = exclusive_event_init(event);
7572 if (err)
7573 goto err_pmu;
7574
cdd6c482 7575 if (!event->parent) {
927c7a9e
FW
7576 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
7577 err = get_callchain_buffers();
90983b16 7578 if (err)
bed5b25a 7579 goto err_per_task;
d010b332 7580 }
f344011c 7581 }
9ee318a7 7582
cdd6c482 7583 return event;
90983b16 7584
bed5b25a
AS
7585err_per_task:
7586 exclusive_event_destroy(event);
7587
90983b16
FW
7588err_pmu:
7589 if (event->destroy)
7590 event->destroy(event);
c464c76e 7591 module_put(pmu->module);
90983b16 7592err_ns:
79dff51e
MF
7593 if (is_cgroup_event(event))
7594 perf_detach_cgroup(event);
90983b16
FW
7595 if (event->ns)
7596 put_pid_ns(event->ns);
7597 kfree(event);
7598
7599 return ERR_PTR(err);
0793a61d
TG
7600}
7601
cdd6c482
IM
7602static int perf_copy_attr(struct perf_event_attr __user *uattr,
7603 struct perf_event_attr *attr)
974802ea 7604{
974802ea 7605 u32 size;
cdf8073d 7606 int ret;
974802ea
PZ
7607
7608 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
7609 return -EFAULT;
7610
7611 /*
7612 * Zero the full structure, so that a short copy leaves the remaining fields zeroed.
7613 */
7614 memset(attr, 0, sizeof(*attr));
7615
7616 ret = get_user(size, &uattr->size);
7617 if (ret)
7618 return ret;
7619
7620 if (size > PAGE_SIZE) /* silly large */
7621 goto err_size;
7622
7623 if (!size) /* abi compat */
7624 size = PERF_ATTR_SIZE_VER0;
7625
7626 if (size < PERF_ATTR_SIZE_VER0)
7627 goto err_size;
7628
7629 /*
7630 * If we're handed a bigger struct than we know of,
cdf8073d
IS
7631 * ensure all the unknown bits are 0 - i.e. new
7632 * user-space does not rely on any kernel feature
7633 * extensions we don't know about yet.
974802ea
PZ
7634 */
7635 if (size > sizeof(*attr)) {
cdf8073d
IS
7636 unsigned char __user *addr;
7637 unsigned char __user *end;
7638 unsigned char val;
974802ea 7639
cdf8073d
IS
7640 addr = (void __user *)uattr + sizeof(*attr);
7641 end = (void __user *)uattr + size;
974802ea 7642
cdf8073d 7643 for (; addr < end; addr++) {
974802ea
PZ
7644 ret = get_user(val, addr);
7645 if (ret)
7646 return ret;
7647 if (val)
7648 goto err_size;
7649 }
b3e62e35 7650 size = sizeof(*attr);
974802ea
PZ
7651 }
7652
7653 ret = copy_from_user(attr, uattr, size);
7654 if (ret)
7655 return -EFAULT;
7656
cd757645 7657 if (attr->__reserved_1)
974802ea
PZ
7658 return -EINVAL;
7659
7660 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
7661 return -EINVAL;
7662
7663 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
7664 return -EINVAL;
7665
bce38cd5
SE
7666 if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
7667 u64 mask = attr->branch_sample_type;
7668
7669 /* only using defined bits */
7670 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
7671 return -EINVAL;
7672
7673 /* at least one branch bit must be set */
7674 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
7675 return -EINVAL;
7676
bce38cd5
SE
7677 /* propagate priv level, when not set for branch */
7678 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
7679
7680 /* exclude_kernel checked on syscall entry */
7681 if (!attr->exclude_kernel)
7682 mask |= PERF_SAMPLE_BRANCH_KERNEL;
7683
7684 if (!attr->exclude_user)
7685 mask |= PERF_SAMPLE_BRANCH_USER;
7686
7687 if (!attr->exclude_hv)
7688 mask |= PERF_SAMPLE_BRANCH_HV;
7689 /*
7690 * adjust user setting (for HW filter setup)
7691 */
7692 attr->branch_sample_type = mask;
7693 }
e712209a
SE
7694 /* privileged levels capture (kernel, hv): check permissions */
7695 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
2b923c8f
SE
7696 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
7697 return -EACCES;
bce38cd5 7698 }
4018994f 7699
c5ebcedb 7700 if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
4018994f 7701 ret = perf_reg_validate(attr->sample_regs_user);
c5ebcedb
JO
7702 if (ret)
7703 return ret;
7704 }
7705
7706 if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
7707 if (!arch_perf_have_user_stack_dump())
7708 return -ENOSYS;
7709
7710 /*
7711 * We have __u32 type for the size, but so far
7712 * we can only use __u16 as maximum due to the
7713 * __u16 sample size limit.
7714 */
7715 if (attr->sample_stack_user >= USHRT_MAX)
7716 ret = -EINVAL;
7717 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
7718 ret = -EINVAL;
7719 }
4018994f 7720
60e2364e
SE
7721 if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
7722 ret = perf_reg_validate(attr->sample_regs_intr);
974802ea
PZ
7723out:
7724 return ret;
7725
7726err_size:
7727 put_user(sizeof(*attr), &uattr->size);
7728 ret = -E2BIG;
7729 goto out;
7730}
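/*
 * Illustrative sketch (not part of this file): the userspace half of the
 * sizing handshake enforced by perf_copy_attr() above.  Newer userspace on an
 * older kernel gets -E2BIG (with the kernel's sizeof(attr) written back into
 * uattr->size) unless every byte beyond the kernel's idea of the struct is
 * zero; older userspace on a newer kernel simply passes a smaller size and the
 * memset() above zero-fills the rest.
 */
#if 0
#include <string.h>
#include <linux/perf_event.h>

static void setup_attr(struct perf_event_attr *attr)
{
        memset(attr, 0, sizeof(*attr));         /* unknown fields must be 0 */
        attr->size = sizeof(*attr);             /* declare our ABI revision */
        attr->type = PERF_TYPE_HARDWARE;
        attr->config = PERF_COUNT_HW_INSTRUCTIONS;
        attr->disabled = 1;
        attr->exclude_kernel = 1;
}
#endif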
7731
ac9721f3
PZ
7732static int
7733perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
a4be7c27 7734{
b69cf536 7735 struct ring_buffer *rb = NULL;
a4be7c27
PZ
7736 int ret = -EINVAL;
7737
ac9721f3 7738 if (!output_event)
a4be7c27
PZ
7739 goto set;
7740
ac9721f3
PZ
7741 /* don't allow circular references */
7742 if (event == output_event)
a4be7c27
PZ
7743 goto out;
7744
0f139300
PZ
7745 /*
7746 * Don't allow cross-cpu buffers
7747 */
7748 if (output_event->cpu != event->cpu)
7749 goto out;
7750
7751 /*
76369139 7752 * If it's not a per-cpu rb, it must be the same task.
0f139300
PZ
7753 */
7754 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
7755 goto out;
7756
34f43927
PZ
7757 /*
7758 * Mixing clocks in the same buffer is trouble you don't need.
7759 */
7760 if (output_event->clock != event->clock)
7761 goto out;
7762
45bfb2e5
PZ
7763 /*
7764 * If both events generate aux data, they must be on the same PMU
7765 */
7766 if (has_aux(event) && has_aux(output_event) &&
7767 event->pmu != output_event->pmu)
7768 goto out;
7769
a4be7c27 7770set:
cdd6c482 7771 mutex_lock(&event->mmap_mutex);
ac9721f3
PZ
7772 /* Can't redirect output if we've got an active mmap() */
7773 if (atomic_read(&event->mmap_count))
7774 goto unlock;
a4be7c27 7775
ac9721f3 7776 if (output_event) {
76369139
FW
7777 /* get the rb we want to redirect to */
7778 rb = ring_buffer_get(output_event);
7779 if (!rb)
ac9721f3 7780 goto unlock;
a4be7c27
PZ
7781 }
7782
b69cf536 7783 ring_buffer_attach(event, rb);
9bb5d40c 7784
a4be7c27 7785 ret = 0;
ac9721f3
PZ
7786unlock:
7787 mutex_unlock(&event->mmap_mutex);
7788
a4be7c27 7789out:
a4be7c27
PZ
7790 return ret;
7791}
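/*
 * Illustrative sketch (not part of this file): userspace reaches the function
 * above through the PERF_EVENT_IOC_SET_OUTPUT ioctl (or by passing
 * PERF_FLAG_FD_OUTPUT to perf_event_open()), redirecting one event's records
 * into another event's ring buffer so several events can share a single
 * mmap()ed buffer.  fd_leader and fd_other are hypothetical, already-open perf
 * event fds bound to the same CPU.
 */
#if 0
#include <sys/ioctl.h>
#include <linux/perf_event.h>

static int share_buffer(int fd_leader, int fd_other)
{
        /* fd_other will now write its records into fd_leader's buffer. */
        return ioctl(fd_other, PERF_EVENT_IOC_SET_OUTPUT, fd_leader);
}
#endif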
7792
f63a8daa
PZ
7793static void mutex_lock_double(struct mutex *a, struct mutex *b)
7794{
7795 if (b < a)
7796 swap(a, b);
7797
7798 mutex_lock(a);
7799 mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
7800}
7801
34f43927
PZ
7802static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
7803{
7804 bool nmi_safe = false;
7805
7806 switch (clk_id) {
7807 case CLOCK_MONOTONIC:
7808 event->clock = &ktime_get_mono_fast_ns;
7809 nmi_safe = true;
7810 break;
7811
7812 case CLOCK_MONOTONIC_RAW:
7813 event->clock = &ktime_get_raw_fast_ns;
7814 nmi_safe = true;
7815 break;
7816
7817 case CLOCK_REALTIME:
7818 event->clock = &ktime_get_real_ns;
7819 break;
7820
7821 case CLOCK_BOOTTIME:
7822 event->clock = &ktime_get_boot_ns;
7823 break;
7824
7825 case CLOCK_TAI:
7826 event->clock = &ktime_get_tai_ns;
7827 break;
7828
7829 default:
7830 return -EINVAL;
7831 }
7832
7833 if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
7834 return -EINVAL;
7835
7836 return 0;
7837}
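/*
 * Illustrative sketch (not part of this file): how userspace selects one of
 * the clocks handled by perf_event_set_clock() above, so sample timestamps can
 * be correlated with clock_gettime() readings of the same clockid.
 */
#if 0
#include <time.h>
#include <linux/perf_event.h>

static void use_monotonic_raw(struct perf_event_attr *attr)
{
        attr->use_clockid = 1;
        attr->clockid = CLOCK_MONOTONIC_RAW;
}
#endif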
7838
0793a61d 7839/**
cdd6c482 7840 * sys_perf_event_open - open a performance event, associate it to a task/cpu
9f66a381 7841 *
cdd6c482 7842 * @attr_uptr: event_id type attributes for monitoring/sampling
0793a61d 7843 * @pid: target pid
9f66a381 7844 * @cpu: target cpu
cdd6c482 7845 * @group_fd: group leader event fd
0793a61d 7846 */
cdd6c482
IM
7847SYSCALL_DEFINE5(perf_event_open,
7848 struct perf_event_attr __user *, attr_uptr,
2743a5b0 7849 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
0793a61d 7850{
b04243ef
PZ
7851 struct perf_event *group_leader = NULL, *output_event = NULL;
7852 struct perf_event *event, *sibling;
cdd6c482 7853 struct perf_event_attr attr;
f63a8daa 7854 struct perf_event_context *ctx, *uninitialized_var(gctx);
cdd6c482 7855 struct file *event_file = NULL;
2903ff01 7856 struct fd group = {NULL, 0};
38a81da2 7857 struct task_struct *task = NULL;
89a1e187 7858 struct pmu *pmu;
ea635c64 7859 int event_fd;
b04243ef 7860 int move_group = 0;
dc86cabe 7861 int err;
a21b0b35 7862 int f_flags = O_RDWR;
79dff51e 7863 int cgroup_fd = -1;
0793a61d 7864
2743a5b0 7865 /* for future expandability... */
e5d1367f 7866 if (flags & ~PERF_FLAG_ALL)
2743a5b0
PM
7867 return -EINVAL;
7868
dc86cabe
IM
7869 err = perf_copy_attr(attr_uptr, &attr);
7870 if (err)
7871 return err;
eab656ae 7872
0764771d
PZ
7873 if (!attr.exclude_kernel) {
7874 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
7875 return -EACCES;
7876 }
7877
df58ab24 7878 if (attr.freq) {
cdd6c482 7879 if (attr.sample_freq > sysctl_perf_event_sample_rate)
df58ab24 7880 return -EINVAL;
0819b2e3
PZ
7881 } else {
7882 if (attr.sample_period & (1ULL << 63))
7883 return -EINVAL;
df58ab24
PZ
7884 }
7885
e5d1367f
SE
7886 /*
7887 * In cgroup mode, the pid argument is used to pass the fd
7888 * opened to the cgroup directory in cgroupfs. The cpu argument
7889 * designates the cpu on which to monitor threads from that
7890 * cgroup.
7891 */
7892 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
7893 return -EINVAL;
7894
a21b0b35
YD
7895 if (flags & PERF_FLAG_FD_CLOEXEC)
7896 f_flags |= O_CLOEXEC;
7897
7898 event_fd = get_unused_fd_flags(f_flags);
ea635c64
AV
7899 if (event_fd < 0)
7900 return event_fd;
7901
ac9721f3 7902 if (group_fd != -1) {
2903ff01
AV
7903 err = perf_fget_light(group_fd, &group);
7904 if (err)
d14b12d7 7905 goto err_fd;
2903ff01 7906 group_leader = group.file->private_data;
ac9721f3
PZ
7907 if (flags & PERF_FLAG_FD_OUTPUT)
7908 output_event = group_leader;
7909 if (flags & PERF_FLAG_FD_NO_GROUP)
7910 group_leader = NULL;
7911 }
7912
e5d1367f 7913 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
c6be5a5c
PZ
7914 task = find_lively_task_by_vpid(pid);
7915 if (IS_ERR(task)) {
7916 err = PTR_ERR(task);
7917 goto err_group_fd;
7918 }
7919 }
7920
1f4ee503
PZ
7921 if (task && group_leader &&
7922 group_leader->attr.inherit != attr.inherit) {
7923 err = -EINVAL;
7924 goto err_task;
7925 }
7926
fbfc623f
YZ
7927 get_online_cpus();
7928
79dff51e
MF
7929 if (flags & PERF_FLAG_PID_CGROUP)
7930 cgroup_fd = pid;
7931
4dc0da86 7932 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
79dff51e 7933 NULL, NULL, cgroup_fd);
d14b12d7
SE
7934 if (IS_ERR(event)) {
7935 err = PTR_ERR(event);
1f4ee503 7936 goto err_cpus;
d14b12d7
SE
7937 }
7938
53b25335
VW
7939 if (is_sampling_event(event)) {
7940 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
7941 err = -ENOTSUPP;
7942 goto err_alloc;
7943 }
7944 }
7945
766d6c07
FW
7946 account_event(event);
7947
89a1e187
PZ
7948 /*
7949 * Special case software events and allow them to be part of
7950 * any hardware group.
7951 */
7952 pmu = event->pmu;
b04243ef 7953
34f43927
PZ
7954 if (attr.use_clockid) {
7955 err = perf_event_set_clock(event, attr.clockid);
7956 if (err)
7957 goto err_alloc;
7958 }
7959
b04243ef
PZ
7960 if (group_leader &&
7961 (is_software_event(event) != is_software_event(group_leader))) {
7962 if (is_software_event(event)) {
7963 /*
7964 * If event and group_leader are not both software
7965 * events, and event is, then the group leader is not.
7966 *
7967 * Allow the addition of software events to !software
7968 * groups; this is safe because software events never
7969 * fail to schedule.
7970 */
7971 pmu = group_leader->pmu;
7972 } else if (is_software_event(group_leader) &&
7973 (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
7974 /*
7975 * In case the group is a pure software group, and we
7976 * try to add a hardware event, move the whole group to
7977 * the hardware context.
7978 */
7979 move_group = 1;
7980 }
7981 }
89a1e187
PZ
7982
7983 /*
7984 * Get the target context (task or percpu):
7985 */
4af57ef2 7986 ctx = find_get_context(pmu, task, event);
89a1e187
PZ
7987 if (IS_ERR(ctx)) {
7988 err = PTR_ERR(ctx);
c6be5a5c 7989 goto err_alloc;
89a1e187
PZ
7990 }
7991
bed5b25a
AS
7992 if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
7993 err = -EBUSY;
7994 goto err_context;
7995 }
7996
fd1edb3a
PZ
7997 if (task) {
7998 put_task_struct(task);
7999 task = NULL;
8000 }
8001
ccff286d 8002 /*
cdd6c482 8003 * Look up the group leader (we will attach this event to it):
04289bb9 8004 */
ac9721f3 8005 if (group_leader) {
dc86cabe 8006 err = -EINVAL;
04289bb9 8007
04289bb9 8008 /*
ccff286d
IM
8009 * Do not allow a recursive hierarchy (this new sibling
8010 * becoming part of another group-sibling):
8011 */
8012 if (group_leader->group_leader != group_leader)
c3f00c70 8013 goto err_context;
34f43927
PZ
8014
8015 /* All events in a group should have the same clock */
8016 if (group_leader->clock != event->clock)
8017 goto err_context;
8018
ccff286d
IM
8019 /*
8020 * Do not allow to attach to a group in a different
8021 * task or CPU context:
04289bb9 8022 */
b04243ef 8023 if (move_group) {
c3c87e77
PZ
8024 /*
8025 * Make sure we're both on the same task, or both
8026 * per-cpu events.
8027 */
8028 if (group_leader->ctx->task != ctx->task)
8029 goto err_context;
8030
8031 /*
8032 * Make sure we're both events for the same CPU;
8033 * grouping events for different CPUs is broken; since
8034 * you can never concurrently schedule them anyhow.
8035 */
8036 if (group_leader->cpu != event->cpu)
b04243ef
PZ
8037 goto err_context;
8038 } else {
8039 if (group_leader->ctx != ctx)
8040 goto err_context;
8041 }
8042
3b6f9e5c
PM
8043 /*
8044 * Only a group leader can be exclusive or pinned
8045 */
0d48696f 8046 if (attr.exclusive || attr.pinned)
c3f00c70 8047 goto err_context;
ac9721f3
PZ
8048 }
8049
8050 if (output_event) {
8051 err = perf_event_set_output(event, output_event);
8052 if (err)
c3f00c70 8053 goto err_context;
ac9721f3 8054 }
0793a61d 8055
a21b0b35
YD
8056 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
8057 f_flags);
ea635c64
AV
8058 if (IS_ERR(event_file)) {
8059 err = PTR_ERR(event_file);
c3f00c70 8060 goto err_context;
ea635c64 8061 }
9b51f66d 8062
b04243ef 8063 if (move_group) {
f63a8daa
PZ
8064 gctx = group_leader->ctx;
8065
8066 /*
8067 * See perf_event_ctx_lock() for comments on the details
8068 * of swizzling perf_event::ctx.
8069 */
8070 mutex_lock_double(&gctx->mutex, &ctx->mutex);
b04243ef 8071
46ce0fe9 8072 perf_remove_from_context(group_leader, false);
0231bb53 8073
b04243ef
PZ
8074 list_for_each_entry(sibling, &group_leader->sibling_list,
8075 group_entry) {
46ce0fe9 8076 perf_remove_from_context(sibling, false);
b04243ef
PZ
8077 put_ctx(gctx);
8078 }
f63a8daa
PZ
8079 } else {
8080 mutex_lock(&ctx->mutex);
ea635c64 8081 }
9b51f66d 8082
ad3a37de 8083 WARN_ON_ONCE(ctx->parent_ctx);
b04243ef
PZ
8084
8085 if (move_group) {
f63a8daa
PZ
8086 /*
8087 * Wait for everybody to stop referencing the events through
8088 * the old lists, before installing them on the new lists.
8089 */
0cda4c02 8090 synchronize_rcu();
f63a8daa 8091
8f95b435
PZI
8092 /*
8093 * Install the group siblings before the group leader.
8094 *
8095 * Because a group leader will try to install the entire group
8096 * (through the sibling list, which is still intact), we can
8097 * end up with siblings installed in the wrong context.
8098 *
8099 * By installing siblings first we NO-OP because they're not
8100 * reachable through the group lists.
8101 */
b04243ef
PZ
8102 list_for_each_entry(sibling, &group_leader->sibling_list,
8103 group_entry) {
8f95b435 8104 perf_event__state_init(sibling);
9fc81d87 8105 perf_install_in_context(ctx, sibling, sibling->cpu);
b04243ef
PZ
8106 get_ctx(ctx);
8107 }
8f95b435
PZI
8108
8109 /*
8110 * Removing from the context ends up with a disabled
8111 * event. What we want here is an event in its initial
8112 * startup state, ready to be added into the new context.
8113 */
8114 perf_event__state_init(group_leader);
8115 perf_install_in_context(ctx, group_leader, group_leader->cpu);
8116 get_ctx(ctx);
b04243ef
PZ
8117 }
8118
bed5b25a
AS
8119 if (!exclusive_event_installable(event, ctx)) {
8120 err = -EBUSY;
8121 mutex_unlock(&ctx->mutex);
8122 fput(event_file);
8123 goto err_context;
8124 }
8125
e2d37cd2 8126 perf_install_in_context(ctx, event, event->cpu);
fe4b04fa 8127 perf_unpin_context(ctx);
f63a8daa
PZ
8128
8129 if (move_group) {
8130 mutex_unlock(&gctx->mutex);
8131 put_ctx(gctx);
8132 }
d859e29f 8133 mutex_unlock(&ctx->mutex);
9b51f66d 8134
fbfc623f
YZ
8135 put_online_cpus();
8136
cdd6c482 8137 event->owner = current;
8882135b 8138
cdd6c482
IM
8139 mutex_lock(&current->perf_event_mutex);
8140 list_add_tail(&event->owner_entry, &current->perf_event_list);
8141 mutex_unlock(&current->perf_event_mutex);
082ff5a2 8142
c320c7b7
ACM
8143 /*
8144 * Precalculate sample_data sizes
8145 */
8146 perf_event__header_size(event);
6844c09d 8147 perf_event__id_header_size(event);
c320c7b7 8148
8a49542c
PZ
8149 /*
8150 * Drop the reference on the group_event after placing the
8151 * new event on the sibling_list. This ensures destruction
8152 * of the group leader will find the pointer to itself in
8153 * perf_group_detach().
8154 */
2903ff01 8155 fdput(group);
ea635c64
AV
8156 fd_install(event_fd, event_file);
8157 return event_fd;
0793a61d 8158
c3f00c70 8159err_context:
fe4b04fa 8160 perf_unpin_context(ctx);
ea635c64 8161 put_ctx(ctx);
c6be5a5c 8162err_alloc:
ea635c64 8163 free_event(event);
1f4ee503 8164err_cpus:
fbfc623f 8165 put_online_cpus();
1f4ee503 8166err_task:
e7d0bc04
PZ
8167 if (task)
8168 put_task_struct(task);
89a1e187 8169err_group_fd:
2903ff01 8170 fdput(group);
ea635c64
AV
8171err_fd:
8172 put_unused_fd(event_fd);
dc86cabe 8173 return err;
0793a61d
TG
8174}
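/*
 * Illustrative sketch (not part of this file): a minimal userspace caller of
 * the syscall above.  There is no glibc wrapper, so the raw syscall number is
 * used; this counts instructions retired in the calling thread.
 */
#if 0
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
        struct perf_event_attr attr;
        long long count;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_HARDWARE;
        attr.config = PERF_COUNT_HW_INSTRUCTIONS;
        attr.disabled = 1;
        attr.exclude_kernel = 1;

        /* pid == 0, cpu == -1: this thread, on any CPU; no group leader. */
        fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
        if (fd < 0)
                return 1;

        ioctl(fd, PERF_EVENT_IOC_RESET, 0);
        ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
        /* ... workload to measure ... */
        ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

        if (read(fd, &count, sizeof(count)) == sizeof(count))
                printf("instructions: %lld\n", count);
        close(fd);
        return 0;
}
#endif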
8175
fb0459d7
AV
8176/**
8177 * perf_event_create_kernel_counter
8178 *
8179 * @attr: attributes of the counter to create
8180 * @cpu: cpu in which the counter is bound
38a81da2 8181 * @task: task to profile (NULL for percpu)
fb0459d7
AV
8182 */
8183struct perf_event *
8184perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
38a81da2 8185 struct task_struct *task,
4dc0da86
AK
8186 perf_overflow_handler_t overflow_handler,
8187 void *context)
fb0459d7 8188{
fb0459d7 8189 struct perf_event_context *ctx;
c3f00c70 8190 struct perf_event *event;
fb0459d7 8191 int err;
d859e29f 8192
fb0459d7
AV
8193 /*
8194 * Get the target context (task or percpu):
8195 */
d859e29f 8196
4dc0da86 8197 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
79dff51e 8198 overflow_handler, context, -1);
c3f00c70
PZ
8199 if (IS_ERR(event)) {
8200 err = PTR_ERR(event);
8201 goto err;
8202 }
d859e29f 8203
f8697762
JO
8204 /* Mark owner so we could distinguish it from user events. */
8205 event->owner = EVENT_OWNER_KERNEL;
8206
766d6c07
FW
8207 account_event(event);
8208
4af57ef2 8209 ctx = find_get_context(event->pmu, task, event);
c6567f64
FW
8210 if (IS_ERR(ctx)) {
8211 err = PTR_ERR(ctx);
c3f00c70 8212 goto err_free;
d859e29f 8213 }
fb0459d7 8214
fb0459d7
AV
8215 WARN_ON_ONCE(ctx->parent_ctx);
8216 mutex_lock(&ctx->mutex);
bed5b25a
AS
8217 if (!exclusive_event_installable(event, ctx)) {
8218 mutex_unlock(&ctx->mutex);
8219 perf_unpin_context(ctx);
8220 put_ctx(ctx);
8221 err = -EBUSY;
8222 goto err_free;
8223 }
8224
fb0459d7 8225 perf_install_in_context(ctx, event, cpu);
fe4b04fa 8226 perf_unpin_context(ctx);
fb0459d7
AV
8227 mutex_unlock(&ctx->mutex);
8228
fb0459d7
AV
8229 return event;
8230
c3f00c70
PZ
8231err_free:
8232 free_event(event);
8233err:
c6567f64 8234 return ERR_PTR(err);
9b51f66d 8235}
fb0459d7 8236EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
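/*
 * Illustrative sketch (not part of this file): an in-kernel user of the
 * interface above, in the style of the hard-lockup watchdog.  It creates a
 * pinned per-cpu cycle counter that invokes my_overflow() every 10^9 cycles;
 * my_overflow(), my_event, my_setup() and my_teardown() are hypothetical.
 */
#if 0
#include <linux/err.h>
#include <linux/smp.h>
#include <linux/printk.h>
#include <linux/perf_event.h>

static struct perf_event *my_event;

static void my_overflow(struct perf_event *event,
                        struct perf_sample_data *data, struct pt_regs *regs)
{
        pr_info("cycle counter overflowed on CPU %d\n", smp_processor_id());
}

static int my_setup(int cpu)
{
        static struct perf_event_attr attr = {
                .type           = PERF_TYPE_HARDWARE,
                .config         = PERF_COUNT_HW_CPU_CYCLES,
                .size           = sizeof(struct perf_event_attr),
                .sample_period  = 1000000000ULL,
                .pinned         = 1,
                .disabled       = 1,
        };

        my_event = perf_event_create_kernel_counter(&attr, cpu, NULL,
                                                     my_overflow, NULL);
        if (IS_ERR(my_event))
                return PTR_ERR(my_event);

        perf_event_enable(my_event);
        return 0;
}

static void my_teardown(void)
{
        perf_event_release_kernel(my_event);
}
#endif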
9b51f66d 8237
0cda4c02
YZ
8238void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
8239{
8240 struct perf_event_context *src_ctx;
8241 struct perf_event_context *dst_ctx;
8242 struct perf_event *event, *tmp;
8243 LIST_HEAD(events);
8244
8245 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
8246 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
8247
f63a8daa
PZ
8248 /*
8249 * See perf_event_ctx_lock() for comments on the details
8250 * of swizzling perf_event::ctx.
8251 */
8252 mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
0cda4c02
YZ
8253 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
8254 event_entry) {
46ce0fe9 8255 perf_remove_from_context(event, false);
9a545de0 8256 unaccount_event_cpu(event, src_cpu);
0cda4c02 8257 put_ctx(src_ctx);
9886167d 8258 list_add(&event->migrate_entry, &events);
0cda4c02 8259 }
0cda4c02 8260
8f95b435
PZI
8261 /*
8262 * Wait for the events to quiesce before re-instating them.
8263 */
0cda4c02
YZ
8264 synchronize_rcu();
8265
8f95b435
PZI
8266 /*
8267 * Re-instate events in 2 passes.
8268 *
8269 * Skip over group leaders and only install siblings on this first
8270 * pass, siblings will not get enabled without a leader, however a
8271 * leader will enable its siblings, even if those are still on the old
8272 * context.
8273 */
8274 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
8275 if (event->group_leader == event)
8276 continue;
8277
8278 list_del(&event->migrate_entry);
8279 if (event->state >= PERF_EVENT_STATE_OFF)
8280 event->state = PERF_EVENT_STATE_INACTIVE;
8281 account_event_cpu(event, dst_cpu);
8282 perf_install_in_context(dst_ctx, event, dst_cpu);
8283 get_ctx(dst_ctx);
8284 }
8285
8286 /*
8287 * Once all the siblings are setup properly, install the group leaders
8288 * to make it go.
8289 */
9886167d
PZ
8290 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
8291 list_del(&event->migrate_entry);
0cda4c02
YZ
8292 if (event->state >= PERF_EVENT_STATE_OFF)
8293 event->state = PERF_EVENT_STATE_INACTIVE;
9a545de0 8294 account_event_cpu(event, dst_cpu);
0cda4c02
YZ
8295 perf_install_in_context(dst_ctx, event, dst_cpu);
8296 get_ctx(dst_ctx);
8297 }
8298 mutex_unlock(&dst_ctx->mutex);
f63a8daa 8299 mutex_unlock(&src_ctx->mutex);
0cda4c02
YZ
8300}
8301EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
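/*
 * Illustrative sketch (not part of this file): how an uncore-style driver,
 * whose events describe a package rather than a single CPU, would use the
 * helper above from its CPU-hotplug teardown path.  my_pmu and
 * my_cpu_going_down() are hypothetical; a real driver would also restrict the
 * target to a CPU in the same package.
 */
#if 0
#include <linux/cpumask.h>
#include <linux/perf_event.h>

static void my_cpu_going_down(struct pmu *my_pmu, int dying_cpu)
{
        int target = cpumask_any_but(cpu_online_mask, dying_cpu);

        if (target >= nr_cpu_ids)
                return;         /* no other online CPU, nothing to migrate to */

        perf_pmu_migrate_context(my_pmu, dying_cpu, target);
}
#endif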
8302
cdd6c482 8303static void sync_child_event(struct perf_event *child_event,
38b200d6 8304 struct task_struct *child)
d859e29f 8305{
cdd6c482 8306 struct perf_event *parent_event = child_event->parent;
8bc20959 8307 u64 child_val;
d859e29f 8308
cdd6c482
IM
8309 if (child_event->attr.inherit_stat)
8310 perf_event_read_event(child_event, child);
38b200d6 8311
b5e58793 8312 child_val = perf_event_count(child_event);
d859e29f
PM
8313
8314 /*
8315 * Add back the child's count to the parent's count:
8316 */
a6e6dea6 8317 atomic64_add(child_val, &parent_event->child_count);
cdd6c482
IM
8318 atomic64_add(child_event->total_time_enabled,
8319 &parent_event->child_total_time_enabled);
8320 atomic64_add(child_event->total_time_running,
8321 &parent_event->child_total_time_running);
d859e29f
PM
8322
8323 /*
cdd6c482 8324 * Remove this event from the parent's list
d859e29f 8325 */
cdd6c482
IM
8326 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
8327 mutex_lock(&parent_event->child_mutex);
8328 list_del_init(&child_event->child_list);
8329 mutex_unlock(&parent_event->child_mutex);
d859e29f 8330
dc633982
JO
8331 /*
8332 * Make sure the user/parent gets notified that we just
8333 * lost one event.
8334 */
8335 perf_event_wakeup(parent_event);
8336
d859e29f 8337 /*
cdd6c482 8338 * Release the parent event, if this was the last
d859e29f
PM
8339 * reference to it.
8340 */
a6fa941d 8341 put_event(parent_event);
d859e29f
PM
8342}
8343
9b51f66d 8344static void
cdd6c482
IM
8345__perf_event_exit_task(struct perf_event *child_event,
8346 struct perf_event_context *child_ctx,
38b200d6 8347 struct task_struct *child)
9b51f66d 8348{
1903d50c
PZ
8349 /*
8350 * Do not destroy the 'original' grouping; because of the context
8351 * switch optimization the original events could've ended up in a
8352 * random child task.
8353 *
8354 * If we were to destroy the original group, all group related
8355 * operations would cease to function properly after this random
8356 * child dies.
8357 *
8358 * Do destroy all inherited groups, we don't care about those
8359 * and being thorough is better.
8360 */
8361 perf_remove_from_context(child_event, !!child_event->parent);
0cc0c027 8362
9b51f66d 8363 /*
38b435b1 8364 * It can happen that the parent exits first, and has events
9b51f66d 8365 * that are still around due to the child reference. These
38b435b1 8366 * events need to be zapped.
9b51f66d 8367 */
38b435b1 8368 if (child_event->parent) {
cdd6c482
IM
8369 sync_child_event(child_event, child);
8370 free_event(child_event);
179033b3
JO
8371 } else {
8372 child_event->state = PERF_EVENT_STATE_EXIT;
8373 perf_event_wakeup(child_event);
4bcf349a 8374 }
9b51f66d
IM
8375}
8376
8dc85d54 8377static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
9b51f66d 8378{
ebf905fc 8379 struct perf_event *child_event, *next;
211de6eb 8380 struct perf_event_context *child_ctx, *clone_ctx = NULL;
a63eaf34 8381 unsigned long flags;
9b51f66d 8382
8dc85d54 8383 if (likely(!child->perf_event_ctxp[ctxn])) {
cdd6c482 8384 perf_event_task(child, NULL, 0);
9b51f66d 8385 return;
9f498cc5 8386 }
9b51f66d 8387
a63eaf34 8388 local_irq_save(flags);
ad3a37de
PM
8389 /*
8390 * We can't reschedule here because interrupts are disabled,
8391 * and either child is current or it is a task that can't be
8392 * scheduled, so we are now safe from rescheduling changing
8393 * our context.
8394 */
806839b2 8395 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
c93f7669
PM
8396
8397 /*
8398 * Take the context lock here so that if find_get_context is
cdd6c482 8399 * reading child->perf_event_ctxp, we wait until it has
c93f7669
PM
8400 * incremented the context's refcount before we do put_ctx below.
8401 */
e625cce1 8402 raw_spin_lock(&child_ctx->lock);
04dc2dbb 8403 task_ctx_sched_out(child_ctx);
8dc85d54 8404 child->perf_event_ctxp[ctxn] = NULL;
4a1c0f26 8405
71a851b4
PZ
8406 /*
8407 * If this context is a clone, unclone it so it can't get
8408 * swapped to another process while we're removing all
cdd6c482 8409 * the events from it.
71a851b4 8410 */
211de6eb 8411 clone_ctx = unclone_ctx(child_ctx);
5e942bb3 8412 update_context_time(child_ctx);
e625cce1 8413 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
9f498cc5 8414
211de6eb
PZ
8415 if (clone_ctx)
8416 put_ctx(clone_ctx);
4a1c0f26 8417
9f498cc5 8418 /*
cdd6c482
IM
8419 * Report the task dead after unscheduling the events so that we
8420 * won't get any samples after PERF_RECORD_EXIT. We can however still
8421 * get a few PERF_RECORD_READ events.
9f498cc5 8422 */
cdd6c482 8423 perf_event_task(child, child_ctx, 0);
a63eaf34 8424
66fff224
PZ
8425 /*
8426 * We can recurse on the same lock type through:
8427 *
cdd6c482
IM
8428 * __perf_event_exit_task()
8429 * sync_child_event()
a6fa941d
AV
8430 * put_event()
8431 * mutex_lock(&ctx->mutex)
66fff224
PZ
8432 *
8433 * But since it's the parent context it won't be the same instance.
8434 */
a0507c84 8435 mutex_lock(&child_ctx->mutex);
a63eaf34 8436
ebf905fc 8437 list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
cdd6c482 8438 __perf_event_exit_task(child_event, child_ctx, child);
8bc20959 8439
a63eaf34
PM
8440 mutex_unlock(&child_ctx->mutex);
8441
8442 put_ctx(child_ctx);
9b51f66d
IM
8443}
8444
8dc85d54
PZ
8445/*
8446 * When a child task exits, feed back event values to parent events.
8447 */
8448void perf_event_exit_task(struct task_struct *child)
8449{
8882135b 8450 struct perf_event *event, *tmp;
8dc85d54
PZ
8451 int ctxn;
8452
8882135b
PZ
8453 mutex_lock(&child->perf_event_mutex);
8454 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
8455 owner_entry) {
8456 list_del_init(&event->owner_entry);
8457
8458 /*
8459 * Ensure the list deletion is visible before we clear
8460 * the owner; this closes a race against perf_release() where
8461 * we need to serialize on the owner->perf_event_mutex.
8462 */
8463 smp_wmb();
8464 event->owner = NULL;
8465 }
8466 mutex_unlock(&child->perf_event_mutex);
8467
8dc85d54
PZ
8468 for_each_task_context_nr(ctxn)
8469 perf_event_exit_task_context(child, ctxn);
8470}
8471
889ff015
FW
8472static void perf_free_event(struct perf_event *event,
8473 struct perf_event_context *ctx)
8474{
8475 struct perf_event *parent = event->parent;
8476
8477 if (WARN_ON_ONCE(!parent))
8478 return;
8479
8480 mutex_lock(&parent->child_mutex);
8481 list_del_init(&event->child_list);
8482 mutex_unlock(&parent->child_mutex);
8483
a6fa941d 8484 put_event(parent);
889ff015 8485
652884fe 8486 raw_spin_lock_irq(&ctx->lock);
8a49542c 8487 perf_group_detach(event);
889ff015 8488 list_del_event(event, ctx);
652884fe 8489 raw_spin_unlock_irq(&ctx->lock);
889ff015
FW
8490 free_event(event);
8491}
8492
bbbee908 8493/*
652884fe 8494 * Free an unexposed, unused context as created by inheritance by
8dc85d54 8495 * perf_event_init_task below, used by fork() in case of failure.
652884fe
PZ
8496 *
8497 * Not all locks are strictly required, but take them anyway to be nice and
8498 * help out with the lockdep assertions.
bbbee908 8499 */
cdd6c482 8500void perf_event_free_task(struct task_struct *task)
bbbee908 8501{
8dc85d54 8502 struct perf_event_context *ctx;
cdd6c482 8503 struct perf_event *event, *tmp;
8dc85d54 8504 int ctxn;
bbbee908 8505
8dc85d54
PZ
8506 for_each_task_context_nr(ctxn) {
8507 ctx = task->perf_event_ctxp[ctxn];
8508 if (!ctx)
8509 continue;
bbbee908 8510
8dc85d54 8511 mutex_lock(&ctx->mutex);
bbbee908 8512again:
8dc85d54
PZ
8513 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
8514 group_entry)
8515 perf_free_event(event, ctx);
bbbee908 8516
8dc85d54
PZ
8517 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
8518 group_entry)
8519 perf_free_event(event, ctx);
bbbee908 8520
8dc85d54
PZ
8521 if (!list_empty(&ctx->pinned_groups) ||
8522 !list_empty(&ctx->flexible_groups))
8523 goto again;
bbbee908 8524
8dc85d54 8525 mutex_unlock(&ctx->mutex);
bbbee908 8526
8dc85d54
PZ
8527 put_ctx(ctx);
8528 }
889ff015
FW
8529}
8530
4e231c79
PZ
8531void perf_event_delayed_put(struct task_struct *task)
8532{
8533 int ctxn;
8534
8535 for_each_task_context_nr(ctxn)
8536 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
8537}
8538
97dee4f3
PZ
8539/*
8540 * inherit an event from parent task to child task:
8541 */
8542static struct perf_event *
8543inherit_event(struct perf_event *parent_event,
8544 struct task_struct *parent,
8545 struct perf_event_context *parent_ctx,
8546 struct task_struct *child,
8547 struct perf_event *group_leader,
8548 struct perf_event_context *child_ctx)
8549{
1929def9 8550 enum perf_event_active_state parent_state = parent_event->state;
97dee4f3 8551 struct perf_event *child_event;
cee010ec 8552 unsigned long flags;
97dee4f3
PZ
8553
8554 /*
8555 * Instead of creating recursive hierarchies of events,
8556 * we link inherited events back to the original parent,
8557 * which has a filp for sure, which we use as the reference
8558 * count:
8559 */
8560 if (parent_event->parent)
8561 parent_event = parent_event->parent;
8562
8563 child_event = perf_event_alloc(&parent_event->attr,
8564 parent_event->cpu,
d580ff86 8565 child,
97dee4f3 8566 group_leader, parent_event,
79dff51e 8567 NULL, NULL, -1);
97dee4f3
PZ
8568 if (IS_ERR(child_event))
8569 return child_event;
a6fa941d 8570
fadfe7be
JO
8571 if (is_orphaned_event(parent_event) ||
8572 !atomic_long_inc_not_zero(&parent_event->refcount)) {
a6fa941d
AV
8573 free_event(child_event);
8574 return NULL;
8575 }
8576
97dee4f3
PZ
8577 get_ctx(child_ctx);
8578
8579 /*
8580 * Make the child state follow the state of the parent event,
8581 * not its attr.disabled bit. We hold the parent's mutex,
8582 * so we won't race with perf_event_{en, dis}able_family.
8583 */
1929def9 8584 if (parent_state >= PERF_EVENT_STATE_INACTIVE)
97dee4f3
PZ
8585 child_event->state = PERF_EVENT_STATE_INACTIVE;
8586 else
8587 child_event->state = PERF_EVENT_STATE_OFF;
8588
8589 if (parent_event->attr.freq) {
8590 u64 sample_period = parent_event->hw.sample_period;
8591 struct hw_perf_event *hwc = &child_event->hw;
8592
8593 hwc->sample_period = sample_period;
8594 hwc->last_period = sample_period;
8595
8596 local64_set(&hwc->period_left, sample_period);
8597 }
8598
8599 child_event->ctx = child_ctx;
8600 child_event->overflow_handler = parent_event->overflow_handler;
4dc0da86
AK
8601 child_event->overflow_handler_context
8602 = parent_event->overflow_handler_context;
97dee4f3 8603
614b6780
TG
8604 /*
8605 * Precalculate sample_data sizes
8606 */
8607 perf_event__header_size(child_event);
6844c09d 8608 perf_event__id_header_size(child_event);
614b6780 8609
97dee4f3
PZ
8610 /*
8611 * Link it up in the child's context:
8612 */
cee010ec 8613 raw_spin_lock_irqsave(&child_ctx->lock, flags);
97dee4f3 8614 add_event_to_ctx(child_event, child_ctx);
cee010ec 8615 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
97dee4f3 8616
97dee4f3
PZ
8617 /*
8618 * Link this into the parent event's child list
8619 */
8620 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
8621 mutex_lock(&parent_event->child_mutex);
8622 list_add_tail(&child_event->child_list, &parent_event->child_list);
8623 mutex_unlock(&parent_event->child_mutex);
8624
8625 return child_event;
8626}
8627
8628static int inherit_group(struct perf_event *parent_event,
8629 struct task_struct *parent,
8630 struct perf_event_context *parent_ctx,
8631 struct task_struct *child,
8632 struct perf_event_context *child_ctx)
8633{
8634 struct perf_event *leader;
8635 struct perf_event *sub;
8636 struct perf_event *child_ctr;
8637
8638 leader = inherit_event(parent_event, parent, parent_ctx,
8639 child, NULL, child_ctx);
8640 if (IS_ERR(leader))
8641 return PTR_ERR(leader);
8642 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
8643 child_ctr = inherit_event(sub, parent, parent_ctx,
8644 child, leader, child_ctx);
8645 if (IS_ERR(child_ctr))
8646 return PTR_ERR(child_ctr);
8647 }
8648 return 0;
889ff015
FW
8649}
8650
8651static int
8652inherit_task_group(struct perf_event *event, struct task_struct *parent,
8653 struct perf_event_context *parent_ctx,
8dc85d54 8654 struct task_struct *child, int ctxn,
889ff015
FW
8655 int *inherited_all)
8656{
8657 int ret;
8dc85d54 8658 struct perf_event_context *child_ctx;
889ff015
FW
8659
8660 if (!event->attr.inherit) {
8661 *inherited_all = 0;
8662 return 0;
bbbee908
PZ
8663 }
8664
fe4b04fa 8665 child_ctx = child->perf_event_ctxp[ctxn];
889ff015
FW
8666 if (!child_ctx) {
8667 /*
8668 * This is executed from the parent task context, so
8669 * inherit events that have been marked for cloning.
8670 * First allocate and initialize a context for the
8671 * child.
8672 */
bbbee908 8673
734df5ab 8674 child_ctx = alloc_perf_context(parent_ctx->pmu, child);
889ff015
FW
8675 if (!child_ctx)
8676 return -ENOMEM;
bbbee908 8677
8dc85d54 8678 child->perf_event_ctxp[ctxn] = child_ctx;
889ff015
FW
8679 }
8680
8681 ret = inherit_group(event, parent, parent_ctx,
8682 child, child_ctx);
8683
8684 if (ret)
8685 *inherited_all = 0;
8686
8687 return ret;
bbbee908
PZ
8688}
8689
9b51f66d 8690/*
cdd6c482 8691 * Initialize the perf_event context in task_struct
9b51f66d 8692 */
985c8dcb 8693static int perf_event_init_context(struct task_struct *child, int ctxn)
9b51f66d 8694{
889ff015 8695 struct perf_event_context *child_ctx, *parent_ctx;
cdd6c482
IM
8696 struct perf_event_context *cloned_ctx;
8697 struct perf_event *event;
9b51f66d 8698 struct task_struct *parent = current;
564c2b21 8699 int inherited_all = 1;
dddd3379 8700 unsigned long flags;
6ab423e0 8701 int ret = 0;
9b51f66d 8702
8dc85d54 8703 if (likely(!parent->perf_event_ctxp[ctxn]))
6ab423e0
PZ
8704 return 0;
8705
ad3a37de 8706 /*
25346b93
PM
8707 * If the parent's context is a clone, pin it so it won't get
8708 * swapped under us.
ad3a37de 8709 */
8dc85d54 8710 parent_ctx = perf_pin_task_context(parent, ctxn);
ffb4ef21
PZ
8711 if (!parent_ctx)
8712 return 0;
25346b93 8713
ad3a37de
PM
8714 /*
8715 * No need to check if parent_ctx != NULL here; since we saw
8716 * it non-NULL earlier, the only reason for it to become NULL
8717 * is if we exit, and since we're currently in the middle of
8718 * a fork we can't be exiting at the same time.
8719 */
ad3a37de 8720
9b51f66d
IM
8721 /*
8722 * Lock the parent list. No need to lock the child - not PID
8723 * hashed yet and not running, so nobody can access it.
8724 */
d859e29f 8725 mutex_lock(&parent_ctx->mutex);
9b51f66d
IM
8726
8727 /*
8728 * We don't have to disable NMIs - we are only looking at
8729 * the list, not manipulating it:
8730 */
889ff015 8731 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
8dc85d54
PZ
8732 ret = inherit_task_group(event, parent, parent_ctx,
8733 child, ctxn, &inherited_all);
889ff015
FW
8734 if (ret)
8735 break;
8736 }
b93f7978 8737
dddd3379
TG
8738 /*
8739 * We can't hold ctx->lock when iterating the ->flexible_groups list due
8740 * to allocations, but we need to prevent rotation because
8741 * rotate_ctx() will change the list from interrupt context.
8742 */
8743 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
8744 parent_ctx->rotate_disable = 1;
8745 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
8746
889ff015 8747 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
8dc85d54
PZ
8748 ret = inherit_task_group(event, parent, parent_ctx,
8749 child, ctxn, &inherited_all);
889ff015 8750 if (ret)
9b51f66d 8751 break;
564c2b21
PM
8752 }
8753
dddd3379
TG
8754 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
8755 parent_ctx->rotate_disable = 0;
dddd3379 8756
8dc85d54 8757 child_ctx = child->perf_event_ctxp[ctxn];
889ff015 8758
05cbaa28 8759 if (child_ctx && inherited_all) {
564c2b21
PM
8760 /*
8761 * Mark the child context as a clone of the parent
8762 * context, or of whatever the parent is a clone of.
c5ed5145
PZ
8763 *
8764 * Note that if the parent is a clone, the holding of
8765 * parent_ctx->lock avoids it from being uncloned.
564c2b21 8766 */
c5ed5145 8767 cloned_ctx = parent_ctx->parent_ctx;
ad3a37de
PM
8768 if (cloned_ctx) {
8769 child_ctx->parent_ctx = cloned_ctx;
25346b93 8770 child_ctx->parent_gen = parent_ctx->parent_gen;
564c2b21
PM
8771 } else {
8772 child_ctx->parent_ctx = parent_ctx;
8773 child_ctx->parent_gen = parent_ctx->generation;
8774 }
8775 get_ctx(child_ctx->parent_ctx);
9b51f66d
IM
8776 }
8777
c5ed5145 8778 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
d859e29f 8779 mutex_unlock(&parent_ctx->mutex);
6ab423e0 8780
25346b93 8781 perf_unpin_context(parent_ctx);
fe4b04fa 8782 put_ctx(parent_ctx);
ad3a37de 8783
6ab423e0 8784 return ret;
9b51f66d
IM
8785}
8786
8dc85d54
PZ
8787/*
8788 * Initialize the perf_event context in task_struct
8789 */
8790int perf_event_init_task(struct task_struct *child)
8791{
8792 int ctxn, ret;
8793
8550d7cb
ON
8794 memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
8795 mutex_init(&child->perf_event_mutex);
8796 INIT_LIST_HEAD(&child->perf_event_list);
8797
8dc85d54
PZ
8798 for_each_task_context_nr(ctxn) {
8799 ret = perf_event_init_context(child, ctxn);
6c72e350
PZ
8800 if (ret) {
8801 perf_event_free_task(child);
8dc85d54 8802 return ret;
6c72e350 8803 }
8dc85d54
PZ
8804 }
8805
8806 return 0;
8807}
8808
220b140b
PM
8809static void __init perf_event_init_all_cpus(void)
8810{
b28ab83c 8811 struct swevent_htable *swhash;
220b140b 8812 int cpu;
220b140b
PM
8813
8814 for_each_possible_cpu(cpu) {
b28ab83c
PZ
8815 swhash = &per_cpu(swevent_htable, cpu);
8816 mutex_init(&swhash->hlist_mutex);
2fde4f94 8817 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
220b140b
PM
8818 }
8819}
8820
0db0628d 8821static void perf_event_init_cpu(int cpu)
0793a61d 8822{
108b02cf 8823 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
0793a61d 8824
b28ab83c 8825 mutex_lock(&swhash->hlist_mutex);
39af6b16 8826 swhash->online = true;
4536e4d1 8827 if (swhash->hlist_refcount > 0) {
76e1d904
FW
8828 struct swevent_hlist *hlist;
8829
b28ab83c
PZ
8830 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
8831 WARN_ON(!hlist);
8832 rcu_assign_pointer(swhash->swevent_hlist, hlist);
76e1d904 8833 }
b28ab83c 8834 mutex_unlock(&swhash->hlist_mutex);
0793a61d
TG
8835}
8836
c277443c 8837#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
108b02cf 8838static void __perf_event_exit_context(void *__info)
0793a61d 8839{
226424ee 8840 struct remove_event re = { .detach_group = true };
108b02cf 8841 struct perf_event_context *ctx = __info;
0793a61d 8842
e3703f8c 8843 rcu_read_lock();
46ce0fe9
PZ
8844 list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
8845 __perf_remove_from_context(&re);
e3703f8c 8846 rcu_read_unlock();
0793a61d 8847}
108b02cf
PZ
8848
8849static void perf_event_exit_cpu_context(int cpu)
8850{
8851 struct perf_event_context *ctx;
8852 struct pmu *pmu;
8853 int idx;
8854
8855 idx = srcu_read_lock(&pmus_srcu);
8856 list_for_each_entry_rcu(pmu, &pmus, entry) {
917bdd1c 8857 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
108b02cf
PZ
8858
8859 mutex_lock(&ctx->mutex);
8860 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
8861 mutex_unlock(&ctx->mutex);
8862 }
8863 srcu_read_unlock(&pmus_srcu, idx);
108b02cf
PZ
8864}
8865
cdd6c482 8866static void perf_event_exit_cpu(int cpu)
0793a61d 8867{
b28ab83c 8868 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
d859e29f 8869
e3703f8c
PZ
8870 perf_event_exit_cpu_context(cpu);
8871
b28ab83c 8872 mutex_lock(&swhash->hlist_mutex);
39af6b16 8873 swhash->online = false;
b28ab83c
PZ
8874 swevent_hlist_release(swhash);
8875 mutex_unlock(&swhash->hlist_mutex);
0793a61d
TG
8876}
8877#else
cdd6c482 8878static inline void perf_event_exit_cpu(int cpu) { }
0793a61d
TG
8879#endif
8880
c277443c
PZ
8881static int
8882perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
8883{
8884 int cpu;
8885
8886 for_each_online_cpu(cpu)
8887 perf_event_exit_cpu(cpu);
8888
8889 return NOTIFY_OK;
8890}
8891
8892/*
8893 * Run the perf reboot notifier at the very last possible moment so that
8894 * the generic watchdog code runs as long as possible.
8895 */
8896static struct notifier_block perf_reboot_notifier = {
8897 .notifier_call = perf_reboot,
8898 .priority = INT_MIN,
8899};
8900
0db0628d 8901static int
0793a61d
TG
8902perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
8903{
8904 unsigned int cpu = (long)hcpu;
8905
4536e4d1 8906 switch (action & ~CPU_TASKS_FROZEN) {
0793a61d
TG
8907
8908 case CPU_UP_PREPARE:
5e11637e 8909 case CPU_DOWN_FAILED:
cdd6c482 8910 perf_event_init_cpu(cpu);
0793a61d
TG
8911 break;
8912
5e11637e 8913 case CPU_UP_CANCELED:
0793a61d 8914 case CPU_DOWN_PREPARE:
cdd6c482 8915 perf_event_exit_cpu(cpu);
0793a61d 8916 break;
0793a61d
TG
8917 default:
8918 break;
8919 }
8920
8921 return NOTIFY_OK;
8922}
8923
cdd6c482 8924void __init perf_event_init(void)
0793a61d 8925{
3c502e7a
JW
8926 int ret;
8927
2e80a82a
PZ
8928 idr_init(&pmu_idr);
8929
220b140b 8930 perf_event_init_all_cpus();
b0a873eb 8931 init_srcu_struct(&pmus_srcu);
2e80a82a
PZ
8932 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
8933 perf_pmu_register(&perf_cpu_clock, NULL, -1);
8934 perf_pmu_register(&perf_task_clock, NULL, -1);
b0a873eb
PZ
8935 perf_tp_register();
8936 perf_cpu_notifier(perf_cpu_notify);
c277443c 8937 register_reboot_notifier(&perf_reboot_notifier);
3c502e7a
JW
8938
8939 ret = init_hw_breakpoint();
8940 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
b2029520
GN
8941
8942 /* do not patch jump label more than once per second */
8943 jump_label_rate_limit(&perf_sched_events, HZ);
b01c3a00
JO
8944
8945 /*
8946 * Build time assertion that we keep the data_head at the intended
8947 * location. IOW, validation we got the __reserved[] size right.
8948 */
8949 BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
8950 != 1024);
0793a61d 8951}
abe43400 8952
fd979c01
CS
8953ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
8954 char *page)
8955{
8956 struct perf_pmu_events_attr *pmu_attr =
8957 container_of(attr, struct perf_pmu_events_attr, attr);
8958
8959 if (pmu_attr->event_str)
8960 return sprintf(page, "%s\n", pmu_attr->event_str);
8961
8962 return 0;
8963}
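/*
 * Illustrative sketch (not part of this file): how a PMU driver publishes a
 * named event through the show routine above, so that it appears under
 * /sys/bus/event_source/devices/<pmu>/events/.  The "cycles" name and the
 * "event=0x3c" encoding are made up for the example; the group would be
 * listed in the driver's pmu->attr_groups, which pmu_dev_alloc() above hands
 * to the device core.
 */
#if 0
#include <linux/sysfs.h>
#include <linux/perf_event.h>

static struct perf_pmu_events_attr my_cycles_attr = {
        .attr           = __ATTR(cycles, 0444, perf_event_sysfs_show, NULL),
        .event_str      = "event=0x3c",
};

static struct attribute *my_events_attrs[] = {
        &my_cycles_attr.attr.attr,
        NULL,
};

static struct attribute_group my_events_group = {
        .name   = "events",
        .attrs  = my_events_attrs,
};
#endif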
8964
abe43400
PZ
8965static int __init perf_event_sysfs_init(void)
8966{
8967 struct pmu *pmu;
8968 int ret;
8969
8970 mutex_lock(&pmus_lock);
8971
8972 ret = bus_register(&pmu_bus);
8973 if (ret)
8974 goto unlock;
8975
8976 list_for_each_entry(pmu, &pmus, entry) {
8977 if (!pmu->name || pmu->type < 0)
8978 continue;
8979
8980 ret = pmu_dev_alloc(pmu);
8981 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
8982 }
8983 pmu_bus_running = 1;
8984 ret = 0;
8985
8986unlock:
8987 mutex_unlock(&pmus_lock);
8988
8989 return ret;
8990}
8991device_initcall(perf_event_sysfs_init);
e5d1367f
SE
8992
8993#ifdef CONFIG_CGROUP_PERF
eb95419b
TH
8994static struct cgroup_subsys_state *
8995perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
e5d1367f
SE
8996{
8997 struct perf_cgroup *jc;
e5d1367f 8998
1b15d055 8999 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
e5d1367f
SE
9000 if (!jc)
9001 return ERR_PTR(-ENOMEM);
9002
e5d1367f
SE
9003 jc->info = alloc_percpu(struct perf_cgroup_info);
9004 if (!jc->info) {
9005 kfree(jc);
9006 return ERR_PTR(-ENOMEM);
9007 }
9008
e5d1367f
SE
9009 return &jc->css;
9010}
9011
eb95419b 9012static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
e5d1367f 9013{
eb95419b
TH
9014 struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
9015
e5d1367f
SE
9016 free_percpu(jc->info);
9017 kfree(jc);
9018}
9019
9020static int __perf_cgroup_move(void *info)
9021{
9022 struct task_struct *task = info;
9023 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
9024 return 0;
9025}
9026
eb95419b
TH
9027static void perf_cgroup_attach(struct cgroup_subsys_state *css,
9028 struct cgroup_taskset *tset)
e5d1367f 9029{
bb9d97b6
TH
9030 struct task_struct *task;
9031
924f0d9a 9032 cgroup_taskset_for_each(task, tset)
bb9d97b6 9033 task_function_call(task, __perf_cgroup_move, task);
e5d1367f
SE
9034}
9035
eb95419b
TH
9036static void perf_cgroup_exit(struct cgroup_subsys_state *css,
9037 struct cgroup_subsys_state *old_css,
761b3ef5 9038 struct task_struct *task)
e5d1367f
SE
9039{
9040 /*
9041 * cgroup_exit() is called in the copy_process() failure path.
9042 * Ignore this case since the task hasn't run yet; this avoids
9043 * trying to poke a half-freed task state from generic code.
9044 */
9045 if (!(task->flags & PF_EXITING))
9046 return;
9047
bb9d97b6 9048 task_function_call(task, __perf_cgroup_move, task);
e5d1367f
SE
9049}
9050
073219e9 9051struct cgroup_subsys perf_event_cgrp_subsys = {
92fb9748
TH
9052 .css_alloc = perf_cgroup_css_alloc,
9053 .css_free = perf_cgroup_css_free,
e7e7ee2e 9054 .exit = perf_cgroup_exit,
bb9d97b6 9055 .attach = perf_cgroup_attach,
e5d1367f
SE
9056};
9057#endif /* CONFIG_CGROUP_PERF */