ALSA: drivers: pcsp: Fix printout of resolution
[linux-2.6-block.git] / kernel / events / core.c
CommitLineData
0793a61d 1/*
57c0c15b 2 * Performance events core code:
0793a61d 3 *
98144511 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
e7e7ee2e
IM
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
d36b6910 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
7b732a75 8 *
57c0c15b 9 * For licensing details see kernel-base/COPYING
0793a61d
TG
10 */
11
12#include <linux/fs.h>
b9cacc7b 13#include <linux/mm.h>
0793a61d
TG
14#include <linux/cpu.h>
15#include <linux/smp.h>
2e80a82a 16#include <linux/idr.h>
04289bb9 17#include <linux/file.h>
0793a61d 18#include <linux/poll.h>
5a0e3ad6 19#include <linux/slab.h>
76e1d904 20#include <linux/hash.h>
12351ef8 21#include <linux/tick.h>
0793a61d 22#include <linux/sysfs.h>
22a4f650 23#include <linux/dcache.h>
0793a61d 24#include <linux/percpu.h>
22a4f650 25#include <linux/ptrace.h>
c277443c 26#include <linux/reboot.h>
b9cacc7b 27#include <linux/vmstat.h>
abe43400 28#include <linux/device.h>
6e5fdeed 29#include <linux/export.h>
906010b2 30#include <linux/vmalloc.h>
b9cacc7b
PZ
31#include <linux/hardirq.h>
32#include <linux/rculist.h>
0793a61d
TG
33#include <linux/uaccess.h>
34#include <linux/syscalls.h>
35#include <linux/anon_inodes.h>
aa9c4c0f 36#include <linux/kernel_stat.h>
39bed6cb 37#include <linux/cgroup.h>
cdd6c482 38#include <linux/perf_event.h>
6fb2915d 39#include <linux/ftrace_event.h>
3c502e7a 40#include <linux/hw_breakpoint.h>
c5ebcedb 41#include <linux/mm_types.h>
c464c76e 42#include <linux/module.h>
f972eb63 43#include <linux/mman.h>
b3f20785 44#include <linux/compat.h>
2541517c
AS
45#include <linux/bpf.h>
46#include <linux/filter.h>
0793a61d 47
76369139
FW
48#include "internal.h"
49
4e193bd4
TB
50#include <asm/irq_regs.h>
51
fadfe7be
JO
52static struct workqueue_struct *perf_wq;
53
272325c4
PZ
54typedef int (*remote_function_f)(void *);
55
fe4b04fa 56struct remote_function_call {
e7e7ee2e 57 struct task_struct *p;
272325c4 58 remote_function_f func;
e7e7ee2e
IM
59 void *info;
60 int ret;
fe4b04fa
PZ
61};
62
63static void remote_function(void *data)
64{
65 struct remote_function_call *tfc = data;
66 struct task_struct *p = tfc->p;
67
68 if (p) {
69 tfc->ret = -EAGAIN;
70 if (task_cpu(p) != smp_processor_id() || !task_curr(p))
71 return;
72 }
73
74 tfc->ret = tfc->func(tfc->info);
75}
76
77/**
78 * task_function_call - call a function on the cpu on which a task runs
79 * @p: the task to evaluate
80 * @func: the function to be called
81 * @info: the function call argument
82 *
83 * Calls the function @func when the task is currently running. This might
84 * be on the current CPU, which just calls the function directly
85 *
86 * returns: @func return value, or
87 * -ESRCH - when the process isn't running
88 * -EAGAIN - when the process moved away
89 */
90static int
272325c4 91task_function_call(struct task_struct *p, remote_function_f func, void *info)
fe4b04fa
PZ
92{
93 struct remote_function_call data = {
e7e7ee2e
IM
94 .p = p,
95 .func = func,
96 .info = info,
97 .ret = -ESRCH, /* No such (running) process */
fe4b04fa
PZ
98 };
99
100 if (task_curr(p))
101 smp_call_function_single(task_cpu(p), remote_function, &data, 1);
102
103 return data.ret;
104}
105
106/**
107 * cpu_function_call - call a function on the cpu
108 * @func: the function to be called
109 * @info: the function call argument
110 *
111 * Calls the function @func on the remote cpu.
112 *
113 * returns: @func return value or -ENXIO when the cpu is offline
114 */
272325c4 115static int cpu_function_call(int cpu, remote_function_f func, void *info)
fe4b04fa
PZ
116{
117 struct remote_function_call data = {
e7e7ee2e
IM
118 .p = NULL,
119 .func = func,
120 .info = info,
121 .ret = -ENXIO, /* No such CPU */
fe4b04fa
PZ
122 };
123
124 smp_call_function_single(cpu, remote_function, &data, 1);
125
126 return data.ret;
127}
128
f8697762
JO
129#define EVENT_OWNER_KERNEL ((void *) -1)
130
131static bool is_kernel_event(struct perf_event *event)
132{
133 return event->owner == EVENT_OWNER_KERNEL;
134}
135
e5d1367f
SE
136#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
137 PERF_FLAG_FD_OUTPUT |\
a21b0b35
YD
138 PERF_FLAG_PID_CGROUP |\
139 PERF_FLAG_FD_CLOEXEC)
e5d1367f 140
bce38cd5
SE
141/*
142 * branch priv levels that need permission checks
143 */
144#define PERF_SAMPLE_BRANCH_PERM_PLM \
145 (PERF_SAMPLE_BRANCH_KERNEL |\
146 PERF_SAMPLE_BRANCH_HV)
147
0b3fcf17
SE
148enum event_type_t {
149 EVENT_FLEXIBLE = 0x1,
150 EVENT_PINNED = 0x2,
151 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
152};
153
e5d1367f
SE
154/*
155 * perf_sched_events : >0 events exist
156 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
157 */
c5905afb 158struct static_key_deferred perf_sched_events __read_mostly;
e5d1367f 159static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
ba532500 160static DEFINE_PER_CPU(int, perf_sched_cb_usages);
e5d1367f 161
cdd6c482
IM
162static atomic_t nr_mmap_events __read_mostly;
163static atomic_t nr_comm_events __read_mostly;
164static atomic_t nr_task_events __read_mostly;
948b26b6 165static atomic_t nr_freq_events __read_mostly;
9ee318a7 166
108b02cf
PZ
167static LIST_HEAD(pmus);
168static DEFINE_MUTEX(pmus_lock);
169static struct srcu_struct pmus_srcu;
170
0764771d 171/*
cdd6c482 172 * perf event paranoia level:
0fbdea19
IM
173 * -1 - not paranoid at all
174 * 0 - disallow raw tracepoint access for unpriv
cdd6c482 175 * 1 - disallow cpu events for unpriv
0fbdea19 176 * 2 - disallow kernel profiling for unpriv
0764771d 177 */
cdd6c482 178int sysctl_perf_event_paranoid __read_mostly = 1;
0764771d 179
20443384
FW
180/* Minimum for 512 kiB + 1 user control page */
181int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
df58ab24
PZ
182
183/*
cdd6c482 184 * max perf event sample rate
df58ab24 185 */
14c63f17
DH
186#define DEFAULT_MAX_SAMPLE_RATE 100000
187#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
188#define DEFAULT_CPU_TIME_MAX_PERCENT 25
189
190int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
191
192static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
193static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
194
d9494cb4
PZ
195static int perf_sample_allowed_ns __read_mostly =
196 DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
14c63f17
DH
197
198void update_perf_cpu_limits(void)
199{
200 u64 tmp = perf_sample_period_ns;
201
202 tmp *= sysctl_perf_cpu_time_max_percent;
e5302920 203 do_div(tmp, 100);
d9494cb4 204 ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
14c63f17 205}
163ec435 206
9e630205
SE
207static int perf_rotate_context(struct perf_cpu_context *cpuctx);
208
163ec435
PZ
209int perf_proc_update_handler(struct ctl_table *table, int write,
210 void __user *buffer, size_t *lenp,
211 loff_t *ppos)
212{
723478c8 213 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
163ec435
PZ
214
215 if (ret || !write)
216 return ret;
217
218 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
14c63f17
DH
219 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
220 update_perf_cpu_limits();
221
222 return 0;
223}
224
225int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
226
227int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
228 void __user *buffer, size_t *lenp,
229 loff_t *ppos)
230{
231 int ret = proc_dointvec(table, write, buffer, lenp, ppos);
232
233 if (ret || !write)
234 return ret;
235
236 update_perf_cpu_limits();
163ec435
PZ
237
238 return 0;
239}
1ccd1549 240
14c63f17
DH
241/*
242 * perf samples are done in some very critical code paths (NMIs).
243 * If they take too much CPU time, the system can lock up and not
244 * get any real work done. This will drop the sample rate when
245 * we detect that events are taking too long.
246 */
247#define NR_ACCUMULATED_SAMPLES 128
d9494cb4 248static DEFINE_PER_CPU(u64, running_sample_length);
14c63f17 249
6a02ad66 250static void perf_duration_warn(struct irq_work *w)
14c63f17 251{
6a02ad66 252 u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
14c63f17 253 u64 avg_local_sample_len;
e5302920 254 u64 local_samples_len;
6a02ad66 255
4a32fea9 256 local_samples_len = __this_cpu_read(running_sample_length);
6a02ad66
PZ
257 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
258
259 printk_ratelimited(KERN_WARNING
260 "perf interrupt took too long (%lld > %lld), lowering "
261 "kernel.perf_event_max_sample_rate to %d\n",
cd578abb 262 avg_local_sample_len, allowed_ns >> 1,
6a02ad66
PZ
263 sysctl_perf_event_sample_rate);
264}
265
266static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
267
268void perf_sample_event_took(u64 sample_len_ns)
269{
d9494cb4 270 u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
6a02ad66
PZ
271 u64 avg_local_sample_len;
272 u64 local_samples_len;
14c63f17 273
d9494cb4 274 if (allowed_ns == 0)
14c63f17
DH
275 return;
276
277 /* decay the counter by 1 average sample */
4a32fea9 278 local_samples_len = __this_cpu_read(running_sample_length);
14c63f17
DH
279 local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
280 local_samples_len += sample_len_ns;
4a32fea9 281 __this_cpu_write(running_sample_length, local_samples_len);
14c63f17
DH
282
283 /*
284 * note: this will be biased artifically low until we have
285 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
286 * from having to maintain a count.
287 */
288 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
289
d9494cb4 290 if (avg_local_sample_len <= allowed_ns)
14c63f17
DH
291 return;
292
293 if (max_samples_per_tick <= 1)
294 return;
295
296 max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
297 sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
298 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
299
14c63f17 300 update_perf_cpu_limits();
6a02ad66 301
cd578abb
PZ
302 if (!irq_work_queue(&perf_duration_work)) {
303 early_printk("perf interrupt took too long (%lld > %lld), lowering "
304 "kernel.perf_event_max_sample_rate to %d\n",
305 avg_local_sample_len, allowed_ns >> 1,
306 sysctl_perf_event_sample_rate);
307 }
14c63f17
DH
308}
309
cdd6c482 310static atomic64_t perf_event_id;
a96bbc16 311
0b3fcf17
SE
312static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
313 enum event_type_t event_type);
314
315static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
e5d1367f
SE
316 enum event_type_t event_type,
317 struct task_struct *task);
318
319static void update_context_time(struct perf_event_context *ctx);
320static u64 perf_event_time(struct perf_event *event);
0b3fcf17 321
cdd6c482 322void __weak perf_event_print_debug(void) { }
0793a61d 323
84c79910 324extern __weak const char *perf_pmu_name(void)
0793a61d 325{
84c79910 326 return "pmu";
0793a61d
TG
327}
328
0b3fcf17
SE
329static inline u64 perf_clock(void)
330{
331 return local_clock();
332}
333
34f43927
PZ
334static inline u64 perf_event_clock(struct perf_event *event)
335{
336 return event->clock();
337}
338
e5d1367f
SE
339static inline struct perf_cpu_context *
340__get_cpu_context(struct perf_event_context *ctx)
341{
342 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
343}
344
facc4307
PZ
345static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
346 struct perf_event_context *ctx)
347{
348 raw_spin_lock(&cpuctx->ctx.lock);
349 if (ctx)
350 raw_spin_lock(&ctx->lock);
351}
352
353static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
354 struct perf_event_context *ctx)
355{
356 if (ctx)
357 raw_spin_unlock(&ctx->lock);
358 raw_spin_unlock(&cpuctx->ctx.lock);
359}
360
e5d1367f
SE
361#ifdef CONFIG_CGROUP_PERF
362
e5d1367f
SE
363static inline bool
364perf_cgroup_match(struct perf_event *event)
365{
366 struct perf_event_context *ctx = event->ctx;
367 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
368
ef824fa1
TH
369 /* @event doesn't care about cgroup */
370 if (!event->cgrp)
371 return true;
372
373 /* wants specific cgroup scope but @cpuctx isn't associated with any */
374 if (!cpuctx->cgrp)
375 return false;
376
377 /*
378 * Cgroup scoping is recursive. An event enabled for a cgroup is
379 * also enabled for all its descendant cgroups. If @cpuctx's
380 * cgroup is a descendant of @event's (the test covers identity
381 * case), it's a match.
382 */
383 return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
384 event->cgrp->css.cgroup);
e5d1367f
SE
385}
386
e5d1367f
SE
387static inline void perf_detach_cgroup(struct perf_event *event)
388{
4e2ba650 389 css_put(&event->cgrp->css);
e5d1367f
SE
390 event->cgrp = NULL;
391}
392
393static inline int is_cgroup_event(struct perf_event *event)
394{
395 return event->cgrp != NULL;
396}
397
398static inline u64 perf_cgroup_event_time(struct perf_event *event)
399{
400 struct perf_cgroup_info *t;
401
402 t = per_cpu_ptr(event->cgrp->info, event->cpu);
403 return t->time;
404}
405
406static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
407{
408 struct perf_cgroup_info *info;
409 u64 now;
410
411 now = perf_clock();
412
413 info = this_cpu_ptr(cgrp->info);
414
415 info->time += now - info->timestamp;
416 info->timestamp = now;
417}
418
419static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
420{
421 struct perf_cgroup *cgrp_out = cpuctx->cgrp;
422 if (cgrp_out)
423 __update_cgrp_time(cgrp_out);
424}
425
426static inline void update_cgrp_time_from_event(struct perf_event *event)
427{
3f7cce3c
SE
428 struct perf_cgroup *cgrp;
429
e5d1367f 430 /*
3f7cce3c
SE
431 * ensure we access cgroup data only when needed and
432 * when we know the cgroup is pinned (css_get)
e5d1367f 433 */
3f7cce3c 434 if (!is_cgroup_event(event))
e5d1367f
SE
435 return;
436
3f7cce3c
SE
437 cgrp = perf_cgroup_from_task(current);
438 /*
439 * Do not update time when cgroup is not active
440 */
441 if (cgrp == event->cgrp)
442 __update_cgrp_time(event->cgrp);
e5d1367f
SE
443}
444
445static inline void
3f7cce3c
SE
446perf_cgroup_set_timestamp(struct task_struct *task,
447 struct perf_event_context *ctx)
e5d1367f
SE
448{
449 struct perf_cgroup *cgrp;
450 struct perf_cgroup_info *info;
451
3f7cce3c
SE
452 /*
453 * ctx->lock held by caller
454 * ensure we do not access cgroup data
455 * unless we have the cgroup pinned (css_get)
456 */
457 if (!task || !ctx->nr_cgroups)
e5d1367f
SE
458 return;
459
460 cgrp = perf_cgroup_from_task(task);
461 info = this_cpu_ptr(cgrp->info);
3f7cce3c 462 info->timestamp = ctx->timestamp;
e5d1367f
SE
463}
464
465#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
466#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
467
468/*
469 * reschedule events based on the cgroup constraint of task.
470 *
471 * mode SWOUT : schedule out everything
472 * mode SWIN : schedule in based on cgroup for next
473 */
474void perf_cgroup_switch(struct task_struct *task, int mode)
475{
476 struct perf_cpu_context *cpuctx;
477 struct pmu *pmu;
478 unsigned long flags;
479
480 /*
481 * disable interrupts to avoid geting nr_cgroup
482 * changes via __perf_event_disable(). Also
483 * avoids preemption.
484 */
485 local_irq_save(flags);
486
487 /*
488 * we reschedule only in the presence of cgroup
489 * constrained events.
490 */
491 rcu_read_lock();
492
493 list_for_each_entry_rcu(pmu, &pmus, entry) {
e5d1367f 494 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
95cf59ea
PZ
495 if (cpuctx->unique_pmu != pmu)
496 continue; /* ensure we process each cpuctx once */
e5d1367f 497
e5d1367f
SE
498 /*
499 * perf_cgroup_events says at least one
500 * context on this CPU has cgroup events.
501 *
502 * ctx->nr_cgroups reports the number of cgroup
503 * events for a context.
504 */
505 if (cpuctx->ctx.nr_cgroups > 0) {
facc4307
PZ
506 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
507 perf_pmu_disable(cpuctx->ctx.pmu);
e5d1367f
SE
508
509 if (mode & PERF_CGROUP_SWOUT) {
510 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
511 /*
512 * must not be done before ctxswout due
513 * to event_filter_match() in event_sched_out()
514 */
515 cpuctx->cgrp = NULL;
516 }
517
518 if (mode & PERF_CGROUP_SWIN) {
e566b76e 519 WARN_ON_ONCE(cpuctx->cgrp);
95cf59ea
PZ
520 /*
521 * set cgrp before ctxsw in to allow
522 * event_filter_match() to not have to pass
523 * task around
e5d1367f
SE
524 */
525 cpuctx->cgrp = perf_cgroup_from_task(task);
526 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
527 }
facc4307
PZ
528 perf_pmu_enable(cpuctx->ctx.pmu);
529 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
e5d1367f 530 }
e5d1367f
SE
531 }
532
533 rcu_read_unlock();
534
535 local_irq_restore(flags);
536}
537
a8d757ef
SE
538static inline void perf_cgroup_sched_out(struct task_struct *task,
539 struct task_struct *next)
e5d1367f 540{
a8d757ef
SE
541 struct perf_cgroup *cgrp1;
542 struct perf_cgroup *cgrp2 = NULL;
543
544 /*
545 * we come here when we know perf_cgroup_events > 0
546 */
547 cgrp1 = perf_cgroup_from_task(task);
548
549 /*
550 * next is NULL when called from perf_event_enable_on_exec()
551 * that will systematically cause a cgroup_switch()
552 */
553 if (next)
554 cgrp2 = perf_cgroup_from_task(next);
555
556 /*
557 * only schedule out current cgroup events if we know
558 * that we are switching to a different cgroup. Otherwise,
559 * do no touch the cgroup events.
560 */
561 if (cgrp1 != cgrp2)
562 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
e5d1367f
SE
563}
564
a8d757ef
SE
565static inline void perf_cgroup_sched_in(struct task_struct *prev,
566 struct task_struct *task)
e5d1367f 567{
a8d757ef
SE
568 struct perf_cgroup *cgrp1;
569 struct perf_cgroup *cgrp2 = NULL;
570
571 /*
572 * we come here when we know perf_cgroup_events > 0
573 */
574 cgrp1 = perf_cgroup_from_task(task);
575
576 /* prev can never be NULL */
577 cgrp2 = perf_cgroup_from_task(prev);
578
579 /*
580 * only need to schedule in cgroup events if we are changing
581 * cgroup during ctxsw. Cgroup events were not scheduled
582 * out of ctxsw out if that was not the case.
583 */
584 if (cgrp1 != cgrp2)
585 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
e5d1367f
SE
586}
587
588static inline int perf_cgroup_connect(int fd, struct perf_event *event,
589 struct perf_event_attr *attr,
590 struct perf_event *group_leader)
591{
592 struct perf_cgroup *cgrp;
593 struct cgroup_subsys_state *css;
2903ff01
AV
594 struct fd f = fdget(fd);
595 int ret = 0;
e5d1367f 596
2903ff01 597 if (!f.file)
e5d1367f
SE
598 return -EBADF;
599
b583043e 600 css = css_tryget_online_from_dir(f.file->f_path.dentry,
ec903c0c 601 &perf_event_cgrp_subsys);
3db272c0
LZ
602 if (IS_ERR(css)) {
603 ret = PTR_ERR(css);
604 goto out;
605 }
e5d1367f
SE
606
607 cgrp = container_of(css, struct perf_cgroup, css);
608 event->cgrp = cgrp;
609
610 /*
611 * all events in a group must monitor
612 * the same cgroup because a task belongs
613 * to only one perf cgroup at a time
614 */
615 if (group_leader && group_leader->cgrp != cgrp) {
616 perf_detach_cgroup(event);
617 ret = -EINVAL;
e5d1367f 618 }
3db272c0 619out:
2903ff01 620 fdput(f);
e5d1367f
SE
621 return ret;
622}
623
624static inline void
625perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
626{
627 struct perf_cgroup_info *t;
628 t = per_cpu_ptr(event->cgrp->info, event->cpu);
629 event->shadow_ctx_time = now - t->timestamp;
630}
631
632static inline void
633perf_cgroup_defer_enabled(struct perf_event *event)
634{
635 /*
636 * when the current task's perf cgroup does not match
637 * the event's, we need to remember to call the
638 * perf_mark_enable() function the first time a task with
639 * a matching perf cgroup is scheduled in.
640 */
641 if (is_cgroup_event(event) && !perf_cgroup_match(event))
642 event->cgrp_defer_enabled = 1;
643}
644
645static inline void
646perf_cgroup_mark_enabled(struct perf_event *event,
647 struct perf_event_context *ctx)
648{
649 struct perf_event *sub;
650 u64 tstamp = perf_event_time(event);
651
652 if (!event->cgrp_defer_enabled)
653 return;
654
655 event->cgrp_defer_enabled = 0;
656
657 event->tstamp_enabled = tstamp - event->total_time_enabled;
658 list_for_each_entry(sub, &event->sibling_list, group_entry) {
659 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
660 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
661 sub->cgrp_defer_enabled = 0;
662 }
663 }
664}
665#else /* !CONFIG_CGROUP_PERF */
666
667static inline bool
668perf_cgroup_match(struct perf_event *event)
669{
670 return true;
671}
672
673static inline void perf_detach_cgroup(struct perf_event *event)
674{}
675
676static inline int is_cgroup_event(struct perf_event *event)
677{
678 return 0;
679}
680
681static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
682{
683 return 0;
684}
685
686static inline void update_cgrp_time_from_event(struct perf_event *event)
687{
688}
689
690static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
691{
692}
693
a8d757ef
SE
694static inline void perf_cgroup_sched_out(struct task_struct *task,
695 struct task_struct *next)
e5d1367f
SE
696{
697}
698
a8d757ef
SE
699static inline void perf_cgroup_sched_in(struct task_struct *prev,
700 struct task_struct *task)
e5d1367f
SE
701{
702}
703
704static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
705 struct perf_event_attr *attr,
706 struct perf_event *group_leader)
707{
708 return -EINVAL;
709}
710
711static inline void
3f7cce3c
SE
712perf_cgroup_set_timestamp(struct task_struct *task,
713 struct perf_event_context *ctx)
e5d1367f
SE
714{
715}
716
717void
718perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
719{
720}
721
722static inline void
723perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
724{
725}
726
727static inline u64 perf_cgroup_event_time(struct perf_event *event)
728{
729 return 0;
730}
731
732static inline void
733perf_cgroup_defer_enabled(struct perf_event *event)
734{
735}
736
737static inline void
738perf_cgroup_mark_enabled(struct perf_event *event,
739 struct perf_event_context *ctx)
740{
741}
742#endif
743
9e630205
SE
744/*
745 * set default to be dependent on timer tick just
746 * like original code
747 */
748#define PERF_CPU_HRTIMER (1000 / HZ)
749/*
750 * function must be called with interrupts disbled
751 */
272325c4 752static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
9e630205
SE
753{
754 struct perf_cpu_context *cpuctx;
755 enum hrtimer_restart ret = HRTIMER_NORESTART;
756 int rotations = 0;
757
758 WARN_ON(!irqs_disabled());
759
760 cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
761
762 rotations = perf_rotate_context(cpuctx);
763
764 /*
765 * arm timer if needed
766 */
767 if (rotations) {
768 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
769 ret = HRTIMER_RESTART;
770 }
771
772 return ret;
773}
774
272325c4 775static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
9e630205 776{
272325c4 777 struct hrtimer *timer = &cpuctx->hrtimer;
9e630205 778 struct pmu *pmu = cpuctx->ctx.pmu;
272325c4 779 u64 interval;
9e630205
SE
780
781 /* no multiplexing needed for SW PMU */
782 if (pmu->task_ctx_nr == perf_sw_context)
783 return;
784
62b85639
SE
785 /*
786 * check default is sane, if not set then force to
787 * default interval (1/tick)
788 */
272325c4
PZ
789 interval = pmu->hrtimer_interval_ms;
790 if (interval < 1)
791 interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
62b85639 792
272325c4 793 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
9e630205 794
272325c4
PZ
795 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
796 timer->function = perf_mux_hrtimer_handler;
9e630205
SE
797}
798
272325c4 799static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
9e630205 800{
272325c4 801 struct hrtimer *timer = &cpuctx->hrtimer;
9e630205
SE
802 struct pmu *pmu = cpuctx->ctx.pmu;
803
804 /* not for SW PMU */
805 if (pmu->task_ctx_nr == perf_sw_context)
272325c4 806 return 0;
9e630205 807
272325c4
PZ
808 if (hrtimer_is_queued(timer))
809 return 0;
9e630205 810
272325c4
PZ
811 hrtimer_start(timer, cpuctx->hrtimer_interval, HRTIMER_MODE_REL_PINNED);
812 return 0;
9e630205
SE
813}
814
33696fc0 815void perf_pmu_disable(struct pmu *pmu)
9e35ad38 816{
33696fc0
PZ
817 int *count = this_cpu_ptr(pmu->pmu_disable_count);
818 if (!(*count)++)
819 pmu->pmu_disable(pmu);
9e35ad38 820}
9e35ad38 821
33696fc0 822void perf_pmu_enable(struct pmu *pmu)
9e35ad38 823{
33696fc0
PZ
824 int *count = this_cpu_ptr(pmu->pmu_disable_count);
825 if (!--(*count))
826 pmu->pmu_enable(pmu);
9e35ad38 827}
9e35ad38 828
2fde4f94 829static DEFINE_PER_CPU(struct list_head, active_ctx_list);
e9d2b064
PZ
830
831/*
2fde4f94
MR
832 * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
833 * perf_event_task_tick() are fully serialized because they're strictly cpu
834 * affine and perf_event_ctx{activate,deactivate} are called with IRQs
835 * disabled, while perf_event_task_tick is called from IRQ context.
e9d2b064 836 */
2fde4f94 837static void perf_event_ctx_activate(struct perf_event_context *ctx)
9e35ad38 838{
2fde4f94 839 struct list_head *head = this_cpu_ptr(&active_ctx_list);
b5ab4cd5 840
e9d2b064 841 WARN_ON(!irqs_disabled());
b5ab4cd5 842
2fde4f94
MR
843 WARN_ON(!list_empty(&ctx->active_ctx_list));
844
845 list_add(&ctx->active_ctx_list, head);
846}
847
848static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
849{
850 WARN_ON(!irqs_disabled());
851
852 WARN_ON(list_empty(&ctx->active_ctx_list));
853
854 list_del_init(&ctx->active_ctx_list);
9e35ad38 855}
9e35ad38 856
cdd6c482 857static void get_ctx(struct perf_event_context *ctx)
a63eaf34 858{
e5289d4a 859 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
a63eaf34
PM
860}
861
4af57ef2
YZ
862static void free_ctx(struct rcu_head *head)
863{
864 struct perf_event_context *ctx;
865
866 ctx = container_of(head, struct perf_event_context, rcu_head);
867 kfree(ctx->task_ctx_data);
868 kfree(ctx);
869}
870
cdd6c482 871static void put_ctx(struct perf_event_context *ctx)
a63eaf34 872{
564c2b21
PM
873 if (atomic_dec_and_test(&ctx->refcount)) {
874 if (ctx->parent_ctx)
875 put_ctx(ctx->parent_ctx);
c93f7669
PM
876 if (ctx->task)
877 put_task_struct(ctx->task);
4af57ef2 878 call_rcu(&ctx->rcu_head, free_ctx);
564c2b21 879 }
a63eaf34
PM
880}
881
f63a8daa
PZ
882/*
883 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
884 * perf_pmu_migrate_context() we need some magic.
885 *
886 * Those places that change perf_event::ctx will hold both
887 * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
888 *
889 * Lock ordering is by mutex address. There is one other site where
890 * perf_event_context::mutex nests and that is put_event(). But remember that
891 * that is a parent<->child context relation, and migration does not affect
892 * children, therefore these two orderings should not interact.
893 *
894 * The change in perf_event::ctx does not affect children (as claimed above)
895 * because the sys_perf_event_open() case will install a new event and break
896 * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
897 * concerned with cpuctx and that doesn't have children.
898 *
899 * The places that change perf_event::ctx will issue:
900 *
901 * perf_remove_from_context();
902 * synchronize_rcu();
903 * perf_install_in_context();
904 *
905 * to affect the change. The remove_from_context() + synchronize_rcu() should
906 * quiesce the event, after which we can install it in the new location. This
907 * means that only external vectors (perf_fops, prctl) can perturb the event
908 * while in transit. Therefore all such accessors should also acquire
909 * perf_event_context::mutex to serialize against this.
910 *
911 * However; because event->ctx can change while we're waiting to acquire
912 * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
913 * function.
914 *
915 * Lock order:
916 * task_struct::perf_event_mutex
917 * perf_event_context::mutex
918 * perf_event_context::lock
919 * perf_event::child_mutex;
920 * perf_event::mmap_mutex
921 * mmap_sem
922 */
a83fe28e
PZ
923static struct perf_event_context *
924perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
f63a8daa
PZ
925{
926 struct perf_event_context *ctx;
927
928again:
929 rcu_read_lock();
930 ctx = ACCESS_ONCE(event->ctx);
931 if (!atomic_inc_not_zero(&ctx->refcount)) {
932 rcu_read_unlock();
933 goto again;
934 }
935 rcu_read_unlock();
936
a83fe28e 937 mutex_lock_nested(&ctx->mutex, nesting);
f63a8daa
PZ
938 if (event->ctx != ctx) {
939 mutex_unlock(&ctx->mutex);
940 put_ctx(ctx);
941 goto again;
942 }
943
944 return ctx;
945}
946
a83fe28e
PZ
947static inline struct perf_event_context *
948perf_event_ctx_lock(struct perf_event *event)
949{
950 return perf_event_ctx_lock_nested(event, 0);
951}
952
f63a8daa
PZ
953static void perf_event_ctx_unlock(struct perf_event *event,
954 struct perf_event_context *ctx)
955{
956 mutex_unlock(&ctx->mutex);
957 put_ctx(ctx);
958}
959
211de6eb
PZ
960/*
961 * This must be done under the ctx->lock, such as to serialize against
962 * context_equiv(), therefore we cannot call put_ctx() since that might end up
963 * calling scheduler related locks and ctx->lock nests inside those.
964 */
965static __must_check struct perf_event_context *
966unclone_ctx(struct perf_event_context *ctx)
71a851b4 967{
211de6eb
PZ
968 struct perf_event_context *parent_ctx = ctx->parent_ctx;
969
970 lockdep_assert_held(&ctx->lock);
971
972 if (parent_ctx)
71a851b4 973 ctx->parent_ctx = NULL;
5a3126d4 974 ctx->generation++;
211de6eb
PZ
975
976 return parent_ctx;
71a851b4
PZ
977}
978
6844c09d
ACM
979static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
980{
981 /*
982 * only top level events have the pid namespace they were created in
983 */
984 if (event->parent)
985 event = event->parent;
986
987 return task_tgid_nr_ns(p, event->ns);
988}
989
990static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
991{
992 /*
993 * only top level events have the pid namespace they were created in
994 */
995 if (event->parent)
996 event = event->parent;
997
998 return task_pid_nr_ns(p, event->ns);
999}
1000
7f453c24 1001/*
cdd6c482 1002 * If we inherit events we want to return the parent event id
7f453c24
PZ
1003 * to userspace.
1004 */
cdd6c482 1005static u64 primary_event_id(struct perf_event *event)
7f453c24 1006{
cdd6c482 1007 u64 id = event->id;
7f453c24 1008
cdd6c482
IM
1009 if (event->parent)
1010 id = event->parent->id;
7f453c24
PZ
1011
1012 return id;
1013}
1014
25346b93 1015/*
cdd6c482 1016 * Get the perf_event_context for a task and lock it.
25346b93
PM
1017 * This has to cope with with the fact that until it is locked,
1018 * the context could get moved to another task.
1019 */
cdd6c482 1020static struct perf_event_context *
8dc85d54 1021perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
25346b93 1022{
cdd6c482 1023 struct perf_event_context *ctx;
25346b93 1024
9ed6060d 1025retry:
058ebd0e
PZ
1026 /*
1027 * One of the few rules of preemptible RCU is that one cannot do
1028 * rcu_read_unlock() while holding a scheduler (or nested) lock when
1029 * part of the read side critical section was preemptible -- see
1030 * rcu_read_unlock_special().
1031 *
1032 * Since ctx->lock nests under rq->lock we must ensure the entire read
1033 * side critical section is non-preemptible.
1034 */
1035 preempt_disable();
1036 rcu_read_lock();
8dc85d54 1037 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
25346b93
PM
1038 if (ctx) {
1039 /*
1040 * If this context is a clone of another, it might
1041 * get swapped for another underneath us by
cdd6c482 1042 * perf_event_task_sched_out, though the
25346b93
PM
1043 * rcu_read_lock() protects us from any context
1044 * getting freed. Lock the context and check if it
1045 * got swapped before we could get the lock, and retry
1046 * if so. If we locked the right context, then it
1047 * can't get swapped on us any more.
1048 */
e625cce1 1049 raw_spin_lock_irqsave(&ctx->lock, *flags);
8dc85d54 1050 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
e625cce1 1051 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
058ebd0e
PZ
1052 rcu_read_unlock();
1053 preempt_enable();
25346b93
PM
1054 goto retry;
1055 }
b49a9e7e
PZ
1056
1057 if (!atomic_inc_not_zero(&ctx->refcount)) {
e625cce1 1058 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
b49a9e7e
PZ
1059 ctx = NULL;
1060 }
25346b93
PM
1061 }
1062 rcu_read_unlock();
058ebd0e 1063 preempt_enable();
25346b93
PM
1064 return ctx;
1065}
1066
1067/*
1068 * Get the context for a task and increment its pin_count so it
1069 * can't get swapped to another task. This also increments its
1070 * reference count so that the context can't get freed.
1071 */
8dc85d54
PZ
1072static struct perf_event_context *
1073perf_pin_task_context(struct task_struct *task, int ctxn)
25346b93 1074{
cdd6c482 1075 struct perf_event_context *ctx;
25346b93
PM
1076 unsigned long flags;
1077
8dc85d54 1078 ctx = perf_lock_task_context(task, ctxn, &flags);
25346b93
PM
1079 if (ctx) {
1080 ++ctx->pin_count;
e625cce1 1081 raw_spin_unlock_irqrestore(&ctx->lock, flags);
25346b93
PM
1082 }
1083 return ctx;
1084}
1085
cdd6c482 1086static void perf_unpin_context(struct perf_event_context *ctx)
25346b93
PM
1087{
1088 unsigned long flags;
1089
e625cce1 1090 raw_spin_lock_irqsave(&ctx->lock, flags);
25346b93 1091 --ctx->pin_count;
e625cce1 1092 raw_spin_unlock_irqrestore(&ctx->lock, flags);
25346b93
PM
1093}
1094
f67218c3
PZ
1095/*
1096 * Update the record of the current time in a context.
1097 */
1098static void update_context_time(struct perf_event_context *ctx)
1099{
1100 u64 now = perf_clock();
1101
1102 ctx->time += now - ctx->timestamp;
1103 ctx->timestamp = now;
1104}
1105
4158755d
SE
1106static u64 perf_event_time(struct perf_event *event)
1107{
1108 struct perf_event_context *ctx = event->ctx;
e5d1367f
SE
1109
1110 if (is_cgroup_event(event))
1111 return perf_cgroup_event_time(event);
1112
4158755d
SE
1113 return ctx ? ctx->time : 0;
1114}
1115
f67218c3
PZ
1116/*
1117 * Update the total_time_enabled and total_time_running fields for a event.
b7526f0c 1118 * The caller of this function needs to hold the ctx->lock.
f67218c3
PZ
1119 */
1120static void update_event_times(struct perf_event *event)
1121{
1122 struct perf_event_context *ctx = event->ctx;
1123 u64 run_end;
1124
1125 if (event->state < PERF_EVENT_STATE_INACTIVE ||
1126 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
1127 return;
e5d1367f
SE
1128 /*
1129 * in cgroup mode, time_enabled represents
1130 * the time the event was enabled AND active
1131 * tasks were in the monitored cgroup. This is
1132 * independent of the activity of the context as
1133 * there may be a mix of cgroup and non-cgroup events.
1134 *
1135 * That is why we treat cgroup events differently
1136 * here.
1137 */
1138 if (is_cgroup_event(event))
46cd6a7f 1139 run_end = perf_cgroup_event_time(event);
e5d1367f
SE
1140 else if (ctx->is_active)
1141 run_end = ctx->time;
acd1d7c1
PZ
1142 else
1143 run_end = event->tstamp_stopped;
1144
1145 event->total_time_enabled = run_end - event->tstamp_enabled;
f67218c3
PZ
1146
1147 if (event->state == PERF_EVENT_STATE_INACTIVE)
1148 run_end = event->tstamp_stopped;
1149 else
4158755d 1150 run_end = perf_event_time(event);
f67218c3
PZ
1151
1152 event->total_time_running = run_end - event->tstamp_running;
e5d1367f 1153
f67218c3
PZ
1154}
1155
96c21a46
PZ
1156/*
1157 * Update total_time_enabled and total_time_running for all events in a group.
1158 */
1159static void update_group_times(struct perf_event *leader)
1160{
1161 struct perf_event *event;
1162
1163 update_event_times(leader);
1164 list_for_each_entry(event, &leader->sibling_list, group_entry)
1165 update_event_times(event);
1166}
1167
889ff015
FW
1168static struct list_head *
1169ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
1170{
1171 if (event->attr.pinned)
1172 return &ctx->pinned_groups;
1173 else
1174 return &ctx->flexible_groups;
1175}
1176
fccc714b 1177/*
cdd6c482 1178 * Add a event from the lists for its context.
fccc714b
PZ
1179 * Must be called with ctx->mutex and ctx->lock held.
1180 */
04289bb9 1181static void
cdd6c482 1182list_add_event(struct perf_event *event, struct perf_event_context *ctx)
04289bb9 1183{
8a49542c
PZ
1184 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1185 event->attach_state |= PERF_ATTACH_CONTEXT;
04289bb9
IM
1186
1187 /*
8a49542c
PZ
1188 * If we're a stand alone event or group leader, we go to the context
1189 * list, group events are kept attached to the group so that
1190 * perf_group_detach can, at all times, locate all siblings.
04289bb9 1191 */
8a49542c 1192 if (event->group_leader == event) {
889ff015
FW
1193 struct list_head *list;
1194
d6f962b5
FW
1195 if (is_software_event(event))
1196 event->group_flags |= PERF_GROUP_SOFTWARE;
1197
889ff015
FW
1198 list = ctx_group_list(event, ctx);
1199 list_add_tail(&event->group_entry, list);
5c148194 1200 }
592903cd 1201
08309379 1202 if (is_cgroup_event(event))
e5d1367f 1203 ctx->nr_cgroups++;
e5d1367f 1204
cdd6c482
IM
1205 list_add_rcu(&event->event_entry, &ctx->event_list);
1206 ctx->nr_events++;
1207 if (event->attr.inherit_stat)
bfbd3381 1208 ctx->nr_stat++;
5a3126d4
PZ
1209
1210 ctx->generation++;
04289bb9
IM
1211}
1212
0231bb53
JO
1213/*
1214 * Initialize event state based on the perf_event_attr::disabled.
1215 */
1216static inline void perf_event__state_init(struct perf_event *event)
1217{
1218 event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1219 PERF_EVENT_STATE_INACTIVE;
1220}
1221
c320c7b7
ACM
1222/*
1223 * Called at perf_event creation and when events are attached/detached from a
1224 * group.
1225 */
1226static void perf_event__read_size(struct perf_event *event)
1227{
1228 int entry = sizeof(u64); /* value */
1229 int size = 0;
1230 int nr = 1;
1231
1232 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1233 size += sizeof(u64);
1234
1235 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1236 size += sizeof(u64);
1237
1238 if (event->attr.read_format & PERF_FORMAT_ID)
1239 entry += sizeof(u64);
1240
1241 if (event->attr.read_format & PERF_FORMAT_GROUP) {
1242 nr += event->group_leader->nr_siblings;
1243 size += sizeof(u64);
1244 }
1245
1246 size += entry * nr;
1247 event->read_size = size;
1248}
1249
1250static void perf_event__header_size(struct perf_event *event)
1251{
1252 struct perf_sample_data *data;
1253 u64 sample_type = event->attr.sample_type;
1254 u16 size = 0;
1255
1256 perf_event__read_size(event);
1257
1258 if (sample_type & PERF_SAMPLE_IP)
1259 size += sizeof(data->ip);
1260
6844c09d
ACM
1261 if (sample_type & PERF_SAMPLE_ADDR)
1262 size += sizeof(data->addr);
1263
1264 if (sample_type & PERF_SAMPLE_PERIOD)
1265 size += sizeof(data->period);
1266
c3feedf2
AK
1267 if (sample_type & PERF_SAMPLE_WEIGHT)
1268 size += sizeof(data->weight);
1269
6844c09d
ACM
1270 if (sample_type & PERF_SAMPLE_READ)
1271 size += event->read_size;
1272
d6be9ad6
SE
1273 if (sample_type & PERF_SAMPLE_DATA_SRC)
1274 size += sizeof(data->data_src.val);
1275
fdfbbd07
AK
1276 if (sample_type & PERF_SAMPLE_TRANSACTION)
1277 size += sizeof(data->txn);
1278
6844c09d
ACM
1279 event->header_size = size;
1280}
1281
1282static void perf_event__id_header_size(struct perf_event *event)
1283{
1284 struct perf_sample_data *data;
1285 u64 sample_type = event->attr.sample_type;
1286 u16 size = 0;
1287
c320c7b7
ACM
1288 if (sample_type & PERF_SAMPLE_TID)
1289 size += sizeof(data->tid_entry);
1290
1291 if (sample_type & PERF_SAMPLE_TIME)
1292 size += sizeof(data->time);
1293
ff3d527c
AH
1294 if (sample_type & PERF_SAMPLE_IDENTIFIER)
1295 size += sizeof(data->id);
1296
c320c7b7
ACM
1297 if (sample_type & PERF_SAMPLE_ID)
1298 size += sizeof(data->id);
1299
1300 if (sample_type & PERF_SAMPLE_STREAM_ID)
1301 size += sizeof(data->stream_id);
1302
1303 if (sample_type & PERF_SAMPLE_CPU)
1304 size += sizeof(data->cpu_entry);
1305
6844c09d 1306 event->id_header_size = size;
c320c7b7
ACM
1307}
1308
8a49542c
PZ
1309static void perf_group_attach(struct perf_event *event)
1310{
c320c7b7 1311 struct perf_event *group_leader = event->group_leader, *pos;
8a49542c 1312
74c3337c
PZ
1313 /*
1314 * We can have double attach due to group movement in perf_event_open.
1315 */
1316 if (event->attach_state & PERF_ATTACH_GROUP)
1317 return;
1318
8a49542c
PZ
1319 event->attach_state |= PERF_ATTACH_GROUP;
1320
1321 if (group_leader == event)
1322 return;
1323
652884fe
PZ
1324 WARN_ON_ONCE(group_leader->ctx != event->ctx);
1325
8a49542c
PZ
1326 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
1327 !is_software_event(event))
1328 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
1329
1330 list_add_tail(&event->group_entry, &group_leader->sibling_list);
1331 group_leader->nr_siblings++;
c320c7b7
ACM
1332
1333 perf_event__header_size(group_leader);
1334
1335 list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
1336 perf_event__header_size(pos);
8a49542c
PZ
1337}
1338
a63eaf34 1339/*
cdd6c482 1340 * Remove a event from the lists for its context.
fccc714b 1341 * Must be called with ctx->mutex and ctx->lock held.
a63eaf34 1342 */
04289bb9 1343static void
cdd6c482 1344list_del_event(struct perf_event *event, struct perf_event_context *ctx)
04289bb9 1345{
68cacd29 1346 struct perf_cpu_context *cpuctx;
652884fe
PZ
1347
1348 WARN_ON_ONCE(event->ctx != ctx);
1349 lockdep_assert_held(&ctx->lock);
1350
8a49542c
PZ
1351 /*
1352 * We can have double detach due to exit/hot-unplug + close.
1353 */
1354 if (!(event->attach_state & PERF_ATTACH_CONTEXT))
a63eaf34 1355 return;
8a49542c
PZ
1356
1357 event->attach_state &= ~PERF_ATTACH_CONTEXT;
1358
68cacd29 1359 if (is_cgroup_event(event)) {
e5d1367f 1360 ctx->nr_cgroups--;
68cacd29
SE
1361 cpuctx = __get_cpu_context(ctx);
1362 /*
1363 * if there are no more cgroup events
1364 * then cler cgrp to avoid stale pointer
1365 * in update_cgrp_time_from_cpuctx()
1366 */
1367 if (!ctx->nr_cgroups)
1368 cpuctx->cgrp = NULL;
1369 }
e5d1367f 1370
cdd6c482
IM
1371 ctx->nr_events--;
1372 if (event->attr.inherit_stat)
bfbd3381 1373 ctx->nr_stat--;
8bc20959 1374
cdd6c482 1375 list_del_rcu(&event->event_entry);
04289bb9 1376
8a49542c
PZ
1377 if (event->group_leader == event)
1378 list_del_init(&event->group_entry);
5c148194 1379
96c21a46 1380 update_group_times(event);
b2e74a26
SE
1381
1382 /*
1383 * If event was in error state, then keep it
1384 * that way, otherwise bogus counts will be
1385 * returned on read(). The only way to get out
1386 * of error state is by explicit re-enabling
1387 * of the event
1388 */
1389 if (event->state > PERF_EVENT_STATE_OFF)
1390 event->state = PERF_EVENT_STATE_OFF;
5a3126d4
PZ
1391
1392 ctx->generation++;
050735b0
PZ
1393}
1394
8a49542c 1395static void perf_group_detach(struct perf_event *event)
050735b0
PZ
1396{
1397 struct perf_event *sibling, *tmp;
8a49542c
PZ
1398 struct list_head *list = NULL;
1399
1400 /*
1401 * We can have double detach due to exit/hot-unplug + close.
1402 */
1403 if (!(event->attach_state & PERF_ATTACH_GROUP))
1404 return;
1405
1406 event->attach_state &= ~PERF_ATTACH_GROUP;
1407
1408 /*
1409 * If this is a sibling, remove it from its group.
1410 */
1411 if (event->group_leader != event) {
1412 list_del_init(&event->group_entry);
1413 event->group_leader->nr_siblings--;
c320c7b7 1414 goto out;
8a49542c
PZ
1415 }
1416
1417 if (!list_empty(&event->group_entry))
1418 list = &event->group_entry;
2e2af50b 1419
04289bb9 1420 /*
cdd6c482
IM
1421 * If this was a group event with sibling events then
1422 * upgrade the siblings to singleton events by adding them
8a49542c 1423 * to whatever list we are on.
04289bb9 1424 */
cdd6c482 1425 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
8a49542c
PZ
1426 if (list)
1427 list_move_tail(&sibling->group_entry, list);
04289bb9 1428 sibling->group_leader = sibling;
d6f962b5
FW
1429
1430 /* Inherit group flags from the previous leader */
1431 sibling->group_flags = event->group_flags;
652884fe
PZ
1432
1433 WARN_ON_ONCE(sibling->ctx != event->ctx);
04289bb9 1434 }
c320c7b7
ACM
1435
1436out:
1437 perf_event__header_size(event->group_leader);
1438
1439 list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1440 perf_event__header_size(tmp);
04289bb9
IM
1441}
1442
fadfe7be
JO
1443/*
1444 * User event without the task.
1445 */
1446static bool is_orphaned_event(struct perf_event *event)
1447{
1448 return event && !is_kernel_event(event) && !event->owner;
1449}
1450
1451/*
1452 * Event has a parent but parent's task finished and it's
1453 * alive only because of children holding refference.
1454 */
1455static bool is_orphaned_child(struct perf_event *event)
1456{
1457 return is_orphaned_event(event->parent);
1458}
1459
1460static void orphans_remove_work(struct work_struct *work);
1461
1462static void schedule_orphans_remove(struct perf_event_context *ctx)
1463{
1464 if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
1465 return;
1466
1467 if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
1468 get_ctx(ctx);
1469 ctx->orphans_remove_sched = true;
1470 }
1471}
1472
1473static int __init perf_workqueue_init(void)
1474{
1475 perf_wq = create_singlethread_workqueue("perf");
1476 WARN(!perf_wq, "failed to create perf workqueue\n");
1477 return perf_wq ? 0 : -1;
1478}
1479
1480core_initcall(perf_workqueue_init);
1481
fa66f07a
SE
1482static inline int
1483event_filter_match(struct perf_event *event)
1484{
e5d1367f
SE
1485 return (event->cpu == -1 || event->cpu == smp_processor_id())
1486 && perf_cgroup_match(event);
fa66f07a
SE
1487}
1488
9ffcfa6f
SE
1489static void
1490event_sched_out(struct perf_event *event,
3b6f9e5c 1491 struct perf_cpu_context *cpuctx,
cdd6c482 1492 struct perf_event_context *ctx)
3b6f9e5c 1493{
4158755d 1494 u64 tstamp = perf_event_time(event);
fa66f07a 1495 u64 delta;
652884fe
PZ
1496
1497 WARN_ON_ONCE(event->ctx != ctx);
1498 lockdep_assert_held(&ctx->lock);
1499
fa66f07a
SE
1500 /*
1501 * An event which could not be activated because of
1502 * filter mismatch still needs to have its timings
1503 * maintained, otherwise bogus information is return
1504 * via read() for time_enabled, time_running:
1505 */
1506 if (event->state == PERF_EVENT_STATE_INACTIVE
1507 && !event_filter_match(event)) {
e5d1367f 1508 delta = tstamp - event->tstamp_stopped;
fa66f07a 1509 event->tstamp_running += delta;
4158755d 1510 event->tstamp_stopped = tstamp;
fa66f07a
SE
1511 }
1512
cdd6c482 1513 if (event->state != PERF_EVENT_STATE_ACTIVE)
9ffcfa6f 1514 return;
3b6f9e5c 1515
44377277
AS
1516 perf_pmu_disable(event->pmu);
1517
cdd6c482
IM
1518 event->state = PERF_EVENT_STATE_INACTIVE;
1519 if (event->pending_disable) {
1520 event->pending_disable = 0;
1521 event->state = PERF_EVENT_STATE_OFF;
970892a9 1522 }
4158755d 1523 event->tstamp_stopped = tstamp;
a4eaf7f1 1524 event->pmu->del(event, 0);
cdd6c482 1525 event->oncpu = -1;
3b6f9e5c 1526
cdd6c482 1527 if (!is_software_event(event))
3b6f9e5c 1528 cpuctx->active_oncpu--;
2fde4f94
MR
1529 if (!--ctx->nr_active)
1530 perf_event_ctx_deactivate(ctx);
0f5a2601
PZ
1531 if (event->attr.freq && event->attr.sample_freq)
1532 ctx->nr_freq--;
cdd6c482 1533 if (event->attr.exclusive || !cpuctx->active_oncpu)
3b6f9e5c 1534 cpuctx->exclusive = 0;
44377277 1535
fadfe7be
JO
1536 if (is_orphaned_child(event))
1537 schedule_orphans_remove(ctx);
1538
44377277 1539 perf_pmu_enable(event->pmu);
3b6f9e5c
PM
1540}
1541
d859e29f 1542static void
cdd6c482 1543group_sched_out(struct perf_event *group_event,
d859e29f 1544 struct perf_cpu_context *cpuctx,
cdd6c482 1545 struct perf_event_context *ctx)
d859e29f 1546{
cdd6c482 1547 struct perf_event *event;
fa66f07a 1548 int state = group_event->state;
d859e29f 1549
cdd6c482 1550 event_sched_out(group_event, cpuctx, ctx);
d859e29f
PM
1551
1552 /*
1553 * Schedule out siblings (if any):
1554 */
cdd6c482
IM
1555 list_for_each_entry(event, &group_event->sibling_list, group_entry)
1556 event_sched_out(event, cpuctx, ctx);
d859e29f 1557
fa66f07a 1558 if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
d859e29f
PM
1559 cpuctx->exclusive = 0;
1560}
1561
46ce0fe9
PZ
1562struct remove_event {
1563 struct perf_event *event;
1564 bool detach_group;
1565};
1566
0793a61d 1567/*
cdd6c482 1568 * Cross CPU call to remove a performance event
0793a61d 1569 *
cdd6c482 1570 * We disable the event on the hardware level first. After that we
0793a61d
TG
1571 * remove it from the context list.
1572 */
fe4b04fa 1573static int __perf_remove_from_context(void *info)
0793a61d 1574{
46ce0fe9
PZ
1575 struct remove_event *re = info;
1576 struct perf_event *event = re->event;
cdd6c482 1577 struct perf_event_context *ctx = event->ctx;
108b02cf 1578 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
0793a61d 1579
e625cce1 1580 raw_spin_lock(&ctx->lock);
cdd6c482 1581 event_sched_out(event, cpuctx, ctx);
46ce0fe9
PZ
1582 if (re->detach_group)
1583 perf_group_detach(event);
cdd6c482 1584 list_del_event(event, ctx);
64ce3126
PZ
1585 if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
1586 ctx->is_active = 0;
1587 cpuctx->task_ctx = NULL;
1588 }
e625cce1 1589 raw_spin_unlock(&ctx->lock);
fe4b04fa
PZ
1590
1591 return 0;
0793a61d
TG
1592}
1593
1594
1595/*
cdd6c482 1596 * Remove the event from a task's (or a CPU's) list of events.
0793a61d 1597 *
cdd6c482 1598 * CPU events are removed with a smp call. For task events we only
0793a61d 1599 * call when the task is on a CPU.
c93f7669 1600 *
cdd6c482
IM
1601 * If event->ctx is a cloned context, callers must make sure that
1602 * every task struct that event->ctx->task could possibly point to
c93f7669
PM
1603 * remains valid. This is OK when called from perf_release since
1604 * that only calls us on the top-level context, which can't be a clone.
cdd6c482 1605 * When called from perf_event_exit_task, it's OK because the
c93f7669 1606 * context has been detached from its task.
0793a61d 1607 */
46ce0fe9 1608static void perf_remove_from_context(struct perf_event *event, bool detach_group)
0793a61d 1609{
cdd6c482 1610 struct perf_event_context *ctx = event->ctx;
0793a61d 1611 struct task_struct *task = ctx->task;
46ce0fe9
PZ
1612 struct remove_event re = {
1613 .event = event,
1614 .detach_group = detach_group,
1615 };
0793a61d 1616
fe4b04fa
PZ
1617 lockdep_assert_held(&ctx->mutex);
1618
0793a61d
TG
1619 if (!task) {
1620 /*
226424ee
MR
1621 * Per cpu events are removed via an smp call. The removal can
1622 * fail if the CPU is currently offline, but in that case we
1623 * already called __perf_remove_from_context from
1624 * perf_event_exit_cpu.
0793a61d 1625 */
46ce0fe9 1626 cpu_function_call(event->cpu, __perf_remove_from_context, &re);
0793a61d
TG
1627 return;
1628 }
1629
1630retry:
46ce0fe9 1631 if (!task_function_call(task, __perf_remove_from_context, &re))
fe4b04fa 1632 return;
0793a61d 1633
e625cce1 1634 raw_spin_lock_irq(&ctx->lock);
0793a61d 1635 /*
fe4b04fa
PZ
1636 * If we failed to find a running task, but find the context active now
1637 * that we've acquired the ctx->lock, retry.
0793a61d 1638 */
fe4b04fa 1639 if (ctx->is_active) {
e625cce1 1640 raw_spin_unlock_irq(&ctx->lock);
3577af70
CW
1641 /*
1642 * Reload the task pointer, it might have been changed by
1643 * a concurrent perf_event_context_sched_out().
1644 */
1645 task = ctx->task;
0793a61d
TG
1646 goto retry;
1647 }
1648
1649 /*
fe4b04fa
PZ
1650 * Since the task isn't running, its safe to remove the event, us
1651 * holding the ctx->lock ensures the task won't get scheduled in.
0793a61d 1652 */
46ce0fe9
PZ
1653 if (detach_group)
1654 perf_group_detach(event);
fe4b04fa 1655 list_del_event(event, ctx);
e625cce1 1656 raw_spin_unlock_irq(&ctx->lock);
0793a61d
TG
1657}
1658
d859e29f 1659/*
cdd6c482 1660 * Cross CPU call to disable a performance event
d859e29f 1661 */
500ad2d8 1662int __perf_event_disable(void *info)
d859e29f 1663{
cdd6c482 1664 struct perf_event *event = info;
cdd6c482 1665 struct perf_event_context *ctx = event->ctx;
108b02cf 1666 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
d859e29f
PM
1667
1668 /*
cdd6c482
IM
1669 * If this is a per-task event, need to check whether this
1670 * event's task is the current task on this cpu.
fe4b04fa
PZ
1671 *
1672 * Can trigger due to concurrent perf_event_context_sched_out()
1673 * flipping contexts around.
d859e29f 1674 */
665c2142 1675 if (ctx->task && cpuctx->task_ctx != ctx)
fe4b04fa 1676 return -EINVAL;
d859e29f 1677
e625cce1 1678 raw_spin_lock(&ctx->lock);
d859e29f
PM
1679
1680 /*
cdd6c482 1681 * If the event is on, turn it off.
d859e29f
PM
1682 * If it is in error state, leave it in error state.
1683 */
cdd6c482 1684 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
4af4998b 1685 update_context_time(ctx);
e5d1367f 1686 update_cgrp_time_from_event(event);
cdd6c482
IM
1687 update_group_times(event);
1688 if (event == event->group_leader)
1689 group_sched_out(event, cpuctx, ctx);
d859e29f 1690 else
cdd6c482
IM
1691 event_sched_out(event, cpuctx, ctx);
1692 event->state = PERF_EVENT_STATE_OFF;
d859e29f
PM
1693 }
1694
e625cce1 1695 raw_spin_unlock(&ctx->lock);
fe4b04fa
PZ
1696
1697 return 0;
d859e29f
PM
1698}
1699
1700/*
cdd6c482 1701 * Disable a event.
c93f7669 1702 *
cdd6c482
IM
1703 * If event->ctx is a cloned context, callers must make sure that
1704 * every task struct that event->ctx->task could possibly point to
c93f7669 1705 * remains valid. This condition is satisifed when called through
cdd6c482
IM
1706 * perf_event_for_each_child or perf_event_for_each because they
1707 * hold the top-level event's child_mutex, so any descendant that
1708 * goes to exit will block in sync_child_event.
1709 * When called from perf_pending_event it's OK because event->ctx
c93f7669 1710 * is the current context on this CPU and preemption is disabled,
cdd6c482 1711 * hence we can't get into perf_event_task_sched_out for this context.
d859e29f 1712 */
f63a8daa 1713static void _perf_event_disable(struct perf_event *event)
d859e29f 1714{
cdd6c482 1715 struct perf_event_context *ctx = event->ctx;
d859e29f
PM
1716 struct task_struct *task = ctx->task;
1717
1718 if (!task) {
1719 /*
cdd6c482 1720 * Disable the event on the cpu that it's on
d859e29f 1721 */
fe4b04fa 1722 cpu_function_call(event->cpu, __perf_event_disable, event);
d859e29f
PM
1723 return;
1724 }
1725
9ed6060d 1726retry:
fe4b04fa
PZ
1727 if (!task_function_call(task, __perf_event_disable, event))
1728 return;
d859e29f 1729
e625cce1 1730 raw_spin_lock_irq(&ctx->lock);
d859e29f 1731 /*
cdd6c482 1732 * If the event is still active, we need to retry the cross-call.
d859e29f 1733 */
cdd6c482 1734 if (event->state == PERF_EVENT_STATE_ACTIVE) {
e625cce1 1735 raw_spin_unlock_irq(&ctx->lock);
fe4b04fa
PZ
1736 /*
1737 * Reload the task pointer, it might have been changed by
1738 * a concurrent perf_event_context_sched_out().
1739 */
1740 task = ctx->task;
d859e29f
PM
1741 goto retry;
1742 }
1743
1744 /*
1745 * Since we have the lock this context can't be scheduled
1746 * in, so we can change the state safely.
1747 */
cdd6c482
IM
1748 if (event->state == PERF_EVENT_STATE_INACTIVE) {
1749 update_group_times(event);
1750 event->state = PERF_EVENT_STATE_OFF;
53cfbf59 1751 }
e625cce1 1752 raw_spin_unlock_irq(&ctx->lock);
d859e29f 1753}
f63a8daa
PZ
1754
1755/*
1756 * Strictly speaking kernel users cannot create groups and therefore this
1757 * interface does not need the perf_event_ctx_lock() magic.
1758 */
1759void perf_event_disable(struct perf_event *event)
1760{
1761 struct perf_event_context *ctx;
1762
1763 ctx = perf_event_ctx_lock(event);
1764 _perf_event_disable(event);
1765 perf_event_ctx_unlock(event, ctx);
1766}
dcfce4a0 1767EXPORT_SYMBOL_GPL(perf_event_disable);
d859e29f 1768
e5d1367f
SE
1769static void perf_set_shadow_time(struct perf_event *event,
1770 struct perf_event_context *ctx,
1771 u64 tstamp)
1772{
1773 /*
1774 * use the correct time source for the time snapshot
1775 *
1776 * We could get by without this by leveraging the
1777 * fact that to get to this function, the caller
1778 * has most likely already called update_context_time()
1779 * and update_cgrp_time_xx() and thus both timestamp
1780 * are identical (or very close). Given that tstamp is,
1781 * already adjusted for cgroup, we could say that:
1782 * tstamp - ctx->timestamp
1783 * is equivalent to
1784 * tstamp - cgrp->timestamp.
1785 *
1786 * Then, in perf_output_read(), the calculation would
1787 * work with no changes because:
1788 * - event is guaranteed scheduled in
1789 * - no scheduled out in between
1790 * - thus the timestamp would be the same
1791 *
1792 * But this is a bit hairy.
1793 *
1794 * So instead, we have an explicit cgroup call to remain
1795 * within the time time source all along. We believe it
1796 * is cleaner and simpler to understand.
1797 */
1798 if (is_cgroup_event(event))
1799 perf_cgroup_set_shadow_time(event, tstamp);
1800 else
1801 event->shadow_ctx_time = tstamp - ctx->timestamp;
1802}
1803
4fe757dd
PZ
1804#define MAX_INTERRUPTS (~0ULL)
1805
1806static void perf_log_throttle(struct perf_event *event, int enable);
ec0d7729 1807static void perf_log_itrace_start(struct perf_event *event);
4fe757dd 1808
235c7fc7 1809static int
9ffcfa6f 1810event_sched_in(struct perf_event *event,
235c7fc7 1811 struct perf_cpu_context *cpuctx,
6e37738a 1812 struct perf_event_context *ctx)
235c7fc7 1813{
4158755d 1814 u64 tstamp = perf_event_time(event);
44377277 1815 int ret = 0;
4158755d 1816
63342411
PZ
1817 lockdep_assert_held(&ctx->lock);
1818
cdd6c482 1819 if (event->state <= PERF_EVENT_STATE_OFF)
235c7fc7
IM
1820 return 0;
1821
cdd6c482 1822 event->state = PERF_EVENT_STATE_ACTIVE;
6e37738a 1823 event->oncpu = smp_processor_id();
4fe757dd
PZ
1824
1825 /*
1826 * Unthrottle events, since we scheduled we might have missed several
1827 * ticks already, also for a heavily scheduling task there is little
1828 * guarantee it'll get a tick in a timely manner.
1829 */
1830 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
1831 perf_log_throttle(event, 1);
1832 event->hw.interrupts = 0;
1833 }
1834
235c7fc7
IM
1835 /*
1836 * The new state must be visible before we turn it on in the hardware:
1837 */
1838 smp_wmb();
1839
44377277
AS
1840 perf_pmu_disable(event->pmu);
1841
72f669c0
SL
1842 event->tstamp_running += tstamp - event->tstamp_stopped;
1843
1844 perf_set_shadow_time(event, ctx, tstamp);
1845
ec0d7729
AS
1846 perf_log_itrace_start(event);
1847
a4eaf7f1 1848 if (event->pmu->add(event, PERF_EF_START)) {
cdd6c482
IM
1849 event->state = PERF_EVENT_STATE_INACTIVE;
1850 event->oncpu = -1;
44377277
AS
1851 ret = -EAGAIN;
1852 goto out;
235c7fc7
IM
1853 }
1854
cdd6c482 1855 if (!is_software_event(event))
3b6f9e5c 1856 cpuctx->active_oncpu++;
2fde4f94
MR
1857 if (!ctx->nr_active++)
1858 perf_event_ctx_activate(ctx);
0f5a2601
PZ
1859 if (event->attr.freq && event->attr.sample_freq)
1860 ctx->nr_freq++;
235c7fc7 1861
cdd6c482 1862 if (event->attr.exclusive)
3b6f9e5c
PM
1863 cpuctx->exclusive = 1;
1864
fadfe7be
JO
1865 if (is_orphaned_child(event))
1866 schedule_orphans_remove(ctx);
1867
44377277
AS
1868out:
1869 perf_pmu_enable(event->pmu);
1870
1871 return ret;
235c7fc7
IM
1872}
1873
6751b71e 1874static int
cdd6c482 1875group_sched_in(struct perf_event *group_event,
6751b71e 1876 struct perf_cpu_context *cpuctx,
6e37738a 1877 struct perf_event_context *ctx)
6751b71e 1878{
6bde9b6c 1879 struct perf_event *event, *partial_group = NULL;
4a234593 1880 struct pmu *pmu = ctx->pmu;
d7842da4
SE
1881 u64 now = ctx->time;
1882 bool simulate = false;
6751b71e 1883
cdd6c482 1884 if (group_event->state == PERF_EVENT_STATE_OFF)
6751b71e
PM
1885 return 0;
1886
ad5133b7 1887 pmu->start_txn(pmu);
6bde9b6c 1888
9ffcfa6f 1889 if (event_sched_in(group_event, cpuctx, ctx)) {
ad5133b7 1890 pmu->cancel_txn(pmu);
272325c4 1891 perf_mux_hrtimer_restart(cpuctx);
6751b71e 1892 return -EAGAIN;
90151c35 1893 }
6751b71e
PM
1894
1895 /*
1896 * Schedule in siblings as one group (if any):
1897 */
cdd6c482 1898 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
9ffcfa6f 1899 if (event_sched_in(event, cpuctx, ctx)) {
cdd6c482 1900 partial_group = event;
6751b71e
PM
1901 goto group_error;
1902 }
1903 }
1904
9ffcfa6f 1905 if (!pmu->commit_txn(pmu))
6e85158c 1906 return 0;
9ffcfa6f 1907
6751b71e
PM
1908group_error:
1909 /*
1910 * Groups can be scheduled in as one unit only, so undo any
1911 * partial group before returning:
d7842da4
SE
1912 * The events up to the failed event are scheduled out normally,
1913 * tstamp_stopped will be updated.
1914 *
1915 * The failed events and the remaining siblings need to have
1916 * their timings updated as if they had gone thru event_sched_in()
1917 * and event_sched_out(). This is required to get consistent timings
1918 * across the group. This also takes care of the case where the group
1919 * could never be scheduled by ensuring tstamp_stopped is set to mark
1920 * the time the event was actually stopped, such that time delta
1921 * calculation in update_event_times() is correct.
6751b71e 1922 */
cdd6c482
IM
1923 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1924 if (event == partial_group)
d7842da4
SE
1925 simulate = true;
1926
1927 if (simulate) {
1928 event->tstamp_running += now - event->tstamp_stopped;
1929 event->tstamp_stopped = now;
1930 } else {
1931 event_sched_out(event, cpuctx, ctx);
1932 }
6751b71e 1933 }
9ffcfa6f 1934 event_sched_out(group_event, cpuctx, ctx);
6751b71e 1935
ad5133b7 1936 pmu->cancel_txn(pmu);
90151c35 1937
272325c4 1938 perf_mux_hrtimer_restart(cpuctx);
9e630205 1939
6751b71e
PM
1940 return -EAGAIN;
1941}
1942
3b6f9e5c 1943/*
cdd6c482 1944 * Work out whether we can put this event group on the CPU now.
3b6f9e5c 1945 */
cdd6c482 1946static int group_can_go_on(struct perf_event *event,
3b6f9e5c
PM
1947 struct perf_cpu_context *cpuctx,
1948 int can_add_hw)
1949{
1950 /*
cdd6c482 1951 * Groups consisting entirely of software events can always go on.
3b6f9e5c 1952 */
d6f962b5 1953 if (event->group_flags & PERF_GROUP_SOFTWARE)
3b6f9e5c
PM
1954 return 1;
1955 /*
1956 * If an exclusive group is already on, no other hardware
cdd6c482 1957 * events can go on.
3b6f9e5c
PM
1958 */
1959 if (cpuctx->exclusive)
1960 return 0;
1961 /*
1962 * If this group is exclusive and there are already
cdd6c482 1963 * events on the CPU, it can't go on.
3b6f9e5c 1964 */
cdd6c482 1965 if (event->attr.exclusive && cpuctx->active_oncpu)
3b6f9e5c
PM
1966 return 0;
1967 /*
1968 * Otherwise, try to add it if all previous groups were able
1969 * to go on.
1970 */
1971 return can_add_hw;
1972}
1973
cdd6c482
IM
1974static void add_event_to_ctx(struct perf_event *event,
1975 struct perf_event_context *ctx)
53cfbf59 1976{
4158755d
SE
1977 u64 tstamp = perf_event_time(event);
1978
cdd6c482 1979 list_add_event(event, ctx);
8a49542c 1980 perf_group_attach(event);
4158755d
SE
1981 event->tstamp_enabled = tstamp;
1982 event->tstamp_running = tstamp;
1983 event->tstamp_stopped = tstamp;
53cfbf59
PM
1984}
1985
2c29ef0f
PZ
1986static void task_ctx_sched_out(struct perf_event_context *ctx);
1987static void
1988ctx_sched_in(struct perf_event_context *ctx,
1989 struct perf_cpu_context *cpuctx,
1990 enum event_type_t event_type,
1991 struct task_struct *task);
fe4b04fa 1992
dce5855b
PZ
1993static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
1994 struct perf_event_context *ctx,
1995 struct task_struct *task)
1996{
1997 cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
1998 if (ctx)
1999 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2000 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2001 if (ctx)
2002 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2003}
2004
0793a61d 2005/*
cdd6c482 2006 * Cross CPU call to install and enable a performance event
682076ae
PZ
2007 *
2008 * Must be called with ctx->mutex held
0793a61d 2009 */
fe4b04fa 2010static int __perf_install_in_context(void *info)
0793a61d 2011{
cdd6c482
IM
2012 struct perf_event *event = info;
2013 struct perf_event_context *ctx = event->ctx;
108b02cf 2014 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2c29ef0f
PZ
2015 struct perf_event_context *task_ctx = cpuctx->task_ctx;
2016 struct task_struct *task = current;
2017
b58f6b0d 2018 perf_ctx_lock(cpuctx, task_ctx);
2c29ef0f 2019 perf_pmu_disable(cpuctx->ctx.pmu);
0793a61d
TG
2020
2021 /*
2c29ef0f 2022 * If there was an active task_ctx schedule it out.
0793a61d 2023 */
b58f6b0d 2024 if (task_ctx)
2c29ef0f 2025 task_ctx_sched_out(task_ctx);
b58f6b0d
PZ
2026
2027 /*
2028 * If the context we're installing events in is not the
2029 * active task_ctx, flip them.
2030 */
2031 if (ctx->task && task_ctx != ctx) {
2032 if (task_ctx)
2033 raw_spin_unlock(&task_ctx->lock);
2034 raw_spin_lock(&ctx->lock);
2035 task_ctx = ctx;
2036 }
2037
2038 if (task_ctx) {
2039 cpuctx->task_ctx = task_ctx;
2c29ef0f
PZ
2040 task = task_ctx->task;
2041 }
b58f6b0d 2042
2c29ef0f 2043 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
0793a61d 2044
4af4998b 2045 update_context_time(ctx);
e5d1367f
SE
2046 /*
2047 * update cgrp time only if current cgrp
2048 * matches event->cgrp. Must be done before
2049 * calling add_event_to_ctx()
2050 */
2051 update_cgrp_time_from_event(event);
0793a61d 2052
cdd6c482 2053 add_event_to_ctx(event, ctx);
0793a61d 2054
d859e29f 2055 /*
2c29ef0f 2056 * Schedule everything back in
d859e29f 2057 */
dce5855b 2058 perf_event_sched_in(cpuctx, task_ctx, task);
2c29ef0f
PZ
2059
2060 perf_pmu_enable(cpuctx->ctx.pmu);
2061 perf_ctx_unlock(cpuctx, task_ctx);
fe4b04fa
PZ
2062
2063 return 0;
0793a61d
TG
2064}
2065
2066/*
cdd6c482 2067 * Attach a performance event to a context
0793a61d 2068 *
cdd6c482
IM
2069 * First we add the event to the list with the hardware enable bit
2070 * in event->hw_config cleared.
0793a61d 2071 *
cdd6c482 2072 * If the event is attached to a task which is on a CPU we use a smp
0793a61d
TG
2073 * call to enable it in the task context. The task might have been
2074 * scheduled away, but we check this in the smp call again.
2075 */
2076static void
cdd6c482
IM
2077perf_install_in_context(struct perf_event_context *ctx,
2078 struct perf_event *event,
0793a61d
TG
2079 int cpu)
2080{
2081 struct task_struct *task = ctx->task;
2082
fe4b04fa
PZ
2083 lockdep_assert_held(&ctx->mutex);
2084
c3f00c70 2085 event->ctx = ctx;
0cda4c02
YZ
2086 if (event->cpu != -1)
2087 event->cpu = cpu;
c3f00c70 2088
0793a61d
TG
2089 if (!task) {
2090 /*
cdd6c482 2091 * Per cpu events are installed via an smp call and
af901ca1 2092 * the install is always successful.
0793a61d 2093 */
fe4b04fa 2094 cpu_function_call(cpu, __perf_install_in_context, event);
0793a61d
TG
2095 return;
2096 }
2097
0793a61d 2098retry:
fe4b04fa
PZ
2099 if (!task_function_call(task, __perf_install_in_context, event))
2100 return;
0793a61d 2101
e625cce1 2102 raw_spin_lock_irq(&ctx->lock);
0793a61d 2103 /*
fe4b04fa
PZ
2104 * If we failed to find a running task, but find the context active now
2105 * that we've acquired the ctx->lock, retry.
0793a61d 2106 */
fe4b04fa 2107 if (ctx->is_active) {
e625cce1 2108 raw_spin_unlock_irq(&ctx->lock);
3577af70
CW
2109 /*
2110 * Reload the task pointer, it might have been changed by
2111 * a concurrent perf_event_context_sched_out().
2112 */
2113 task = ctx->task;
0793a61d
TG
2114 goto retry;
2115 }
2116
2117 /*
fe4b04fa
PZ
2118 * Since the task isn't running, its safe to add the event, us holding
2119 * the ctx->lock ensures the task won't get scheduled in.
0793a61d 2120 */
fe4b04fa 2121 add_event_to_ctx(event, ctx);
e625cce1 2122 raw_spin_unlock_irq(&ctx->lock);
0793a61d
TG
2123}
2124
fa289bec 2125/*
cdd6c482 2126 * Put a event into inactive state and update time fields.
fa289bec
PM
2127 * Enabling the leader of a group effectively enables all
2128 * the group members that aren't explicitly disabled, so we
2129 * have to update their ->tstamp_enabled also.
2130 * Note: this works for group members as well as group leaders
2131 * since the non-leader members' sibling_lists will be empty.
2132 */
1d9b482e 2133static void __perf_event_mark_enabled(struct perf_event *event)
fa289bec 2134{
cdd6c482 2135 struct perf_event *sub;
4158755d 2136 u64 tstamp = perf_event_time(event);
fa289bec 2137
cdd6c482 2138 event->state = PERF_EVENT_STATE_INACTIVE;
4158755d 2139 event->tstamp_enabled = tstamp - event->total_time_enabled;
9ed6060d 2140 list_for_each_entry(sub, &event->sibling_list, group_entry) {
4158755d
SE
2141 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
2142 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
9ed6060d 2143 }
fa289bec
PM
2144}
2145
d859e29f 2146/*
cdd6c482 2147 * Cross CPU call to enable a performance event
d859e29f 2148 */
fe4b04fa 2149static int __perf_event_enable(void *info)
04289bb9 2150{
cdd6c482 2151 struct perf_event *event = info;
cdd6c482
IM
2152 struct perf_event_context *ctx = event->ctx;
2153 struct perf_event *leader = event->group_leader;
108b02cf 2154 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
d859e29f 2155 int err;
04289bb9 2156
06f41796
JO
2157 /*
2158 * There's a time window between 'ctx->is_active' check
2159 * in perf_event_enable function and this place having:
2160 * - IRQs on
2161 * - ctx->lock unlocked
2162 *
2163 * where the task could be killed and 'ctx' deactivated
2164 * by perf_event_exit_task.
2165 */
2166 if (!ctx->is_active)
fe4b04fa 2167 return -EINVAL;
3cbed429 2168
e625cce1 2169 raw_spin_lock(&ctx->lock);
4af4998b 2170 update_context_time(ctx);
d859e29f 2171
cdd6c482 2172 if (event->state >= PERF_EVENT_STATE_INACTIVE)
d859e29f 2173 goto unlock;
e5d1367f
SE
2174
2175 /*
2176 * set current task's cgroup time reference point
2177 */
3f7cce3c 2178 perf_cgroup_set_timestamp(current, ctx);
e5d1367f 2179
1d9b482e 2180 __perf_event_mark_enabled(event);
04289bb9 2181
e5d1367f
SE
2182 if (!event_filter_match(event)) {
2183 if (is_cgroup_event(event))
2184 perf_cgroup_defer_enabled(event);
f4c4176f 2185 goto unlock;
e5d1367f 2186 }
f4c4176f 2187
04289bb9 2188 /*
cdd6c482 2189 * If the event is in a group and isn't the group leader,
d859e29f 2190 * then don't put it on unless the group is on.
04289bb9 2191 */
cdd6c482 2192 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
d859e29f 2193 goto unlock;
3b6f9e5c 2194
cdd6c482 2195 if (!group_can_go_on(event, cpuctx, 1)) {
d859e29f 2196 err = -EEXIST;
e758a33d 2197 } else {
cdd6c482 2198 if (event == leader)
6e37738a 2199 err = group_sched_in(event, cpuctx, ctx);
e758a33d 2200 else
6e37738a 2201 err = event_sched_in(event, cpuctx, ctx);
e758a33d 2202 }
d859e29f
PM
2203
2204 if (err) {
2205 /*
cdd6c482 2206 * If this event can't go on and it's part of a
d859e29f
PM
2207 * group, then the whole group has to come off.
2208 */
9e630205 2209 if (leader != event) {
d859e29f 2210 group_sched_out(leader, cpuctx, ctx);
272325c4 2211 perf_mux_hrtimer_restart(cpuctx);
9e630205 2212 }
0d48696f 2213 if (leader->attr.pinned) {
53cfbf59 2214 update_group_times(leader);
cdd6c482 2215 leader->state = PERF_EVENT_STATE_ERROR;
53cfbf59 2216 }
d859e29f
PM
2217 }
2218
9ed6060d 2219unlock:
e625cce1 2220 raw_spin_unlock(&ctx->lock);
fe4b04fa
PZ
2221
2222 return 0;
d859e29f
PM
2223}
2224
2225/*
cdd6c482 2226 * Enable a event.
c93f7669 2227 *
cdd6c482
IM
2228 * If event->ctx is a cloned context, callers must make sure that
2229 * every task struct that event->ctx->task could possibly point to
c93f7669 2230 * remains valid. This condition is satisfied when called through
cdd6c482
IM
2231 * perf_event_for_each_child or perf_event_for_each as described
2232 * for perf_event_disable.
d859e29f 2233 */
f63a8daa 2234static void _perf_event_enable(struct perf_event *event)
d859e29f 2235{
cdd6c482 2236 struct perf_event_context *ctx = event->ctx;
d859e29f
PM
2237 struct task_struct *task = ctx->task;
2238
2239 if (!task) {
2240 /*
cdd6c482 2241 * Enable the event on the cpu that it's on
d859e29f 2242 */
fe4b04fa 2243 cpu_function_call(event->cpu, __perf_event_enable, event);
d859e29f
PM
2244 return;
2245 }
2246
e625cce1 2247 raw_spin_lock_irq(&ctx->lock);
cdd6c482 2248 if (event->state >= PERF_EVENT_STATE_INACTIVE)
d859e29f
PM
2249 goto out;
2250
2251 /*
cdd6c482
IM
2252 * If the event is in error state, clear that first.
2253 * That way, if we see the event in error state below, we
d859e29f
PM
2254 * know that it has gone back into error state, as distinct
2255 * from the task having been scheduled away before the
2256 * cross-call arrived.
2257 */
cdd6c482
IM
2258 if (event->state == PERF_EVENT_STATE_ERROR)
2259 event->state = PERF_EVENT_STATE_OFF;
d859e29f 2260
9ed6060d 2261retry:
fe4b04fa 2262 if (!ctx->is_active) {
1d9b482e 2263 __perf_event_mark_enabled(event);
fe4b04fa
PZ
2264 goto out;
2265 }
2266
e625cce1 2267 raw_spin_unlock_irq(&ctx->lock);
fe4b04fa
PZ
2268
2269 if (!task_function_call(task, __perf_event_enable, event))
2270 return;
d859e29f 2271
e625cce1 2272 raw_spin_lock_irq(&ctx->lock);
d859e29f
PM
2273
2274 /*
cdd6c482 2275 * If the context is active and the event is still off,
d859e29f
PM
2276 * we need to retry the cross-call.
2277 */
fe4b04fa
PZ
2278 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
2279 /*
2280 * task could have been flipped by a concurrent
2281 * perf_event_context_sched_out()
2282 */
2283 task = ctx->task;
d859e29f 2284 goto retry;
fe4b04fa 2285 }
fa289bec 2286
9ed6060d 2287out:
e625cce1 2288 raw_spin_unlock_irq(&ctx->lock);
d859e29f 2289}
f63a8daa
PZ
2290
2291/*
2292 * See perf_event_disable();
2293 */
2294void perf_event_enable(struct perf_event *event)
2295{
2296 struct perf_event_context *ctx;
2297
2298 ctx = perf_event_ctx_lock(event);
2299 _perf_event_enable(event);
2300 perf_event_ctx_unlock(event, ctx);
2301}
dcfce4a0 2302EXPORT_SYMBOL_GPL(perf_event_enable);
d859e29f 2303
f63a8daa 2304static int _perf_event_refresh(struct perf_event *event, int refresh)
79f14641 2305{
2023b359 2306 /*
cdd6c482 2307 * not supported on inherited events
2023b359 2308 */
2e939d1d 2309 if (event->attr.inherit || !is_sampling_event(event))
2023b359
PZ
2310 return -EINVAL;
2311
cdd6c482 2312 atomic_add(refresh, &event->event_limit);
f63a8daa 2313 _perf_event_enable(event);
2023b359
PZ
2314
2315 return 0;
79f14641 2316}
f63a8daa
PZ
2317
2318/*
2319 * See perf_event_disable()
2320 */
2321int perf_event_refresh(struct perf_event *event, int refresh)
2322{
2323 struct perf_event_context *ctx;
2324 int ret;
2325
2326 ctx = perf_event_ctx_lock(event);
2327 ret = _perf_event_refresh(event, refresh);
2328 perf_event_ctx_unlock(event, ctx);
2329
2330 return ret;
2331}
26ca5c11 2332EXPORT_SYMBOL_GPL(perf_event_refresh);
79f14641 2333
5b0311e1
FW
2334static void ctx_sched_out(struct perf_event_context *ctx,
2335 struct perf_cpu_context *cpuctx,
2336 enum event_type_t event_type)
235c7fc7 2337{
cdd6c482 2338 struct perf_event *event;
db24d33e 2339 int is_active = ctx->is_active;
235c7fc7 2340
db24d33e 2341 ctx->is_active &= ~event_type;
cdd6c482 2342 if (likely(!ctx->nr_events))
facc4307
PZ
2343 return;
2344
4af4998b 2345 update_context_time(ctx);
e5d1367f 2346 update_cgrp_time_from_cpuctx(cpuctx);
5b0311e1 2347 if (!ctx->nr_active)
facc4307 2348 return;
5b0311e1 2349
075e0b00 2350 perf_pmu_disable(ctx->pmu);
db24d33e 2351 if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
889ff015
FW
2352 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
2353 group_sched_out(event, cpuctx, ctx);
9ed6060d 2354 }
889ff015 2355
db24d33e 2356 if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
889ff015 2357 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
8c9ed8e1 2358 group_sched_out(event, cpuctx, ctx);
9ed6060d 2359 }
1b9a644f 2360 perf_pmu_enable(ctx->pmu);
235c7fc7
IM
2361}
2362
564c2b21 2363/*
5a3126d4
PZ
2364 * Test whether two contexts are equivalent, i.e. whether they have both been
2365 * cloned from the same version of the same context.
2366 *
2367 * Equivalence is measured using a generation number in the context that is
2368 * incremented on each modification to it; see unclone_ctx(), list_add_event()
2369 * and list_del_event().
564c2b21 2370 */
cdd6c482
IM
2371static int context_equiv(struct perf_event_context *ctx1,
2372 struct perf_event_context *ctx2)
564c2b21 2373{
211de6eb
PZ
2374 lockdep_assert_held(&ctx1->lock);
2375 lockdep_assert_held(&ctx2->lock);
2376
5a3126d4
PZ
2377 /* Pinning disables the swap optimization */
2378 if (ctx1->pin_count || ctx2->pin_count)
2379 return 0;
2380
2381 /* If ctx1 is the parent of ctx2 */
2382 if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2383 return 1;
2384
2385 /* If ctx2 is the parent of ctx1 */
2386 if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2387 return 1;
2388
2389 /*
2390 * If ctx1 and ctx2 have the same parent; we flatten the parent
2391 * hierarchy, see perf_event_init_context().
2392 */
2393 if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2394 ctx1->parent_gen == ctx2->parent_gen)
2395 return 1;
2396
2397 /* Unmatched */
2398 return 0;
564c2b21
PM
2399}
2400
cdd6c482
IM
2401static void __perf_event_sync_stat(struct perf_event *event,
2402 struct perf_event *next_event)
bfbd3381
PZ
2403{
2404 u64 value;
2405
cdd6c482 2406 if (!event->attr.inherit_stat)
bfbd3381
PZ
2407 return;
2408
2409 /*
cdd6c482 2410 * Update the event value, we cannot use perf_event_read()
bfbd3381
PZ
2411 * because we're in the middle of a context switch and have IRQs
2412 * disabled, which upsets smp_call_function_single(), however
cdd6c482 2413 * we know the event must be on the current CPU, therefore we
bfbd3381
PZ
2414 * don't need to use it.
2415 */
cdd6c482
IM
2416 switch (event->state) {
2417 case PERF_EVENT_STATE_ACTIVE:
3dbebf15
PZ
2418 event->pmu->read(event);
2419 /* fall-through */
bfbd3381 2420
cdd6c482
IM
2421 case PERF_EVENT_STATE_INACTIVE:
2422 update_event_times(event);
bfbd3381
PZ
2423 break;
2424
2425 default:
2426 break;
2427 }
2428
2429 /*
cdd6c482 2430 * In order to keep per-task stats reliable we need to flip the event
bfbd3381
PZ
2431 * values when we flip the contexts.
2432 */
e7850595
PZ
2433 value = local64_read(&next_event->count);
2434 value = local64_xchg(&event->count, value);
2435 local64_set(&next_event->count, value);
bfbd3381 2436
cdd6c482
IM
2437 swap(event->total_time_enabled, next_event->total_time_enabled);
2438 swap(event->total_time_running, next_event->total_time_running);
19d2e755 2439
bfbd3381 2440 /*
19d2e755 2441 * Since we swizzled the values, update the user visible data too.
bfbd3381 2442 */
cdd6c482
IM
2443 perf_event_update_userpage(event);
2444 perf_event_update_userpage(next_event);
bfbd3381
PZ
2445}
2446
cdd6c482
IM
2447static void perf_event_sync_stat(struct perf_event_context *ctx,
2448 struct perf_event_context *next_ctx)
bfbd3381 2449{
cdd6c482 2450 struct perf_event *event, *next_event;
bfbd3381
PZ
2451
2452 if (!ctx->nr_stat)
2453 return;
2454
02ffdbc8
PZ
2455 update_context_time(ctx);
2456
cdd6c482
IM
2457 event = list_first_entry(&ctx->event_list,
2458 struct perf_event, event_entry);
bfbd3381 2459
cdd6c482
IM
2460 next_event = list_first_entry(&next_ctx->event_list,
2461 struct perf_event, event_entry);
bfbd3381 2462
cdd6c482
IM
2463 while (&event->event_entry != &ctx->event_list &&
2464 &next_event->event_entry != &next_ctx->event_list) {
bfbd3381 2465
cdd6c482 2466 __perf_event_sync_stat(event, next_event);
bfbd3381 2467
cdd6c482
IM
2468 event = list_next_entry(event, event_entry);
2469 next_event = list_next_entry(next_event, event_entry);
bfbd3381
PZ
2470 }
2471}
2472
fe4b04fa
PZ
2473static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2474 struct task_struct *next)
0793a61d 2475{
8dc85d54 2476 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
cdd6c482 2477 struct perf_event_context *next_ctx;
5a3126d4 2478 struct perf_event_context *parent, *next_parent;
108b02cf 2479 struct perf_cpu_context *cpuctx;
c93f7669 2480 int do_switch = 1;
0793a61d 2481
108b02cf
PZ
2482 if (likely(!ctx))
2483 return;
10989fb2 2484
108b02cf
PZ
2485 cpuctx = __get_cpu_context(ctx);
2486 if (!cpuctx->task_ctx)
0793a61d
TG
2487 return;
2488
c93f7669 2489 rcu_read_lock();
8dc85d54 2490 next_ctx = next->perf_event_ctxp[ctxn];
5a3126d4
PZ
2491 if (!next_ctx)
2492 goto unlock;
2493
2494 parent = rcu_dereference(ctx->parent_ctx);
2495 next_parent = rcu_dereference(next_ctx->parent_ctx);
2496
2497 /* If neither context have a parent context; they cannot be clones. */
802c8a61 2498 if (!parent && !next_parent)
5a3126d4
PZ
2499 goto unlock;
2500
2501 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
c93f7669
PM
2502 /*
2503 * Looks like the two contexts are clones, so we might be
2504 * able to optimize the context switch. We lock both
2505 * contexts and check that they are clones under the
2506 * lock (including re-checking that neither has been
2507 * uncloned in the meantime). It doesn't matter which
2508 * order we take the locks because no other cpu could
2509 * be trying to lock both of these tasks.
2510 */
e625cce1
TG
2511 raw_spin_lock(&ctx->lock);
2512 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
c93f7669 2513 if (context_equiv(ctx, next_ctx)) {
665c2142
PZ
2514 /*
2515 * XXX do we need a memory barrier of sorts
cdd6c482 2516 * wrt to rcu_dereference() of perf_event_ctxp
665c2142 2517 */
8dc85d54
PZ
2518 task->perf_event_ctxp[ctxn] = next_ctx;
2519 next->perf_event_ctxp[ctxn] = ctx;
c93f7669
PM
2520 ctx->task = next;
2521 next_ctx->task = task;
5a158c3c
YZ
2522
2523 swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
2524
c93f7669 2525 do_switch = 0;
bfbd3381 2526
cdd6c482 2527 perf_event_sync_stat(ctx, next_ctx);
c93f7669 2528 }
e625cce1
TG
2529 raw_spin_unlock(&next_ctx->lock);
2530 raw_spin_unlock(&ctx->lock);
564c2b21 2531 }
5a3126d4 2532unlock:
c93f7669 2533 rcu_read_unlock();
564c2b21 2534
c93f7669 2535 if (do_switch) {
facc4307 2536 raw_spin_lock(&ctx->lock);
5b0311e1 2537 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
c93f7669 2538 cpuctx->task_ctx = NULL;
facc4307 2539 raw_spin_unlock(&ctx->lock);
c93f7669 2540 }
0793a61d
TG
2541}
2542
ba532500
YZ
2543void perf_sched_cb_dec(struct pmu *pmu)
2544{
2545 this_cpu_dec(perf_sched_cb_usages);
2546}
2547
2548void perf_sched_cb_inc(struct pmu *pmu)
2549{
2550 this_cpu_inc(perf_sched_cb_usages);
2551}
2552
2553/*
2554 * This function provides the context switch callback to the lower code
2555 * layer. It is invoked ONLY when the context switch callback is enabled.
2556 */
2557static void perf_pmu_sched_task(struct task_struct *prev,
2558 struct task_struct *next,
2559 bool sched_in)
2560{
2561 struct perf_cpu_context *cpuctx;
2562 struct pmu *pmu;
2563 unsigned long flags;
2564
2565 if (prev == next)
2566 return;
2567
2568 local_irq_save(flags);
2569
2570 rcu_read_lock();
2571
2572 list_for_each_entry_rcu(pmu, &pmus, entry) {
2573 if (pmu->sched_task) {
2574 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2575
2576 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2577
2578 perf_pmu_disable(pmu);
2579
2580 pmu->sched_task(cpuctx->task_ctx, sched_in);
2581
2582 perf_pmu_enable(pmu);
2583
2584 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2585 }
2586 }
2587
2588 rcu_read_unlock();
2589
2590 local_irq_restore(flags);
2591}
2592
8dc85d54
PZ
2593#define for_each_task_context_nr(ctxn) \
2594 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
2595
2596/*
2597 * Called from scheduler to remove the events of the current task,
2598 * with interrupts disabled.
2599 *
2600 * We stop each event and update the event value in event->count.
2601 *
2602 * This does not protect us against NMI, but disable()
2603 * sets the disabled bit in the control field of event _before_
2604 * accessing the event control register. If a NMI hits, then it will
2605 * not restart the event.
2606 */
ab0cce56
JO
2607void __perf_event_task_sched_out(struct task_struct *task,
2608 struct task_struct *next)
8dc85d54
PZ
2609{
2610 int ctxn;
2611
ba532500
YZ
2612 if (__this_cpu_read(perf_sched_cb_usages))
2613 perf_pmu_sched_task(task, next, false);
2614
8dc85d54
PZ
2615 for_each_task_context_nr(ctxn)
2616 perf_event_context_sched_out(task, ctxn, next);
e5d1367f
SE
2617
2618 /*
2619 * if cgroup events exist on this CPU, then we need
2620 * to check if we have to switch out PMU state.
2621 * cgroup event are system-wide mode only
2622 */
4a32fea9 2623 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
a8d757ef 2624 perf_cgroup_sched_out(task, next);
8dc85d54
PZ
2625}
2626
04dc2dbb 2627static void task_ctx_sched_out(struct perf_event_context *ctx)
a08b159f 2628{
108b02cf 2629 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
a08b159f 2630
a63eaf34
PM
2631 if (!cpuctx->task_ctx)
2632 return;
012b84da
IM
2633
2634 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2635 return;
2636
04dc2dbb 2637 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
a08b159f
PM
2638 cpuctx->task_ctx = NULL;
2639}
2640
5b0311e1
FW
2641/*
2642 * Called with IRQs disabled
2643 */
2644static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
2645 enum event_type_t event_type)
2646{
2647 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
04289bb9
IM
2648}
2649
235c7fc7 2650static void
5b0311e1 2651ctx_pinned_sched_in(struct perf_event_context *ctx,
6e37738a 2652 struct perf_cpu_context *cpuctx)
0793a61d 2653{
cdd6c482 2654 struct perf_event *event;
0793a61d 2655
889ff015
FW
2656 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2657 if (event->state <= PERF_EVENT_STATE_OFF)
3b6f9e5c 2658 continue;
5632ab12 2659 if (!event_filter_match(event))
3b6f9e5c
PM
2660 continue;
2661
e5d1367f
SE
2662 /* may need to reset tstamp_enabled */
2663 if (is_cgroup_event(event))
2664 perf_cgroup_mark_enabled(event, ctx);
2665
8c9ed8e1 2666 if (group_can_go_on(event, cpuctx, 1))
6e37738a 2667 group_sched_in(event, cpuctx, ctx);
3b6f9e5c
PM
2668
2669 /*
2670 * If this pinned group hasn't been scheduled,
2671 * put it in error state.
2672 */
cdd6c482
IM
2673 if (event->state == PERF_EVENT_STATE_INACTIVE) {
2674 update_group_times(event);
2675 event->state = PERF_EVENT_STATE_ERROR;
53cfbf59 2676 }
3b6f9e5c 2677 }
5b0311e1
FW
2678}
2679
2680static void
2681ctx_flexible_sched_in(struct perf_event_context *ctx,
6e37738a 2682 struct perf_cpu_context *cpuctx)
5b0311e1
FW
2683{
2684 struct perf_event *event;
2685 int can_add_hw = 1;
3b6f9e5c 2686
889ff015
FW
2687 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2688 /* Ignore events in OFF or ERROR state */
2689 if (event->state <= PERF_EVENT_STATE_OFF)
3b6f9e5c 2690 continue;
04289bb9
IM
2691 /*
2692 * Listen to the 'cpu' scheduling filter constraint
cdd6c482 2693 * of events:
04289bb9 2694 */
5632ab12 2695 if (!event_filter_match(event))
0793a61d
TG
2696 continue;
2697
e5d1367f
SE
2698 /* may need to reset tstamp_enabled */
2699 if (is_cgroup_event(event))
2700 perf_cgroup_mark_enabled(event, ctx);
2701
9ed6060d 2702 if (group_can_go_on(event, cpuctx, can_add_hw)) {
6e37738a 2703 if (group_sched_in(event, cpuctx, ctx))
dd0e6ba2 2704 can_add_hw = 0;
9ed6060d 2705 }
0793a61d 2706 }
5b0311e1
FW
2707}
2708
2709static void
2710ctx_sched_in(struct perf_event_context *ctx,
2711 struct perf_cpu_context *cpuctx,
e5d1367f
SE
2712 enum event_type_t event_type,
2713 struct task_struct *task)
5b0311e1 2714{
e5d1367f 2715 u64 now;
db24d33e 2716 int is_active = ctx->is_active;
e5d1367f 2717
db24d33e 2718 ctx->is_active |= event_type;
5b0311e1 2719 if (likely(!ctx->nr_events))
facc4307 2720 return;
5b0311e1 2721
e5d1367f
SE
2722 now = perf_clock();
2723 ctx->timestamp = now;
3f7cce3c 2724 perf_cgroup_set_timestamp(task, ctx);
5b0311e1
FW
2725 /*
2726 * First go through the list and put on any pinned groups
2727 * in order to give them the best chance of going on.
2728 */
db24d33e 2729 if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
6e37738a 2730 ctx_pinned_sched_in(ctx, cpuctx);
5b0311e1
FW
2731
2732 /* Then walk through the lower prio flexible groups */
db24d33e 2733 if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
6e37738a 2734 ctx_flexible_sched_in(ctx, cpuctx);
235c7fc7
IM
2735}
2736
329c0e01 2737static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
e5d1367f
SE
2738 enum event_type_t event_type,
2739 struct task_struct *task)
329c0e01
FW
2740{
2741 struct perf_event_context *ctx = &cpuctx->ctx;
2742
e5d1367f 2743 ctx_sched_in(ctx, cpuctx, event_type, task);
329c0e01
FW
2744}
2745
e5d1367f
SE
2746static void perf_event_context_sched_in(struct perf_event_context *ctx,
2747 struct task_struct *task)
235c7fc7 2748{
108b02cf 2749 struct perf_cpu_context *cpuctx;
235c7fc7 2750
108b02cf 2751 cpuctx = __get_cpu_context(ctx);
329c0e01
FW
2752 if (cpuctx->task_ctx == ctx)
2753 return;
2754
facc4307 2755 perf_ctx_lock(cpuctx, ctx);
1b9a644f 2756 perf_pmu_disable(ctx->pmu);
329c0e01
FW
2757 /*
2758 * We want to keep the following priority order:
2759 * cpu pinned (that don't need to move), task pinned,
2760 * cpu flexible, task flexible.
2761 */
2762 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2763
1d5f003f
GN
2764 if (ctx->nr_events)
2765 cpuctx->task_ctx = ctx;
9b33fa6b 2766
86b47c25
GN
2767 perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
2768
facc4307
PZ
2769 perf_pmu_enable(ctx->pmu);
2770 perf_ctx_unlock(cpuctx, ctx);
235c7fc7
IM
2771}
2772
8dc85d54
PZ
2773/*
2774 * Called from scheduler to add the events of the current task
2775 * with interrupts disabled.
2776 *
2777 * We restore the event value and then enable it.
2778 *
2779 * This does not protect us against NMI, but enable()
2780 * sets the enabled bit in the control field of event _before_
2781 * accessing the event control register. If a NMI hits, then it will
2782 * keep the event running.
2783 */
ab0cce56
JO
2784void __perf_event_task_sched_in(struct task_struct *prev,
2785 struct task_struct *task)
8dc85d54
PZ
2786{
2787 struct perf_event_context *ctx;
2788 int ctxn;
2789
2790 for_each_task_context_nr(ctxn) {
2791 ctx = task->perf_event_ctxp[ctxn];
2792 if (likely(!ctx))
2793 continue;
2794
e5d1367f 2795 perf_event_context_sched_in(ctx, task);
8dc85d54 2796 }
e5d1367f
SE
2797 /*
2798 * if cgroup events exist on this CPU, then we need
2799 * to check if we have to switch in PMU state.
2800 * cgroup event are system-wide mode only
2801 */
4a32fea9 2802 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
a8d757ef 2803 perf_cgroup_sched_in(prev, task);
d010b332 2804
ba532500
YZ
2805 if (__this_cpu_read(perf_sched_cb_usages))
2806 perf_pmu_sched_task(prev, task, true);
235c7fc7
IM
2807}
2808
abd50713
PZ
2809static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
2810{
2811 u64 frequency = event->attr.sample_freq;
2812 u64 sec = NSEC_PER_SEC;
2813 u64 divisor, dividend;
2814
2815 int count_fls, nsec_fls, frequency_fls, sec_fls;
2816
2817 count_fls = fls64(count);
2818 nsec_fls = fls64(nsec);
2819 frequency_fls = fls64(frequency);
2820 sec_fls = 30;
2821
2822 /*
2823 * We got @count in @nsec, with a target of sample_freq HZ
2824 * the target period becomes:
2825 *
2826 * @count * 10^9
2827 * period = -------------------
2828 * @nsec * sample_freq
2829 *
2830 */
2831
2832 /*
2833 * Reduce accuracy by one bit such that @a and @b converge
2834 * to a similar magnitude.
2835 */
fe4b04fa 2836#define REDUCE_FLS(a, b) \
abd50713
PZ
2837do { \
2838 if (a##_fls > b##_fls) { \
2839 a >>= 1; \
2840 a##_fls--; \
2841 } else { \
2842 b >>= 1; \
2843 b##_fls--; \
2844 } \
2845} while (0)
2846
2847 /*
2848 * Reduce accuracy until either term fits in a u64, then proceed with
2849 * the other, so that finally we can do a u64/u64 division.
2850 */
2851 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
2852 REDUCE_FLS(nsec, frequency);
2853 REDUCE_FLS(sec, count);
2854 }
2855
2856 if (count_fls + sec_fls > 64) {
2857 divisor = nsec * frequency;
2858
2859 while (count_fls + sec_fls > 64) {
2860 REDUCE_FLS(count, sec);
2861 divisor >>= 1;
2862 }
2863
2864 dividend = count * sec;
2865 } else {
2866 dividend = count * sec;
2867
2868 while (nsec_fls + frequency_fls > 64) {
2869 REDUCE_FLS(nsec, frequency);
2870 dividend >>= 1;
2871 }
2872
2873 divisor = nsec * frequency;
2874 }
2875
f6ab91ad
PZ
2876 if (!divisor)
2877 return dividend;
2878
abd50713
PZ
2879 return div64_u64(dividend, divisor);
2880}
2881
e050e3f0
SE
2882static DEFINE_PER_CPU(int, perf_throttled_count);
2883static DEFINE_PER_CPU(u64, perf_throttled_seq);
2884
f39d47ff 2885static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
bd2b5b12 2886{
cdd6c482 2887 struct hw_perf_event *hwc = &event->hw;
f6ab91ad 2888 s64 period, sample_period;
bd2b5b12
PZ
2889 s64 delta;
2890
abd50713 2891 period = perf_calculate_period(event, nsec, count);
bd2b5b12
PZ
2892
2893 delta = (s64)(period - hwc->sample_period);
2894 delta = (delta + 7) / 8; /* low pass filter */
2895
2896 sample_period = hwc->sample_period + delta;
2897
2898 if (!sample_period)
2899 sample_period = 1;
2900
bd2b5b12 2901 hwc->sample_period = sample_period;
abd50713 2902
e7850595 2903 if (local64_read(&hwc->period_left) > 8*sample_period) {
f39d47ff
SE
2904 if (disable)
2905 event->pmu->stop(event, PERF_EF_UPDATE);
2906
e7850595 2907 local64_set(&hwc->period_left, 0);
f39d47ff
SE
2908
2909 if (disable)
2910 event->pmu->start(event, PERF_EF_RELOAD);
abd50713 2911 }
bd2b5b12
PZ
2912}
2913
e050e3f0
SE
2914/*
2915 * combine freq adjustment with unthrottling to avoid two passes over the
2916 * events. At the same time, make sure, having freq events does not change
2917 * the rate of unthrottling as that would introduce bias.
2918 */
2919static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2920 int needs_unthr)
60db5e09 2921{
cdd6c482
IM
2922 struct perf_event *event;
2923 struct hw_perf_event *hwc;
e050e3f0 2924 u64 now, period = TICK_NSEC;
abd50713 2925 s64 delta;
60db5e09 2926
e050e3f0
SE
2927 /*
2928 * only need to iterate over all events iff:
2929 * - context have events in frequency mode (needs freq adjust)
2930 * - there are events to unthrottle on this cpu
2931 */
2932 if (!(ctx->nr_freq || needs_unthr))
0f5a2601
PZ
2933 return;
2934
e050e3f0 2935 raw_spin_lock(&ctx->lock);
f39d47ff 2936 perf_pmu_disable(ctx->pmu);
e050e3f0 2937
03541f8b 2938 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
cdd6c482 2939 if (event->state != PERF_EVENT_STATE_ACTIVE)
60db5e09
PZ
2940 continue;
2941
5632ab12 2942 if (!event_filter_match(event))
5d27c23d
PZ
2943 continue;
2944
44377277
AS
2945 perf_pmu_disable(event->pmu);
2946
cdd6c482 2947 hwc = &event->hw;
6a24ed6c 2948
ae23bff1 2949 if (hwc->interrupts == MAX_INTERRUPTS) {
e050e3f0 2950 hwc->interrupts = 0;
cdd6c482 2951 perf_log_throttle(event, 1);
a4eaf7f1 2952 event->pmu->start(event, 0);
a78ac325
PZ
2953 }
2954
cdd6c482 2955 if (!event->attr.freq || !event->attr.sample_freq)
44377277 2956 goto next;
60db5e09 2957
e050e3f0
SE
2958 /*
2959 * stop the event and update event->count
2960 */
2961 event->pmu->stop(event, PERF_EF_UPDATE);
2962
e7850595 2963 now = local64_read(&event->count);
abd50713
PZ
2964 delta = now - hwc->freq_count_stamp;
2965 hwc->freq_count_stamp = now;
60db5e09 2966
e050e3f0
SE
2967 /*
2968 * restart the event
2969 * reload only if value has changed
f39d47ff
SE
2970 * we have stopped the event so tell that
2971 * to perf_adjust_period() to avoid stopping it
2972 * twice.
e050e3f0 2973 */
abd50713 2974 if (delta > 0)
f39d47ff 2975 perf_adjust_period(event, period, delta, false);
e050e3f0
SE
2976
2977 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
44377277
AS
2978 next:
2979 perf_pmu_enable(event->pmu);
60db5e09 2980 }
e050e3f0 2981
f39d47ff 2982 perf_pmu_enable(ctx->pmu);
e050e3f0 2983 raw_spin_unlock(&ctx->lock);
60db5e09
PZ
2984}
2985
235c7fc7 2986/*
cdd6c482 2987 * Round-robin a context's events:
235c7fc7 2988 */
cdd6c482 2989static void rotate_ctx(struct perf_event_context *ctx)
0793a61d 2990{
dddd3379
TG
2991 /*
2992 * Rotate the first entry last of non-pinned groups. Rotation might be
2993 * disabled by the inheritance code.
2994 */
2995 if (!ctx->rotate_disable)
2996 list_rotate_left(&ctx->flexible_groups);
235c7fc7
IM
2997}
2998
9e630205 2999static int perf_rotate_context(struct perf_cpu_context *cpuctx)
235c7fc7 3000{
8dc85d54 3001 struct perf_event_context *ctx = NULL;
2fde4f94 3002 int rotate = 0;
7fc23a53 3003
b5ab4cd5 3004 if (cpuctx->ctx.nr_events) {
b5ab4cd5
PZ
3005 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
3006 rotate = 1;
3007 }
235c7fc7 3008
8dc85d54 3009 ctx = cpuctx->task_ctx;
b5ab4cd5 3010 if (ctx && ctx->nr_events) {
b5ab4cd5
PZ
3011 if (ctx->nr_events != ctx->nr_active)
3012 rotate = 1;
3013 }
9717e6cd 3014
e050e3f0 3015 if (!rotate)
0f5a2601
PZ
3016 goto done;
3017
facc4307 3018 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
1b9a644f 3019 perf_pmu_disable(cpuctx->ctx.pmu);
60db5e09 3020
e050e3f0
SE
3021 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3022 if (ctx)
3023 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
0793a61d 3024
e050e3f0
SE
3025 rotate_ctx(&cpuctx->ctx);
3026 if (ctx)
3027 rotate_ctx(ctx);
235c7fc7 3028
e050e3f0 3029 perf_event_sched_in(cpuctx, ctx, current);
235c7fc7 3030
0f5a2601
PZ
3031 perf_pmu_enable(cpuctx->ctx.pmu);
3032 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
b5ab4cd5 3033done:
9e630205
SE
3034
3035 return rotate;
e9d2b064
PZ
3036}
3037
026249ef
FW
3038#ifdef CONFIG_NO_HZ_FULL
3039bool perf_event_can_stop_tick(void)
3040{
948b26b6 3041 if (atomic_read(&nr_freq_events) ||
d84153d6 3042 __this_cpu_read(perf_throttled_count))
026249ef 3043 return false;
d84153d6
FW
3044 else
3045 return true;
026249ef
FW
3046}
3047#endif
3048
e9d2b064
PZ
3049void perf_event_task_tick(void)
3050{
2fde4f94
MR
3051 struct list_head *head = this_cpu_ptr(&active_ctx_list);
3052 struct perf_event_context *ctx, *tmp;
e050e3f0 3053 int throttled;
b5ab4cd5 3054
e9d2b064
PZ
3055 WARN_ON(!irqs_disabled());
3056
e050e3f0
SE
3057 __this_cpu_inc(perf_throttled_seq);
3058 throttled = __this_cpu_xchg(perf_throttled_count, 0);
3059
2fde4f94 3060 list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
e050e3f0 3061 perf_adjust_freq_unthr_context(ctx, throttled);
0793a61d
TG
3062}
3063
889ff015
FW
3064static int event_enable_on_exec(struct perf_event *event,
3065 struct perf_event_context *ctx)
3066{
3067 if (!event->attr.enable_on_exec)
3068 return 0;
3069
3070 event->attr.enable_on_exec = 0;
3071 if (event->state >= PERF_EVENT_STATE_INACTIVE)
3072 return 0;
3073
1d9b482e 3074 __perf_event_mark_enabled(event);
889ff015
FW
3075
3076 return 1;
3077}
3078
57e7986e 3079/*
cdd6c482 3080 * Enable all of a task's events that have been marked enable-on-exec.
57e7986e
PM
3081 * This expects task == current.
3082 */
8dc85d54 3083static void perf_event_enable_on_exec(struct perf_event_context *ctx)
57e7986e 3084{
211de6eb 3085 struct perf_event_context *clone_ctx = NULL;
cdd6c482 3086 struct perf_event *event;
57e7986e
PM
3087 unsigned long flags;
3088 int enabled = 0;
889ff015 3089 int ret;
57e7986e
PM
3090
3091 local_irq_save(flags);
cdd6c482 3092 if (!ctx || !ctx->nr_events)
57e7986e
PM
3093 goto out;
3094
e566b76e
SE
3095 /*
3096 * We must ctxsw out cgroup events to avoid conflict
3097 * when invoking perf_task_event_sched_in() later on
3098 * in this function. Otherwise we end up trying to
3099 * ctxswin cgroup events which are already scheduled
3100 * in.
3101 */
a8d757ef 3102 perf_cgroup_sched_out(current, NULL);
57e7986e 3103
e625cce1 3104 raw_spin_lock(&ctx->lock);
04dc2dbb 3105 task_ctx_sched_out(ctx);
57e7986e 3106
b79387ef 3107 list_for_each_entry(event, &ctx->event_list, event_entry) {
889ff015
FW
3108 ret = event_enable_on_exec(event, ctx);
3109 if (ret)
3110 enabled = 1;
57e7986e
PM
3111 }
3112
3113 /*
cdd6c482 3114 * Unclone this context if we enabled any event.
57e7986e 3115 */
71a851b4 3116 if (enabled)
211de6eb 3117 clone_ctx = unclone_ctx(ctx);
57e7986e 3118
e625cce1 3119 raw_spin_unlock(&ctx->lock);
57e7986e 3120
e566b76e
SE
3121 /*
3122 * Also calls ctxswin for cgroup events, if any:
3123 */
e5d1367f 3124 perf_event_context_sched_in(ctx, ctx->task);
9ed6060d 3125out:
57e7986e 3126 local_irq_restore(flags);
211de6eb
PZ
3127
3128 if (clone_ctx)
3129 put_ctx(clone_ctx);
57e7986e
PM
3130}
3131
e041e328
PZ
3132void perf_event_exec(void)
3133{
3134 struct perf_event_context *ctx;
3135 int ctxn;
3136
3137 rcu_read_lock();
3138 for_each_task_context_nr(ctxn) {
3139 ctx = current->perf_event_ctxp[ctxn];
3140 if (!ctx)
3141 continue;
3142
3143 perf_event_enable_on_exec(ctx);
3144 }
3145 rcu_read_unlock();
3146}
3147
0793a61d 3148/*
cdd6c482 3149 * Cross CPU call to read the hardware event
0793a61d 3150 */
cdd6c482 3151static void __perf_event_read(void *info)
0793a61d 3152{
cdd6c482
IM
3153 struct perf_event *event = info;
3154 struct perf_event_context *ctx = event->ctx;
108b02cf 3155 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
621a01ea 3156
e1ac3614
PM
3157 /*
3158 * If this is a task context, we need to check whether it is
3159 * the current task context of this cpu. If not it has been
3160 * scheduled out before the smp call arrived. In that case
cdd6c482
IM
3161 * event->count would have been updated to a recent sample
3162 * when the event was scheduled out.
e1ac3614
PM
3163 */
3164 if (ctx->task && cpuctx->task_ctx != ctx)
3165 return;
3166
e625cce1 3167 raw_spin_lock(&ctx->lock);
e5d1367f 3168 if (ctx->is_active) {
542e72fc 3169 update_context_time(ctx);
e5d1367f
SE
3170 update_cgrp_time_from_event(event);
3171 }
cdd6c482 3172 update_event_times(event);
542e72fc
PZ
3173 if (event->state == PERF_EVENT_STATE_ACTIVE)
3174 event->pmu->read(event);
e625cce1 3175 raw_spin_unlock(&ctx->lock);
0793a61d
TG
3176}
3177
b5e58793
PZ
3178static inline u64 perf_event_count(struct perf_event *event)
3179{
eacd3ecc
MF
3180 if (event->pmu->count)
3181 return event->pmu->count(event);
3182
3183 return __perf_event_count(event);
b5e58793
PZ
3184}
3185
cdd6c482 3186static u64 perf_event_read(struct perf_event *event)
0793a61d
TG
3187{
3188 /*
cdd6c482
IM
3189 * If event is enabled and currently active on a CPU, update the
3190 * value in the event structure:
0793a61d 3191 */
cdd6c482
IM
3192 if (event->state == PERF_EVENT_STATE_ACTIVE) {
3193 smp_call_function_single(event->oncpu,
3194 __perf_event_read, event, 1);
3195 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
2b8988c9
PZ
3196 struct perf_event_context *ctx = event->ctx;
3197 unsigned long flags;
3198
e625cce1 3199 raw_spin_lock_irqsave(&ctx->lock, flags);
c530ccd9
SE
3200 /*
3201 * may read while context is not active
3202 * (e.g., thread is blocked), in that case
3203 * we cannot update context time
3204 */
e5d1367f 3205 if (ctx->is_active) {
c530ccd9 3206 update_context_time(ctx);
e5d1367f
SE
3207 update_cgrp_time_from_event(event);
3208 }
cdd6c482 3209 update_event_times(event);
e625cce1 3210 raw_spin_unlock_irqrestore(&ctx->lock, flags);
0793a61d
TG
3211 }
3212
b5e58793 3213 return perf_event_count(event);
0793a61d
TG
3214}
3215
a63eaf34 3216/*
cdd6c482 3217 * Initialize the perf_event context in a task_struct:
a63eaf34 3218 */
eb184479 3219static void __perf_event_init_context(struct perf_event_context *ctx)
a63eaf34 3220{
e625cce1 3221 raw_spin_lock_init(&ctx->lock);
a63eaf34 3222 mutex_init(&ctx->mutex);
2fde4f94 3223 INIT_LIST_HEAD(&ctx->active_ctx_list);
889ff015
FW
3224 INIT_LIST_HEAD(&ctx->pinned_groups);
3225 INIT_LIST_HEAD(&ctx->flexible_groups);
a63eaf34
PM
3226 INIT_LIST_HEAD(&ctx->event_list);
3227 atomic_set(&ctx->refcount, 1);
fadfe7be 3228 INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
eb184479
PZ
3229}
3230
3231static struct perf_event_context *
3232alloc_perf_context(struct pmu *pmu, struct task_struct *task)
3233{
3234 struct perf_event_context *ctx;
3235
3236 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
3237 if (!ctx)
3238 return NULL;
3239
3240 __perf_event_init_context(ctx);
3241 if (task) {
3242 ctx->task = task;
3243 get_task_struct(task);
0793a61d 3244 }
eb184479
PZ
3245 ctx->pmu = pmu;
3246
3247 return ctx;
a63eaf34
PM
3248}
3249
2ebd4ffb
MH
3250static struct task_struct *
3251find_lively_task_by_vpid(pid_t vpid)
3252{
3253 struct task_struct *task;
3254 int err;
0793a61d
TG
3255
3256 rcu_read_lock();
2ebd4ffb 3257 if (!vpid)
0793a61d
TG
3258 task = current;
3259 else
2ebd4ffb 3260 task = find_task_by_vpid(vpid);
0793a61d
TG
3261 if (task)
3262 get_task_struct(task);
3263 rcu_read_unlock();
3264
3265 if (!task)
3266 return ERR_PTR(-ESRCH);
3267
0793a61d 3268 /* Reuse ptrace permission checks for now. */
c93f7669
PM
3269 err = -EACCES;
3270 if (!ptrace_may_access(task, PTRACE_MODE_READ))
3271 goto errout;
3272
2ebd4ffb
MH
3273 return task;
3274errout:
3275 put_task_struct(task);
3276 return ERR_PTR(err);
3277
3278}
3279
fe4b04fa
PZ
3280/*
3281 * Returns a matching context with refcount and pincount.
3282 */
108b02cf 3283static struct perf_event_context *
4af57ef2
YZ
3284find_get_context(struct pmu *pmu, struct task_struct *task,
3285 struct perf_event *event)
0793a61d 3286{
211de6eb 3287 struct perf_event_context *ctx, *clone_ctx = NULL;
22a4f650 3288 struct perf_cpu_context *cpuctx;
4af57ef2 3289 void *task_ctx_data = NULL;
25346b93 3290 unsigned long flags;
8dc85d54 3291 int ctxn, err;
4af57ef2 3292 int cpu = event->cpu;
0793a61d 3293
22a4ec72 3294 if (!task) {
cdd6c482 3295 /* Must be root to operate on a CPU event: */
0764771d 3296 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
0793a61d
TG
3297 return ERR_PTR(-EACCES);
3298
0793a61d 3299 /*
cdd6c482 3300 * We could be clever and allow to attach a event to an
0793a61d
TG
3301 * offline CPU and activate it when the CPU comes up, but
3302 * that's for later.
3303 */
f6325e30 3304 if (!cpu_online(cpu))
0793a61d
TG
3305 return ERR_PTR(-ENODEV);
3306
108b02cf 3307 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
0793a61d 3308 ctx = &cpuctx->ctx;
c93f7669 3309 get_ctx(ctx);
fe4b04fa 3310 ++ctx->pin_count;
0793a61d 3311
0793a61d
TG
3312 return ctx;
3313 }
3314
8dc85d54
PZ
3315 err = -EINVAL;
3316 ctxn = pmu->task_ctx_nr;
3317 if (ctxn < 0)
3318 goto errout;
3319
4af57ef2
YZ
3320 if (event->attach_state & PERF_ATTACH_TASK_DATA) {
3321 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
3322 if (!task_ctx_data) {
3323 err = -ENOMEM;
3324 goto errout;
3325 }
3326 }
3327
9ed6060d 3328retry:
8dc85d54 3329 ctx = perf_lock_task_context(task, ctxn, &flags);
c93f7669 3330 if (ctx) {
211de6eb 3331 clone_ctx = unclone_ctx(ctx);
fe4b04fa 3332 ++ctx->pin_count;
4af57ef2
YZ
3333
3334 if (task_ctx_data && !ctx->task_ctx_data) {
3335 ctx->task_ctx_data = task_ctx_data;
3336 task_ctx_data = NULL;
3337 }
e625cce1 3338 raw_spin_unlock_irqrestore(&ctx->lock, flags);
211de6eb
PZ
3339
3340 if (clone_ctx)
3341 put_ctx(clone_ctx);
9137fb28 3342 } else {
eb184479 3343 ctx = alloc_perf_context(pmu, task);
c93f7669
PM
3344 err = -ENOMEM;
3345 if (!ctx)
3346 goto errout;
eb184479 3347
4af57ef2
YZ
3348 if (task_ctx_data) {
3349 ctx->task_ctx_data = task_ctx_data;
3350 task_ctx_data = NULL;
3351 }
3352
dbe08d82
ON
3353 err = 0;
3354 mutex_lock(&task->perf_event_mutex);
3355 /*
3356 * If it has already passed perf_event_exit_task().
3357 * we must see PF_EXITING, it takes this mutex too.
3358 */
3359 if (task->flags & PF_EXITING)
3360 err = -ESRCH;
3361 else if (task->perf_event_ctxp[ctxn])
3362 err = -EAGAIN;
fe4b04fa 3363 else {
9137fb28 3364 get_ctx(ctx);
fe4b04fa 3365 ++ctx->pin_count;
dbe08d82 3366 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
fe4b04fa 3367 }
dbe08d82
ON
3368 mutex_unlock(&task->perf_event_mutex);
3369
3370 if (unlikely(err)) {
9137fb28 3371 put_ctx(ctx);
dbe08d82
ON
3372
3373 if (err == -EAGAIN)
3374 goto retry;
3375 goto errout;
a63eaf34
PM
3376 }
3377 }
3378
4af57ef2 3379 kfree(task_ctx_data);
0793a61d 3380 return ctx;
c93f7669 3381
9ed6060d 3382errout:
4af57ef2 3383 kfree(task_ctx_data);
c93f7669 3384 return ERR_PTR(err);
0793a61d
TG
3385}
3386
6fb2915d 3387static void perf_event_free_filter(struct perf_event *event);
2541517c 3388static void perf_event_free_bpf_prog(struct perf_event *event);
6fb2915d 3389
cdd6c482 3390static void free_event_rcu(struct rcu_head *head)
592903cd 3391{
cdd6c482 3392 struct perf_event *event;
592903cd 3393
cdd6c482
IM
3394 event = container_of(head, struct perf_event, rcu_head);
3395 if (event->ns)
3396 put_pid_ns(event->ns);
6fb2915d 3397 perf_event_free_filter(event);
2541517c 3398 perf_event_free_bpf_prog(event);
cdd6c482 3399 kfree(event);
592903cd
PZ
3400}
3401
b69cf536
PZ
3402static void ring_buffer_attach(struct perf_event *event,
3403 struct ring_buffer *rb);
925d519a 3404
4beb31f3 3405static void unaccount_event_cpu(struct perf_event *event, int cpu)
f1600952 3406{
4beb31f3
FW
3407 if (event->parent)
3408 return;
3409
4beb31f3
FW
3410 if (is_cgroup_event(event))
3411 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
3412}
925d519a 3413
4beb31f3
FW
3414static void unaccount_event(struct perf_event *event)
3415{
3416 if (event->parent)
3417 return;
3418
3419 if (event->attach_state & PERF_ATTACH_TASK)
3420 static_key_slow_dec_deferred(&perf_sched_events);
3421 if (event->attr.mmap || event->attr.mmap_data)
3422 atomic_dec(&nr_mmap_events);
3423 if (event->attr.comm)
3424 atomic_dec(&nr_comm_events);
3425 if (event->attr.task)
3426 atomic_dec(&nr_task_events);
948b26b6
FW
3427 if (event->attr.freq)
3428 atomic_dec(&nr_freq_events);
4beb31f3
FW
3429 if (is_cgroup_event(event))
3430 static_key_slow_dec_deferred(&perf_sched_events);
3431 if (has_branch_stack(event))
3432 static_key_slow_dec_deferred(&perf_sched_events);
3433
3434 unaccount_event_cpu(event, event->cpu);
3435}
925d519a 3436
bed5b25a
AS
3437/*
3438 * The following implement mutual exclusion of events on "exclusive" pmus
3439 * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
3440 * at a time, so we disallow creating events that might conflict, namely:
3441 *
3442 * 1) cpu-wide events in the presence of per-task events,
3443 * 2) per-task events in the presence of cpu-wide events,
3444 * 3) two matching events on the same context.
3445 *
3446 * The former two cases are handled in the allocation path (perf_event_alloc(),
3447 * __free_event()), the latter -- before the first perf_install_in_context().
3448 */
3449static int exclusive_event_init(struct perf_event *event)
3450{
3451 struct pmu *pmu = event->pmu;
3452
3453 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3454 return 0;
3455
3456 /*
3457 * Prevent co-existence of per-task and cpu-wide events on the
3458 * same exclusive pmu.
3459 *
3460 * Negative pmu::exclusive_cnt means there are cpu-wide
3461 * events on this "exclusive" pmu, positive means there are
3462 * per-task events.
3463 *
3464 * Since this is called in perf_event_alloc() path, event::ctx
3465 * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
3466 * to mean "per-task event", because unlike other attach states it
3467 * never gets cleared.
3468 */
3469 if (event->attach_state & PERF_ATTACH_TASK) {
3470 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
3471 return -EBUSY;
3472 } else {
3473 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
3474 return -EBUSY;
3475 }
3476
3477 return 0;
3478}
3479
3480static void exclusive_event_destroy(struct perf_event *event)
3481{
3482 struct pmu *pmu = event->pmu;
3483
3484 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3485 return;
3486
3487 /* see comment in exclusive_event_init() */
3488 if (event->attach_state & PERF_ATTACH_TASK)
3489 atomic_dec(&pmu->exclusive_cnt);
3490 else
3491 atomic_inc(&pmu->exclusive_cnt);
3492}
3493
3494static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
3495{
3496 if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) &&
3497 (e1->cpu == e2->cpu ||
3498 e1->cpu == -1 ||
3499 e2->cpu == -1))
3500 return true;
3501 return false;
3502}
3503
3504/* Called under the same ctx::mutex as perf_install_in_context() */
3505static bool exclusive_event_installable(struct perf_event *event,
3506 struct perf_event_context *ctx)
3507{
3508 struct perf_event *iter_event;
3509 struct pmu *pmu = event->pmu;
3510
3511 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3512 return true;
3513
3514 list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
3515 if (exclusive_event_match(iter_event, event))
3516 return false;
3517 }
3518
3519 return true;
3520}
3521
766d6c07
FW
3522static void __free_event(struct perf_event *event)
3523{
cdd6c482 3524 if (!event->parent) {
927c7a9e
FW
3525 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
3526 put_callchain_buffers();
f344011c 3527 }
9ee318a7 3528
766d6c07
FW
3529 if (event->destroy)
3530 event->destroy(event);
3531
3532 if (event->ctx)
3533 put_ctx(event->ctx);
3534
bed5b25a
AS
3535 if (event->pmu) {
3536 exclusive_event_destroy(event);
c464c76e 3537 module_put(event->pmu->module);
bed5b25a 3538 }
c464c76e 3539
766d6c07
FW
3540 call_rcu(&event->rcu_head, free_event_rcu);
3541}
683ede43
PZ
3542
3543static void _free_event(struct perf_event *event)
f1600952 3544{
e360adbe 3545 irq_work_sync(&event->pending);
925d519a 3546
4beb31f3 3547 unaccount_event(event);
9ee318a7 3548
76369139 3549 if (event->rb) {
9bb5d40c
PZ
3550 /*
3551 * Can happen when we close an event with re-directed output.
3552 *
3553 * Since we have a 0 refcount, perf_mmap_close() will skip
3554 * over us; possibly making our ring_buffer_put() the last.
3555 */
3556 mutex_lock(&event->mmap_mutex);
b69cf536 3557 ring_buffer_attach(event, NULL);
9bb5d40c 3558 mutex_unlock(&event->mmap_mutex);
a4be7c27
PZ
3559 }
3560
e5d1367f
SE
3561 if (is_cgroup_event(event))
3562 perf_detach_cgroup(event);
3563
766d6c07 3564 __free_event(event);
f1600952
PZ
3565}
3566
683ede43
PZ
3567/*
3568 * Used to free events which have a known refcount of 1, such as in error paths
3569 * where the event isn't exposed yet and inherited events.
3570 */
3571static void free_event(struct perf_event *event)
0793a61d 3572{
683ede43
PZ
3573 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
3574 "unexpected event refcount: %ld; ptr=%p\n",
3575 atomic_long_read(&event->refcount), event)) {
3576 /* leak to avoid use-after-free */
3577 return;
3578 }
0793a61d 3579
683ede43 3580 _free_event(event);
0793a61d
TG
3581}
3582
a66a3052 3583/*
f8697762 3584 * Remove user event from the owner task.
a66a3052 3585 */
f8697762 3586static void perf_remove_from_owner(struct perf_event *event)
fb0459d7 3587{
8882135b 3588 struct task_struct *owner;
fb0459d7 3589
8882135b
PZ
3590 rcu_read_lock();
3591 owner = ACCESS_ONCE(event->owner);
3592 /*
3593 * Matches the smp_wmb() in perf_event_exit_task(). If we observe
3594 * !owner it means the list deletion is complete and we can indeed
3595 * free this event, otherwise we need to serialize on
3596 * owner->perf_event_mutex.
3597 */
3598 smp_read_barrier_depends();
3599 if (owner) {
3600 /*
3601 * Since delayed_put_task_struct() also drops the last
3602 * task reference we can safely take a new reference
3603 * while holding the rcu_read_lock().
3604 */
3605 get_task_struct(owner);
3606 }
3607 rcu_read_unlock();
3608
3609 if (owner) {
f63a8daa
PZ
3610 /*
3611 * If we're here through perf_event_exit_task() we're already
3612 * holding ctx->mutex which would be an inversion wrt. the
3613 * normal lock order.
3614 *
3615 * However we can safely take this lock because its the child
3616 * ctx->mutex.
3617 */
3618 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
3619
8882135b
PZ
3620 /*
3621 * We have to re-check the event->owner field, if it is cleared
3622 * we raced with perf_event_exit_task(), acquiring the mutex
3623 * ensured they're done, and we can proceed with freeing the
3624 * event.
3625 */
3626 if (event->owner)
3627 list_del_init(&event->owner_entry);
3628 mutex_unlock(&owner->perf_event_mutex);
3629 put_task_struct(owner);
3630 }
f8697762
JO
3631}
3632
3633/*
3634 * Called when the last reference to the file is gone.
3635 */
3636static void put_event(struct perf_event *event)
3637{
a83fe28e 3638 struct perf_event_context *ctx;
f8697762
JO
3639
3640 if (!atomic_long_dec_and_test(&event->refcount))
3641 return;
3642
3643 if (!is_kernel_event(event))
3644 perf_remove_from_owner(event);
8882135b 3645
683ede43
PZ
3646 /*
3647 * There are two ways this annotation is useful:
3648 *
3649 * 1) there is a lock recursion from perf_event_exit_task
3650 * see the comment there.
3651 *
3652 * 2) there is a lock-inversion with mmap_sem through
3653 * perf_event_read_group(), which takes faults while
3654 * holding ctx->mutex, however this is called after
3655 * the last filedesc died, so there is no possibility
3656 * to trigger the AB-BA case.
3657 */
a83fe28e
PZ
3658 ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
3659 WARN_ON_ONCE(ctx->parent_ctx);
683ede43 3660 perf_remove_from_context(event, true);
d415a7f1 3661 perf_event_ctx_unlock(event, ctx);
683ede43
PZ
3662
3663 _free_event(event);
a6fa941d
AV
3664}
3665
683ede43
PZ
3666int perf_event_release_kernel(struct perf_event *event)
3667{
3668 put_event(event);
3669 return 0;
3670}
3671EXPORT_SYMBOL_GPL(perf_event_release_kernel);
3672
a6fa941d
AV
3673static int perf_release(struct inode *inode, struct file *file)
3674{
3675 put_event(file->private_data);
3676 return 0;
fb0459d7 3677}
fb0459d7 3678
fadfe7be
JO
3679/*
3680 * Remove all orphanes events from the context.
3681 */
3682static void orphans_remove_work(struct work_struct *work)
3683{
3684 struct perf_event_context *ctx;
3685 struct perf_event *event, *tmp;
3686
3687 ctx = container_of(work, struct perf_event_context,
3688 orphans_remove.work);
3689
3690 mutex_lock(&ctx->mutex);
3691 list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
3692 struct perf_event *parent_event = event->parent;
3693
3694 if (!is_orphaned_child(event))
3695 continue;
3696
3697 perf_remove_from_context(event, true);
3698
3699 mutex_lock(&parent_event->child_mutex);
3700 list_del_init(&event->child_list);
3701 mutex_unlock(&parent_event->child_mutex);
3702
3703 free_event(event);
3704 put_event(parent_event);
3705 }
3706
3707 raw_spin_lock_irq(&ctx->lock);
3708 ctx->orphans_remove_sched = false;
3709 raw_spin_unlock_irq(&ctx->lock);
3710 mutex_unlock(&ctx->mutex);
3711
3712 put_ctx(ctx);
3713}
3714
59ed446f 3715u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
e53c0994 3716{
cdd6c482 3717 struct perf_event *child;
e53c0994
PZ
3718 u64 total = 0;
3719
59ed446f
PZ
3720 *enabled = 0;
3721 *running = 0;
3722
6f10581a 3723 mutex_lock(&event->child_mutex);
cdd6c482 3724 total += perf_event_read(event);
59ed446f
PZ
3725 *enabled += event->total_time_enabled +
3726 atomic64_read(&event->child_total_time_enabled);
3727 *running += event->total_time_running +
3728 atomic64_read(&event->child_total_time_running);
3729
3730 list_for_each_entry(child, &event->child_list, child_list) {
cdd6c482 3731 total += perf_event_read(child);
59ed446f
PZ
3732 *enabled += child->total_time_enabled;
3733 *running += child->total_time_running;
3734 }
6f10581a 3735 mutex_unlock(&event->child_mutex);
e53c0994
PZ
3736
3737 return total;
3738}
fb0459d7 3739EXPORT_SYMBOL_GPL(perf_event_read_value);
e53c0994 3740
cdd6c482 3741static int perf_event_read_group(struct perf_event *event,
3dab77fb
PZ
3742 u64 read_format, char __user *buf)
3743{
cdd6c482 3744 struct perf_event *leader = event->group_leader, *sub;
6f10581a 3745 struct perf_event_context *ctx = leader->ctx;
f63a8daa 3746 int n = 0, size = 0, ret;
59ed446f 3747 u64 count, enabled, running;
f63a8daa
PZ
3748 u64 values[5];
3749
3750 lockdep_assert_held(&ctx->mutex);
abf4868b 3751
59ed446f 3752 count = perf_event_read_value(leader, &enabled, &running);
3dab77fb
PZ
3753
3754 values[n++] = 1 + leader->nr_siblings;
59ed446f
PZ
3755 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3756 values[n++] = enabled;
3757 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3758 values[n++] = running;
abf4868b
PZ
3759 values[n++] = count;
3760 if (read_format & PERF_FORMAT_ID)
3761 values[n++] = primary_event_id(leader);
3dab77fb
PZ
3762
3763 size = n * sizeof(u64);
3764
3765 if (copy_to_user(buf, values, size))
f63a8daa 3766 return -EFAULT;
3dab77fb 3767
6f10581a 3768 ret = size;
3dab77fb 3769
65abc865 3770 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
abf4868b 3771 n = 0;
3dab77fb 3772
59ed446f 3773 values[n++] = perf_event_read_value(sub, &enabled, &running);
abf4868b
PZ
3774 if (read_format & PERF_FORMAT_ID)
3775 values[n++] = primary_event_id(sub);
3776
3777 size = n * sizeof(u64);
3778
184d3da8 3779 if (copy_to_user(buf + ret, values, size)) {
f63a8daa 3780 return -EFAULT;
6f10581a 3781 }
abf4868b
PZ
3782
3783 ret += size;
3dab77fb
PZ
3784 }
3785
abf4868b 3786 return ret;
3dab77fb
PZ
3787}
3788
cdd6c482 3789static int perf_event_read_one(struct perf_event *event,
3dab77fb
PZ
3790 u64 read_format, char __user *buf)
3791{
59ed446f 3792 u64 enabled, running;
3dab77fb
PZ
3793 u64 values[4];
3794 int n = 0;
3795
59ed446f
PZ
3796 values[n++] = perf_event_read_value(event, &enabled, &running);
3797 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3798 values[n++] = enabled;
3799 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3800 values[n++] = running;
3dab77fb 3801 if (read_format & PERF_FORMAT_ID)
cdd6c482 3802 values[n++] = primary_event_id(event);
3dab77fb
PZ
3803
3804 if (copy_to_user(buf, values, n * sizeof(u64)))
3805 return -EFAULT;
3806
3807 return n * sizeof(u64);
3808}
3809
dc633982
JO
3810static bool is_event_hup(struct perf_event *event)
3811{
3812 bool no_children;
3813
3814 if (event->state != PERF_EVENT_STATE_EXIT)
3815 return false;
3816
3817 mutex_lock(&event->child_mutex);
3818 no_children = list_empty(&event->child_list);
3819 mutex_unlock(&event->child_mutex);
3820 return no_children;
3821}
3822
0793a61d 3823/*
cdd6c482 3824 * Read the performance event - simple non blocking version for now
0793a61d
TG
3825 */
3826static ssize_t
cdd6c482 3827perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
0793a61d 3828{
cdd6c482 3829 u64 read_format = event->attr.read_format;
3dab77fb 3830 int ret;
0793a61d 3831
3b6f9e5c 3832 /*
cdd6c482 3833 * Return end-of-file for a read on a event that is in
3b6f9e5c
PM
3834 * error state (i.e. because it was pinned but it couldn't be
3835 * scheduled on to the CPU at some point).
3836 */
cdd6c482 3837 if (event->state == PERF_EVENT_STATE_ERROR)
3b6f9e5c
PM
3838 return 0;
3839
c320c7b7 3840 if (count < event->read_size)
3dab77fb
PZ
3841 return -ENOSPC;
3842
cdd6c482 3843 WARN_ON_ONCE(event->ctx->parent_ctx);
3dab77fb 3844 if (read_format & PERF_FORMAT_GROUP)
cdd6c482 3845 ret = perf_event_read_group(event, read_format, buf);
3dab77fb 3846 else
cdd6c482 3847 ret = perf_event_read_one(event, read_format, buf);
0793a61d 3848
3dab77fb 3849 return ret;
0793a61d
TG
3850}
3851
0793a61d
TG
3852static ssize_t
3853perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3854{
cdd6c482 3855 struct perf_event *event = file->private_data;
f63a8daa
PZ
3856 struct perf_event_context *ctx;
3857 int ret;
0793a61d 3858
f63a8daa
PZ
3859 ctx = perf_event_ctx_lock(event);
3860 ret = perf_read_hw(event, buf, count);
3861 perf_event_ctx_unlock(event, ctx);
3862
3863 return ret;
0793a61d
TG
3864}
3865
3866static unsigned int perf_poll(struct file *file, poll_table *wait)
3867{
cdd6c482 3868 struct perf_event *event = file->private_data;
76369139 3869 struct ring_buffer *rb;
61b67684 3870 unsigned int events = POLLHUP;
c7138f37 3871
e708d7ad 3872 poll_wait(file, &event->waitq, wait);
179033b3 3873
dc633982 3874 if (is_event_hup(event))
179033b3 3875 return events;
c7138f37 3876
10c6db11 3877 /*
9bb5d40c
PZ
3878 * Pin the event->rb by taking event->mmap_mutex; otherwise
3879 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
10c6db11
PZ
3880 */
3881 mutex_lock(&event->mmap_mutex);
9bb5d40c
PZ
3882 rb = event->rb;
3883 if (rb)
76369139 3884 events = atomic_xchg(&rb->poll, 0);
10c6db11 3885 mutex_unlock(&event->mmap_mutex);
0793a61d
TG
3886 return events;
3887}
3888
f63a8daa 3889static void _perf_event_reset(struct perf_event *event)
6de6a7b9 3890{
cdd6c482 3891 (void)perf_event_read(event);
e7850595 3892 local64_set(&event->count, 0);
cdd6c482 3893 perf_event_update_userpage(event);
3df5edad
PZ
3894}
3895
c93f7669 3896/*
cdd6c482
IM
3897 * Holding the top-level event's child_mutex means that any
3898 * descendant process that has inherited this event will block
3899 * in sync_child_event if it goes to exit, thus satisfying the
3900 * task existence requirements of perf_event_enable/disable.
c93f7669 3901 */
cdd6c482
IM
3902static void perf_event_for_each_child(struct perf_event *event,
3903 void (*func)(struct perf_event *))
3df5edad 3904{
cdd6c482 3905 struct perf_event *child;
3df5edad 3906
cdd6c482 3907 WARN_ON_ONCE(event->ctx->parent_ctx);
f63a8daa 3908
cdd6c482
IM
3909 mutex_lock(&event->child_mutex);
3910 func(event);
3911 list_for_each_entry(child, &event->child_list, child_list)
3df5edad 3912 func(child);
cdd6c482 3913 mutex_unlock(&event->child_mutex);
3df5edad
PZ
3914}
3915
cdd6c482
IM
3916static void perf_event_for_each(struct perf_event *event,
3917 void (*func)(struct perf_event *))
3df5edad 3918{
cdd6c482
IM
3919 struct perf_event_context *ctx = event->ctx;
3920 struct perf_event *sibling;
3df5edad 3921
f63a8daa
PZ
3922 lockdep_assert_held(&ctx->mutex);
3923
cdd6c482 3924 event = event->group_leader;
75f937f2 3925
cdd6c482 3926 perf_event_for_each_child(event, func);
cdd6c482 3927 list_for_each_entry(sibling, &event->sibling_list, group_entry)
724b6daa 3928 perf_event_for_each_child(sibling, func);
6de6a7b9
PZ
3929}
3930
cdd6c482 3931static int perf_event_period(struct perf_event *event, u64 __user *arg)
08247e31 3932{
cdd6c482 3933 struct perf_event_context *ctx = event->ctx;
bad7192b 3934 int ret = 0, active;
08247e31
PZ
3935 u64 value;
3936
6c7e550f 3937 if (!is_sampling_event(event))
08247e31
PZ
3938 return -EINVAL;
3939
ad0cf347 3940 if (copy_from_user(&value, arg, sizeof(value)))
08247e31
PZ
3941 return -EFAULT;
3942
3943 if (!value)
3944 return -EINVAL;
3945
e625cce1 3946 raw_spin_lock_irq(&ctx->lock);
cdd6c482
IM
3947 if (event->attr.freq) {
3948 if (value > sysctl_perf_event_sample_rate) {
08247e31
PZ
3949 ret = -EINVAL;
3950 goto unlock;
3951 }
3952
cdd6c482 3953 event->attr.sample_freq = value;
08247e31 3954 } else {
cdd6c482
IM
3955 event->attr.sample_period = value;
3956 event->hw.sample_period = value;
08247e31 3957 }
bad7192b
PZ
3958
3959 active = (event->state == PERF_EVENT_STATE_ACTIVE);
3960 if (active) {
3961 perf_pmu_disable(ctx->pmu);
3962 event->pmu->stop(event, PERF_EF_UPDATE);
3963 }
3964
3965 local64_set(&event->hw.period_left, 0);
3966
3967 if (active) {
3968 event->pmu->start(event, PERF_EF_RELOAD);
3969 perf_pmu_enable(ctx->pmu);
3970 }
3971
08247e31 3972unlock:
e625cce1 3973 raw_spin_unlock_irq(&ctx->lock);
08247e31
PZ
3974
3975 return ret;
3976}
3977
ac9721f3
PZ
3978static const struct file_operations perf_fops;
3979
2903ff01 3980static inline int perf_fget_light(int fd, struct fd *p)
ac9721f3 3981{
2903ff01
AV
3982 struct fd f = fdget(fd);
3983 if (!f.file)
3984 return -EBADF;
ac9721f3 3985
2903ff01
AV
3986 if (f.file->f_op != &perf_fops) {
3987 fdput(f);
3988 return -EBADF;
ac9721f3 3989 }
2903ff01
AV
3990 *p = f;
3991 return 0;
ac9721f3
PZ
3992}
3993
3994static int perf_event_set_output(struct perf_event *event,
3995 struct perf_event *output_event);
6fb2915d 3996static int perf_event_set_filter(struct perf_event *event, void __user *arg);
2541517c 3997static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
a4be7c27 3998
f63a8daa 3999static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
d859e29f 4000{
cdd6c482 4001 void (*func)(struct perf_event *);
3df5edad 4002 u32 flags = arg;
d859e29f
PM
4003
4004 switch (cmd) {
cdd6c482 4005 case PERF_EVENT_IOC_ENABLE:
f63a8daa 4006 func = _perf_event_enable;
d859e29f 4007 break;
cdd6c482 4008 case PERF_EVENT_IOC_DISABLE:
f63a8daa 4009 func = _perf_event_disable;
79f14641 4010 break;
cdd6c482 4011 case PERF_EVENT_IOC_RESET:
f63a8daa 4012 func = _perf_event_reset;
6de6a7b9 4013 break;
3df5edad 4014
cdd6c482 4015 case PERF_EVENT_IOC_REFRESH:
f63a8daa 4016 return _perf_event_refresh(event, arg);
08247e31 4017
cdd6c482
IM
4018 case PERF_EVENT_IOC_PERIOD:
4019 return perf_event_period(event, (u64 __user *)arg);
08247e31 4020
cf4957f1
JO
4021 case PERF_EVENT_IOC_ID:
4022 {
4023 u64 id = primary_event_id(event);
4024
4025 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
4026 return -EFAULT;
4027 return 0;
4028 }
4029
cdd6c482 4030 case PERF_EVENT_IOC_SET_OUTPUT:
ac9721f3 4031 {
ac9721f3 4032 int ret;
ac9721f3 4033 if (arg != -1) {
2903ff01
AV
4034 struct perf_event *output_event;
4035 struct fd output;
4036 ret = perf_fget_light(arg, &output);
4037 if (ret)
4038 return ret;
4039 output_event = output.file->private_data;
4040 ret = perf_event_set_output(event, output_event);
4041 fdput(output);
4042 } else {
4043 ret = perf_event_set_output(event, NULL);
ac9721f3 4044 }
ac9721f3
PZ
4045 return ret;
4046 }
a4be7c27 4047
6fb2915d
LZ
4048 case PERF_EVENT_IOC_SET_FILTER:
4049 return perf_event_set_filter(event, (void __user *)arg);
4050
2541517c
AS
4051 case PERF_EVENT_IOC_SET_BPF:
4052 return perf_event_set_bpf_prog(event, arg);
4053
d859e29f 4054 default:
3df5edad 4055 return -ENOTTY;
d859e29f 4056 }
3df5edad
PZ
4057
4058 if (flags & PERF_IOC_FLAG_GROUP)
cdd6c482 4059 perf_event_for_each(event, func);
3df5edad 4060 else
cdd6c482 4061 perf_event_for_each_child(event, func);
3df5edad
PZ
4062
4063 return 0;
d859e29f
PM
4064}
4065
f63a8daa
PZ
4066static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
4067{
4068 struct perf_event *event = file->private_data;
4069 struct perf_event_context *ctx;
4070 long ret;
4071
4072 ctx = perf_event_ctx_lock(event);
4073 ret = _perf_ioctl(event, cmd, arg);
4074 perf_event_ctx_unlock(event, ctx);
4075
4076 return ret;
4077}
4078
b3f20785
PM
4079#ifdef CONFIG_COMPAT
4080static long perf_compat_ioctl(struct file *file, unsigned int cmd,
4081 unsigned long arg)
4082{
4083 switch (_IOC_NR(cmd)) {
4084 case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
4085 case _IOC_NR(PERF_EVENT_IOC_ID):
4086 /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */
4087 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
4088 cmd &= ~IOCSIZE_MASK;
4089 cmd |= sizeof(void *) << IOCSIZE_SHIFT;
4090 }
4091 break;
4092 }
4093 return perf_ioctl(file, cmd, arg);
4094}
4095#else
4096# define perf_compat_ioctl NULL
4097#endif
4098
cdd6c482 4099int perf_event_task_enable(void)
771d7cde 4100{
f63a8daa 4101 struct perf_event_context *ctx;
cdd6c482 4102 struct perf_event *event;
771d7cde 4103
cdd6c482 4104 mutex_lock(&current->perf_event_mutex);
f63a8daa
PZ
4105 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4106 ctx = perf_event_ctx_lock(event);
4107 perf_event_for_each_child(event, _perf_event_enable);
4108 perf_event_ctx_unlock(event, ctx);
4109 }
cdd6c482 4110 mutex_unlock(&current->perf_event_mutex);
771d7cde
PZ
4111
4112 return 0;
4113}
4114
cdd6c482 4115int perf_event_task_disable(void)
771d7cde 4116{
f63a8daa 4117 struct perf_event_context *ctx;
cdd6c482 4118 struct perf_event *event;
771d7cde 4119
cdd6c482 4120 mutex_lock(&current->perf_event_mutex);
f63a8daa
PZ
4121 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4122 ctx = perf_event_ctx_lock(event);
4123 perf_event_for_each_child(event, _perf_event_disable);
4124 perf_event_ctx_unlock(event, ctx);
4125 }
cdd6c482 4126 mutex_unlock(&current->perf_event_mutex);
771d7cde
PZ
4127
4128 return 0;
4129}
4130
cdd6c482 4131static int perf_event_index(struct perf_event *event)
194002b2 4132{
a4eaf7f1
PZ
4133 if (event->hw.state & PERF_HES_STOPPED)
4134 return 0;
4135
cdd6c482 4136 if (event->state != PERF_EVENT_STATE_ACTIVE)
194002b2
PZ
4137 return 0;
4138
35edc2a5 4139 return event->pmu->event_idx(event);
194002b2
PZ
4140}
4141
c4794295 4142static void calc_timer_values(struct perf_event *event,
e3f3541c 4143 u64 *now,
7f310a5d
EM
4144 u64 *enabled,
4145 u64 *running)
c4794295 4146{
e3f3541c 4147 u64 ctx_time;
c4794295 4148
e3f3541c
PZ
4149 *now = perf_clock();
4150 ctx_time = event->shadow_ctx_time + *now;
c4794295
EM
4151 *enabled = ctx_time - event->tstamp_enabled;
4152 *running = ctx_time - event->tstamp_running;
4153}
4154
fa731587
PZ
4155static void perf_event_init_userpage(struct perf_event *event)
4156{
4157 struct perf_event_mmap_page *userpg;
4158 struct ring_buffer *rb;
4159
4160 rcu_read_lock();
4161 rb = rcu_dereference(event->rb);
4162 if (!rb)
4163 goto unlock;
4164
4165 userpg = rb->user_page;
4166
4167 /* Allow new userspace to detect that bit 0 is deprecated */
4168 userpg->cap_bit0_is_deprecated = 1;
4169 userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
e8c6deac
AS
4170 userpg->data_offset = PAGE_SIZE;
4171 userpg->data_size = perf_data_size(rb);
fa731587
PZ
4172
4173unlock:
4174 rcu_read_unlock();
4175}
4176
c1317ec2
AL
4177void __weak arch_perf_update_userpage(
4178 struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
e3f3541c
PZ
4179{
4180}
4181
38ff667b
PZ
4182/*
4183 * Callers need to ensure there can be no nesting of this function, otherwise
4184 * the seqlock logic goes bad. We can not serialize this because the arch
4185 * code calls this from NMI context.
4186 */
cdd6c482 4187void perf_event_update_userpage(struct perf_event *event)
37d81828 4188{
cdd6c482 4189 struct perf_event_mmap_page *userpg;
76369139 4190 struct ring_buffer *rb;
e3f3541c 4191 u64 enabled, running, now;
38ff667b
PZ
4192
4193 rcu_read_lock();
5ec4c599
PZ
4194 rb = rcu_dereference(event->rb);
4195 if (!rb)
4196 goto unlock;
4197
0d641208
EM
4198 /*
4199 * compute total_time_enabled, total_time_running
4200 * based on snapshot values taken when the event
4201 * was last scheduled in.
4202 *
4203 * we cannot simply called update_context_time()
4204 * because of locking issue as we can be called in
4205 * NMI context
4206 */
e3f3541c 4207 calc_timer_values(event, &now, &enabled, &running);
38ff667b 4208
76369139 4209 userpg = rb->user_page;
7b732a75
PZ
4210 /*
4211 * Disable preemption so as to not let the corresponding user-space
4212 * spin too long if we get preempted.
4213 */
4214 preempt_disable();
37d81828 4215 ++userpg->lock;
92f22a38 4216 barrier();
cdd6c482 4217 userpg->index = perf_event_index(event);
b5e58793 4218 userpg->offset = perf_event_count(event);
365a4038 4219 if (userpg->index)
e7850595 4220 userpg->offset -= local64_read(&event->hw.prev_count);
7b732a75 4221
0d641208 4222 userpg->time_enabled = enabled +
cdd6c482 4223 atomic64_read(&event->child_total_time_enabled);
7f8b4e4e 4224
0d641208 4225 userpg->time_running = running +
cdd6c482 4226 atomic64_read(&event->child_total_time_running);
7f8b4e4e 4227
c1317ec2 4228 arch_perf_update_userpage(event, userpg, now);
e3f3541c 4229
92f22a38 4230 barrier();
37d81828 4231 ++userpg->lock;
7b732a75 4232 preempt_enable();
38ff667b 4233unlock:
7b732a75 4234 rcu_read_unlock();
37d81828
PM
4235}
4236
906010b2
PZ
4237static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
4238{
4239 struct perf_event *event = vma->vm_file->private_data;
76369139 4240 struct ring_buffer *rb;
906010b2
PZ
4241 int ret = VM_FAULT_SIGBUS;
4242
4243 if (vmf->flags & FAULT_FLAG_MKWRITE) {
4244 if (vmf->pgoff == 0)
4245 ret = 0;
4246 return ret;
4247 }
4248
4249 rcu_read_lock();
76369139
FW
4250 rb = rcu_dereference(event->rb);
4251 if (!rb)
906010b2
PZ
4252 goto unlock;
4253
4254 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
4255 goto unlock;
4256
76369139 4257 vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
906010b2
PZ
4258 if (!vmf->page)
4259 goto unlock;
4260
4261 get_page(vmf->page);
4262 vmf->page->mapping = vma->vm_file->f_mapping;
4263 vmf->page->index = vmf->pgoff;
4264
4265 ret = 0;
4266unlock:
4267 rcu_read_unlock();
4268
4269 return ret;
4270}
4271
10c6db11
PZ
4272static void ring_buffer_attach(struct perf_event *event,
4273 struct ring_buffer *rb)
4274{
b69cf536 4275 struct ring_buffer *old_rb = NULL;
10c6db11
PZ
4276 unsigned long flags;
4277
b69cf536
PZ
4278 if (event->rb) {
4279 /*
4280 * Should be impossible, we set this when removing
4281 * event->rb_entry and wait/clear when adding event->rb_entry.
4282 */
4283 WARN_ON_ONCE(event->rcu_pending);
10c6db11 4284
b69cf536
PZ
4285 old_rb = event->rb;
4286 event->rcu_batches = get_state_synchronize_rcu();
4287 event->rcu_pending = 1;
10c6db11 4288
b69cf536
PZ
4289 spin_lock_irqsave(&old_rb->event_lock, flags);
4290 list_del_rcu(&event->rb_entry);
4291 spin_unlock_irqrestore(&old_rb->event_lock, flags);
4292 }
10c6db11 4293
b69cf536
PZ
4294 if (event->rcu_pending && rb) {
4295 cond_synchronize_rcu(event->rcu_batches);
4296 event->rcu_pending = 0;
4297 }
10c6db11 4298
b69cf536
PZ
4299 if (rb) {
4300 spin_lock_irqsave(&rb->event_lock, flags);
4301 list_add_rcu(&event->rb_entry, &rb->event_list);
4302 spin_unlock_irqrestore(&rb->event_lock, flags);
4303 }
4304
4305 rcu_assign_pointer(event->rb, rb);
4306
4307 if (old_rb) {
4308 ring_buffer_put(old_rb);
4309 /*
4310 * Since we detached before setting the new rb, so that we
4311 * could attach the new rb, we could have missed a wakeup.
4312 * Provide it now.
4313 */
4314 wake_up_all(&event->waitq);
4315 }
10c6db11
PZ
4316}
4317
4318static void ring_buffer_wakeup(struct perf_event *event)
4319{
4320 struct ring_buffer *rb;
4321
4322 rcu_read_lock();
4323 rb = rcu_dereference(event->rb);
9bb5d40c
PZ
4324 if (rb) {
4325 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
4326 wake_up_all(&event->waitq);
4327 }
10c6db11
PZ
4328 rcu_read_unlock();
4329}
4330
76369139 4331static void rb_free_rcu(struct rcu_head *rcu_head)
906010b2 4332{
76369139 4333 struct ring_buffer *rb;
906010b2 4334
76369139
FW
4335 rb = container_of(rcu_head, struct ring_buffer, rcu_head);
4336 rb_free(rb);
7b732a75
PZ
4337}
4338
fdc26706 4339struct ring_buffer *ring_buffer_get(struct perf_event *event)
7b732a75 4340{
76369139 4341 struct ring_buffer *rb;
7b732a75 4342
ac9721f3 4343 rcu_read_lock();
76369139
FW
4344 rb = rcu_dereference(event->rb);
4345 if (rb) {
4346 if (!atomic_inc_not_zero(&rb->refcount))
4347 rb = NULL;
ac9721f3
PZ
4348 }
4349 rcu_read_unlock();
4350
76369139 4351 return rb;
ac9721f3
PZ
4352}
4353
fdc26706 4354void ring_buffer_put(struct ring_buffer *rb)
ac9721f3 4355{
76369139 4356 if (!atomic_dec_and_test(&rb->refcount))
ac9721f3 4357 return;
7b732a75 4358
9bb5d40c 4359 WARN_ON_ONCE(!list_empty(&rb->event_list));
10c6db11 4360
76369139 4361 call_rcu(&rb->rcu_head, rb_free_rcu);
7b732a75
PZ
4362}
4363
4364static void perf_mmap_open(struct vm_area_struct *vma)
4365{
cdd6c482 4366 struct perf_event *event = vma->vm_file->private_data;
7b732a75 4367
cdd6c482 4368 atomic_inc(&event->mmap_count);
9bb5d40c 4369 atomic_inc(&event->rb->mmap_count);
1e0fb9ec 4370
45bfb2e5
PZ
4371 if (vma->vm_pgoff)
4372 atomic_inc(&event->rb->aux_mmap_count);
4373
1e0fb9ec
AL
4374 if (event->pmu->event_mapped)
4375 event->pmu->event_mapped(event);
7b732a75
PZ
4376}
4377
9bb5d40c
PZ
4378/*
4379 * A buffer can be mmap()ed multiple times; either directly through the same
4380 * event, or through other events by use of perf_event_set_output().
4381 *
4382 * In order to undo the VM accounting done by perf_mmap() we need to destroy
4383 * the buffer here, where we still have a VM context. This means we need
4384 * to detach all events redirecting to us.
4385 */
7b732a75
PZ
4386static void perf_mmap_close(struct vm_area_struct *vma)
4387{
cdd6c482 4388 struct perf_event *event = vma->vm_file->private_data;
7b732a75 4389
b69cf536 4390 struct ring_buffer *rb = ring_buffer_get(event);
9bb5d40c
PZ
4391 struct user_struct *mmap_user = rb->mmap_user;
4392 int mmap_locked = rb->mmap_locked;
4393 unsigned long size = perf_data_size(rb);
789f90fc 4394
1e0fb9ec
AL
4395 if (event->pmu->event_unmapped)
4396 event->pmu->event_unmapped(event);
4397
45bfb2e5
PZ
4398 /*
4399 * rb->aux_mmap_count will always drop before rb->mmap_count and
4400 * event->mmap_count, so it is ok to use event->mmap_mutex to
4401 * serialize with perf_mmap here.
4402 */
4403 if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
4404 atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
4405 atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
4406 vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
4407
4408 rb_free_aux(rb);
4409 mutex_unlock(&event->mmap_mutex);
4410 }
4411
9bb5d40c
PZ
4412 atomic_dec(&rb->mmap_count);
4413
4414 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
b69cf536 4415 goto out_put;
9bb5d40c 4416
b69cf536 4417 ring_buffer_attach(event, NULL);
9bb5d40c
PZ
4418 mutex_unlock(&event->mmap_mutex);
4419
4420 /* If there's still other mmap()s of this buffer, we're done. */
b69cf536
PZ
4421 if (atomic_read(&rb->mmap_count))
4422 goto out_put;
ac9721f3 4423
9bb5d40c
PZ
4424 /*
4425 * No other mmap()s, detach from all other events that might redirect
4426 * into the now unreachable buffer. Somewhat complicated by the
4427 * fact that rb::event_lock otherwise nests inside mmap_mutex.
4428 */
4429again:
4430 rcu_read_lock();
4431 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
4432 if (!atomic_long_inc_not_zero(&event->refcount)) {
4433 /*
4434 * This event is en-route to free_event() which will
4435 * detach it and remove it from the list.
4436 */
4437 continue;
4438 }
4439 rcu_read_unlock();
789f90fc 4440
9bb5d40c
PZ
4441 mutex_lock(&event->mmap_mutex);
4442 /*
4443 * Check we didn't race with perf_event_set_output() which can
4444 * swizzle the rb from under us while we were waiting to
4445 * acquire mmap_mutex.
4446 *
4447 * If we find a different rb; ignore this event, a next
4448 * iteration will no longer find it on the list. We have to
4449 * still restart the iteration to make sure we're not now
4450 * iterating the wrong list.
4451 */
b69cf536
PZ
4452 if (event->rb == rb)
4453 ring_buffer_attach(event, NULL);
4454
cdd6c482 4455 mutex_unlock(&event->mmap_mutex);
9bb5d40c 4456 put_event(event);
ac9721f3 4457
9bb5d40c
PZ
4458 /*
4459 * Restart the iteration; either we're on the wrong list or
4460 * destroyed its integrity by doing a deletion.
4461 */
4462 goto again;
7b732a75 4463 }
9bb5d40c
PZ
4464 rcu_read_unlock();
4465
4466 /*
4467 * It could be there's still a few 0-ref events on the list; they'll
4468 * get cleaned up by free_event() -- they'll also still have their
4469 * ref on the rb and will free it whenever they are done with it.
4470 *
4471 * Aside from that, this buffer is 'fully' detached and unmapped,
4472 * undo the VM accounting.
4473 */
4474
4475 atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
4476 vma->vm_mm->pinned_vm -= mmap_locked;
4477 free_uid(mmap_user);
4478
b69cf536 4479out_put:
9bb5d40c 4480 ring_buffer_put(rb); /* could be last */
37d81828
PM
4481}
4482
f0f37e2f 4483static const struct vm_operations_struct perf_mmap_vmops = {
43a21ea8 4484 .open = perf_mmap_open,
45bfb2e5 4485 .close = perf_mmap_close, /* non mergable */
43a21ea8
PZ
4486 .fault = perf_mmap_fault,
4487 .page_mkwrite = perf_mmap_fault,
37d81828
PM
4488};
4489
4490static int perf_mmap(struct file *file, struct vm_area_struct *vma)
4491{
cdd6c482 4492 struct perf_event *event = file->private_data;
22a4f650 4493 unsigned long user_locked, user_lock_limit;
789f90fc 4494 struct user_struct *user = current_user();
22a4f650 4495 unsigned long locked, lock_limit;
45bfb2e5 4496 struct ring_buffer *rb = NULL;
7b732a75
PZ
4497 unsigned long vma_size;
4498 unsigned long nr_pages;
45bfb2e5 4499 long user_extra = 0, extra = 0;
d57e34fd 4500 int ret = 0, flags = 0;
37d81828 4501
c7920614
PZ
4502 /*
4503 * Don't allow mmap() of inherited per-task counters. This would
4504 * create a performance issue due to all children writing to the
76369139 4505 * same rb.
c7920614
PZ
4506 */
4507 if (event->cpu == -1 && event->attr.inherit)
4508 return -EINVAL;
4509
43a21ea8 4510 if (!(vma->vm_flags & VM_SHARED))
37d81828 4511 return -EINVAL;
7b732a75
PZ
4512
4513 vma_size = vma->vm_end - vma->vm_start;
45bfb2e5
PZ
4514
4515 if (vma->vm_pgoff == 0) {
4516 nr_pages = (vma_size / PAGE_SIZE) - 1;
4517 } else {
4518 /*
4519 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
4520 * mapped, all subsequent mappings should have the same size
4521 * and offset. Must be above the normal perf buffer.
4522 */
4523 u64 aux_offset, aux_size;
4524
4525 if (!event->rb)
4526 return -EINVAL;
4527
4528 nr_pages = vma_size / PAGE_SIZE;
4529
4530 mutex_lock(&event->mmap_mutex);
4531 ret = -EINVAL;
4532
4533 rb = event->rb;
4534 if (!rb)
4535 goto aux_unlock;
4536
4537 aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
4538 aux_size = ACCESS_ONCE(rb->user_page->aux_size);
4539
4540 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
4541 goto aux_unlock;
4542
4543 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
4544 goto aux_unlock;
4545
4546 /* already mapped with a different offset */
4547 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
4548 goto aux_unlock;
4549
4550 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
4551 goto aux_unlock;
4552
4553 /* already mapped with a different size */
4554 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
4555 goto aux_unlock;
4556
4557 if (!is_power_of_2(nr_pages))
4558 goto aux_unlock;
4559
4560 if (!atomic_inc_not_zero(&rb->mmap_count))
4561 goto aux_unlock;
4562
4563 if (rb_has_aux(rb)) {
4564 atomic_inc(&rb->aux_mmap_count);
4565 ret = 0;
4566 goto unlock;
4567 }
4568
4569 atomic_set(&rb->aux_mmap_count, 1);
4570 user_extra = nr_pages;
4571
4572 goto accounting;
4573 }
7b732a75 4574
7730d865 4575 /*
76369139 4576 * If we have rb pages ensure they're a power-of-two number, so we
7730d865
PZ
4577 * can do bitmasks instead of modulo.
4578 */
2ed11312 4579 if (nr_pages != 0 && !is_power_of_2(nr_pages))
37d81828
PM
4580 return -EINVAL;
4581
7b732a75 4582 if (vma_size != PAGE_SIZE * (1 + nr_pages))
37d81828
PM
4583 return -EINVAL;
4584
cdd6c482 4585 WARN_ON_ONCE(event->ctx->parent_ctx);
9bb5d40c 4586again:
cdd6c482 4587 mutex_lock(&event->mmap_mutex);
76369139 4588 if (event->rb) {
9bb5d40c 4589 if (event->rb->nr_pages != nr_pages) {
ebb3c4c4 4590 ret = -EINVAL;
9bb5d40c
PZ
4591 goto unlock;
4592 }
4593
4594 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
4595 /*
4596 * Raced against perf_mmap_close() through
4597 * perf_event_set_output(). Try again, hope for better
4598 * luck.
4599 */
4600 mutex_unlock(&event->mmap_mutex);
4601 goto again;
4602 }
4603
ebb3c4c4
PZ
4604 goto unlock;
4605 }
4606
789f90fc 4607 user_extra = nr_pages + 1;
45bfb2e5
PZ
4608
4609accounting:
cdd6c482 4610 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
a3862d3f
IM
4611
4612 /*
4613 * Increase the limit linearly with more CPUs:
4614 */
4615 user_lock_limit *= num_online_cpus();
4616
789f90fc 4617 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
c5078f78 4618
789f90fc
PZ
4619 if (user_locked > user_lock_limit)
4620 extra = user_locked - user_lock_limit;
7b732a75 4621
78d7d407 4622 lock_limit = rlimit(RLIMIT_MEMLOCK);
7b732a75 4623 lock_limit >>= PAGE_SHIFT;
bc3e53f6 4624 locked = vma->vm_mm->pinned_vm + extra;
7b732a75 4625
459ec28a
IM
4626 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
4627 !capable(CAP_IPC_LOCK)) {
ebb3c4c4
PZ
4628 ret = -EPERM;
4629 goto unlock;
4630 }
7b732a75 4631
45bfb2e5 4632 WARN_ON(!rb && event->rb);
906010b2 4633
d57e34fd 4634 if (vma->vm_flags & VM_WRITE)
76369139 4635 flags |= RING_BUFFER_WRITABLE;
d57e34fd 4636
76369139 4637 if (!rb) {
45bfb2e5
PZ
4638 rb = rb_alloc(nr_pages,
4639 event->attr.watermark ? event->attr.wakeup_watermark : 0,
4640 event->cpu, flags);
26cb63ad 4641
45bfb2e5
PZ
4642 if (!rb) {
4643 ret = -ENOMEM;
4644 goto unlock;
4645 }
43a21ea8 4646
45bfb2e5
PZ
4647 atomic_set(&rb->mmap_count, 1);
4648 rb->mmap_user = get_current_user();
4649 rb->mmap_locked = extra;
26cb63ad 4650
45bfb2e5 4651 ring_buffer_attach(event, rb);
ac9721f3 4652
45bfb2e5
PZ
4653 perf_event_init_userpage(event);
4654 perf_event_update_userpage(event);
4655 } else {
1a594131
AS
4656 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
4657 event->attr.aux_watermark, flags);
45bfb2e5
PZ
4658 if (!ret)
4659 rb->aux_mmap_locked = extra;
4660 }
9a0f05cb 4661
ebb3c4c4 4662unlock:
45bfb2e5
PZ
4663 if (!ret) {
4664 atomic_long_add(user_extra, &user->locked_vm);
4665 vma->vm_mm->pinned_vm += extra;
4666
ac9721f3 4667 atomic_inc(&event->mmap_count);
45bfb2e5
PZ
4668 } else if (rb) {
4669 atomic_dec(&rb->mmap_count);
4670 }
4671aux_unlock:
cdd6c482 4672 mutex_unlock(&event->mmap_mutex);
37d81828 4673
9bb5d40c
PZ
4674 /*
4675 * Since pinned accounting is per vm we cannot allow fork() to copy our
4676 * vma.
4677 */
26cb63ad 4678 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
37d81828 4679 vma->vm_ops = &perf_mmap_vmops;
7b732a75 4680
1e0fb9ec
AL
4681 if (event->pmu->event_mapped)
4682 event->pmu->event_mapped(event);
4683
7b732a75 4684 return ret;
37d81828
PM
4685}
4686
3c446b3d
PZ
4687static int perf_fasync(int fd, struct file *filp, int on)
4688{
496ad9aa 4689 struct inode *inode = file_inode(filp);
cdd6c482 4690 struct perf_event *event = filp->private_data;
3c446b3d
PZ
4691 int retval;
4692
4693 mutex_lock(&inode->i_mutex);
cdd6c482 4694 retval = fasync_helper(fd, filp, on, &event->fasync);
3c446b3d
PZ
4695 mutex_unlock(&inode->i_mutex);
4696
4697 if (retval < 0)
4698 return retval;
4699
4700 return 0;
4701}
4702
0793a61d 4703static const struct file_operations perf_fops = {
3326c1ce 4704 .llseek = no_llseek,
0793a61d
TG
4705 .release = perf_release,
4706 .read = perf_read,
4707 .poll = perf_poll,
d859e29f 4708 .unlocked_ioctl = perf_ioctl,
b3f20785 4709 .compat_ioctl = perf_compat_ioctl,
37d81828 4710 .mmap = perf_mmap,
3c446b3d 4711 .fasync = perf_fasync,
0793a61d
TG
4712};
4713
925d519a 4714/*
cdd6c482 4715 * Perf event wakeup
925d519a
PZ
4716 *
4717 * If there's data, ensure we set the poll() state and publish everything
4718 * to user-space before waking everybody up.
4719 */
4720
cdd6c482 4721void perf_event_wakeup(struct perf_event *event)
925d519a 4722{
10c6db11 4723 ring_buffer_wakeup(event);
4c9e2542 4724
cdd6c482
IM
4725 if (event->pending_kill) {
4726 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
4727 event->pending_kill = 0;
4c9e2542 4728 }
925d519a
PZ
4729}
4730
e360adbe 4731static void perf_pending_event(struct irq_work *entry)
79f14641 4732{
cdd6c482
IM
4733 struct perf_event *event = container_of(entry,
4734 struct perf_event, pending);
d525211f
PZ
4735 int rctx;
4736
4737 rctx = perf_swevent_get_recursion_context();
4738 /*
4739 * If we 'fail' here, that's OK, it means recursion is already disabled
4740 * and we won't recurse 'further'.
4741 */
79f14641 4742
cdd6c482
IM
4743 if (event->pending_disable) {
4744 event->pending_disable = 0;
4745 __perf_event_disable(event);
79f14641
PZ
4746 }
4747
cdd6c482
IM
4748 if (event->pending_wakeup) {
4749 event->pending_wakeup = 0;
4750 perf_event_wakeup(event);
79f14641 4751 }
d525211f
PZ
4752
4753 if (rctx >= 0)
4754 perf_swevent_put_recursion_context(rctx);
79f14641
PZ
4755}
4756
39447b38
ZY
4757/*
4758 * We assume there is only KVM supporting the callbacks.
4759 * Later on, we might change it to a list if there is
4760 * another virtualization implementation supporting the callbacks.
4761 */
4762struct perf_guest_info_callbacks *perf_guest_cbs;
4763
4764int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
4765{
4766 perf_guest_cbs = cbs;
4767 return 0;
4768}
4769EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
4770
4771int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
4772{
4773 perf_guest_cbs = NULL;
4774 return 0;
4775}
4776EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
4777
4018994f
JO
4778static void
4779perf_output_sample_regs(struct perf_output_handle *handle,
4780 struct pt_regs *regs, u64 mask)
4781{
4782 int bit;
4783
4784 for_each_set_bit(bit, (const unsigned long *) &mask,
4785 sizeof(mask) * BITS_PER_BYTE) {
4786 u64 val;
4787
4788 val = perf_reg_value(regs, bit);
4789 perf_output_put(handle, val);
4790 }
4791}
4792
60e2364e 4793static void perf_sample_regs_user(struct perf_regs *regs_user,
88a7c26a
AL
4794 struct pt_regs *regs,
4795 struct pt_regs *regs_user_copy)
4018994f 4796{
88a7c26a
AL
4797 if (user_mode(regs)) {
4798 regs_user->abi = perf_reg_abi(current);
2565711f 4799 regs_user->regs = regs;
88a7c26a
AL
4800 } else if (current->mm) {
4801 perf_get_regs_user(regs_user, regs, regs_user_copy);
2565711f
PZ
4802 } else {
4803 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
4804 regs_user->regs = NULL;
4018994f
JO
4805 }
4806}
4807
60e2364e
SE
4808static void perf_sample_regs_intr(struct perf_regs *regs_intr,
4809 struct pt_regs *regs)
4810{
4811 regs_intr->regs = regs;
4812 regs_intr->abi = perf_reg_abi(current);
4813}
4814
4815
c5ebcedb
JO
4816/*
4817 * Get remaining task size from user stack pointer.
4818 *
4819 * It'd be better to take stack vma map and limit this more
4820 * precisly, but there's no way to get it safely under interrupt,
4821 * so using TASK_SIZE as limit.
4822 */
4823static u64 perf_ustack_task_size(struct pt_regs *regs)
4824{
4825 unsigned long addr = perf_user_stack_pointer(regs);
4826
4827 if (!addr || addr >= TASK_SIZE)
4828 return 0;
4829
4830 return TASK_SIZE - addr;
4831}
4832
4833static u16
4834perf_sample_ustack_size(u16 stack_size, u16 header_size,
4835 struct pt_regs *regs)
4836{
4837 u64 task_size;
4838
4839 /* No regs, no stack pointer, no dump. */
4840 if (!regs)
4841 return 0;
4842
4843 /*
4844 * Check if we fit in with the requested stack size into the:
4845 * - TASK_SIZE
4846 * If we don't, we limit the size to the TASK_SIZE.
4847 *
4848 * - remaining sample size
4849 * If we don't, we customize the stack size to
4850 * fit in to the remaining sample size.
4851 */
4852
4853 task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
4854 stack_size = min(stack_size, (u16) task_size);
4855
4856 /* Current header size plus static size and dynamic size. */
4857 header_size += 2 * sizeof(u64);
4858
4859 /* Do we fit in with the current stack dump size? */
4860 if ((u16) (header_size + stack_size) < header_size) {
4861 /*
4862 * If we overflow the maximum size for the sample,
4863 * we customize the stack dump size to fit in.
4864 */
4865 stack_size = USHRT_MAX - header_size - sizeof(u64);
4866 stack_size = round_up(stack_size, sizeof(u64));
4867 }
4868
4869 return stack_size;
4870}
4871
4872static void
4873perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
4874 struct pt_regs *regs)
4875{
4876 /* Case of a kernel thread, nothing to dump */
4877 if (!regs) {
4878 u64 size = 0;
4879 perf_output_put(handle, size);
4880 } else {
4881 unsigned long sp;
4882 unsigned int rem;
4883 u64 dyn_size;
4884
4885 /*
4886 * We dump:
4887 * static size
4888 * - the size requested by user or the best one we can fit
4889 * in to the sample max size
4890 * data
4891 * - user stack dump data
4892 * dynamic size
4893 * - the actual dumped size
4894 */
4895
4896 /* Static size. */
4897 perf_output_put(handle, dump_size);
4898
4899 /* Data. */
4900 sp = perf_user_stack_pointer(regs);
4901 rem = __output_copy_user(handle, (void *) sp, dump_size);
4902 dyn_size = dump_size - rem;
4903
4904 perf_output_skip(handle, rem);
4905
4906 /* Dynamic size. */
4907 perf_output_put(handle, dyn_size);
4908 }
4909}
4910
c980d109
ACM
4911static void __perf_event_header__init_id(struct perf_event_header *header,
4912 struct perf_sample_data *data,
4913 struct perf_event *event)
6844c09d
ACM
4914{
4915 u64 sample_type = event->attr.sample_type;
4916
4917 data->type = sample_type;
4918 header->size += event->id_header_size;
4919
4920 if (sample_type & PERF_SAMPLE_TID) {
4921 /* namespace issues */
4922 data->tid_entry.pid = perf_event_pid(event, current);
4923 data->tid_entry.tid = perf_event_tid(event, current);
4924 }
4925
4926 if (sample_type & PERF_SAMPLE_TIME)
34f43927 4927 data->time = perf_event_clock(event);
6844c09d 4928
ff3d527c 4929 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
6844c09d
ACM
4930 data->id = primary_event_id(event);
4931
4932 if (sample_type & PERF_SAMPLE_STREAM_ID)
4933 data->stream_id = event->id;
4934
4935 if (sample_type & PERF_SAMPLE_CPU) {
4936 data->cpu_entry.cpu = raw_smp_processor_id();
4937 data->cpu_entry.reserved = 0;
4938 }
4939}
4940
76369139
FW
4941void perf_event_header__init_id(struct perf_event_header *header,
4942 struct perf_sample_data *data,
4943 struct perf_event *event)
c980d109
ACM
4944{
4945 if (event->attr.sample_id_all)
4946 __perf_event_header__init_id(header, data, event);
4947}
4948
4949static void __perf_event__output_id_sample(struct perf_output_handle *handle,
4950 struct perf_sample_data *data)
4951{
4952 u64 sample_type = data->type;
4953
4954 if (sample_type & PERF_SAMPLE_TID)
4955 perf_output_put(handle, data->tid_entry);
4956
4957 if (sample_type & PERF_SAMPLE_TIME)
4958 perf_output_put(handle, data->time);
4959
4960 if (sample_type & PERF_SAMPLE_ID)
4961 perf_output_put(handle, data->id);
4962
4963 if (sample_type & PERF_SAMPLE_STREAM_ID)
4964 perf_output_put(handle, data->stream_id);
4965
4966 if (sample_type & PERF_SAMPLE_CPU)
4967 perf_output_put(handle, data->cpu_entry);
ff3d527c
AH
4968
4969 if (sample_type & PERF_SAMPLE_IDENTIFIER)
4970 perf_output_put(handle, data->id);
c980d109
ACM
4971}
4972
76369139
FW
4973void perf_event__output_id_sample(struct perf_event *event,
4974 struct perf_output_handle *handle,
4975 struct perf_sample_data *sample)
c980d109
ACM
4976{
4977 if (event->attr.sample_id_all)
4978 __perf_event__output_id_sample(handle, sample);
4979}
4980
3dab77fb 4981static void perf_output_read_one(struct perf_output_handle *handle,
eed01528
SE
4982 struct perf_event *event,
4983 u64 enabled, u64 running)
3dab77fb 4984{
cdd6c482 4985 u64 read_format = event->attr.read_format;
3dab77fb
PZ
4986 u64 values[4];
4987 int n = 0;
4988
b5e58793 4989 values[n++] = perf_event_count(event);
3dab77fb 4990 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
eed01528 4991 values[n++] = enabled +
cdd6c482 4992 atomic64_read(&event->child_total_time_enabled);
3dab77fb
PZ
4993 }
4994 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
eed01528 4995 values[n++] = running +
cdd6c482 4996 atomic64_read(&event->child_total_time_running);
3dab77fb
PZ
4997 }
4998 if (read_format & PERF_FORMAT_ID)
cdd6c482 4999 values[n++] = primary_event_id(event);
3dab77fb 5000
76369139 5001 __output_copy(handle, values, n * sizeof(u64));
3dab77fb
PZ
5002}
5003
5004/*
cdd6c482 5005 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
3dab77fb
PZ
5006 */
5007static void perf_output_read_group(struct perf_output_handle *handle,
eed01528
SE
5008 struct perf_event *event,
5009 u64 enabled, u64 running)
3dab77fb 5010{
cdd6c482
IM
5011 struct perf_event *leader = event->group_leader, *sub;
5012 u64 read_format = event->attr.read_format;
3dab77fb
PZ
5013 u64 values[5];
5014 int n = 0;
5015
5016 values[n++] = 1 + leader->nr_siblings;
5017
5018 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
eed01528 5019 values[n++] = enabled;
3dab77fb
PZ
5020
5021 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
eed01528 5022 values[n++] = running;
3dab77fb 5023
cdd6c482 5024 if (leader != event)
3dab77fb
PZ
5025 leader->pmu->read(leader);
5026
b5e58793 5027 values[n++] = perf_event_count(leader);
3dab77fb 5028 if (read_format & PERF_FORMAT_ID)
cdd6c482 5029 values[n++] = primary_event_id(leader);
3dab77fb 5030
76369139 5031 __output_copy(handle, values, n * sizeof(u64));
3dab77fb 5032
65abc865 5033 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3dab77fb
PZ
5034 n = 0;
5035
6f5ab001
JO
5036 if ((sub != event) &&
5037 (sub->state == PERF_EVENT_STATE_ACTIVE))
3dab77fb
PZ
5038 sub->pmu->read(sub);
5039
b5e58793 5040 values[n++] = perf_event_count(sub);
3dab77fb 5041 if (read_format & PERF_FORMAT_ID)
cdd6c482 5042 values[n++] = primary_event_id(sub);
3dab77fb 5043
76369139 5044 __output_copy(handle, values, n * sizeof(u64));
3dab77fb
PZ
5045 }
5046}
5047
eed01528
SE
5048#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
5049 PERF_FORMAT_TOTAL_TIME_RUNNING)
5050
3dab77fb 5051static void perf_output_read(struct perf_output_handle *handle,
cdd6c482 5052 struct perf_event *event)
3dab77fb 5053{
e3f3541c 5054 u64 enabled = 0, running = 0, now;
eed01528
SE
5055 u64 read_format = event->attr.read_format;
5056
5057 /*
5058 * compute total_time_enabled, total_time_running
5059 * based on snapshot values taken when the event
5060 * was last scheduled in.
5061 *
5062 * we cannot simply called update_context_time()
5063 * because of locking issue as we are called in
5064 * NMI context
5065 */
c4794295 5066 if (read_format & PERF_FORMAT_TOTAL_TIMES)
e3f3541c 5067 calc_timer_values(event, &now, &enabled, &running);
eed01528 5068
cdd6c482 5069 if (event->attr.read_format & PERF_FORMAT_GROUP)
eed01528 5070 perf_output_read_group(handle, event, enabled, running);
3dab77fb 5071 else
eed01528 5072 perf_output_read_one(handle, event, enabled, running);
3dab77fb
PZ
5073}
5074
5622f295
MM
5075void perf_output_sample(struct perf_output_handle *handle,
5076 struct perf_event_header *header,
5077 struct perf_sample_data *data,
cdd6c482 5078 struct perf_event *event)
5622f295
MM
5079{
5080 u64 sample_type = data->type;
5081
5082 perf_output_put(handle, *header);
5083
ff3d527c
AH
5084 if (sample_type & PERF_SAMPLE_IDENTIFIER)
5085 perf_output_put(handle, data->id);
5086
5622f295
MM
5087 if (sample_type & PERF_SAMPLE_IP)
5088 perf_output_put(handle, data->ip);
5089
5090 if (sample_type & PERF_SAMPLE_TID)
5091 perf_output_put(handle, data->tid_entry);
5092
5093 if (sample_type & PERF_SAMPLE_TIME)
5094 perf_output_put(handle, data->time);
5095
5096 if (sample_type & PERF_SAMPLE_ADDR)
5097 perf_output_put(handle, data->addr);
5098
5099 if (sample_type & PERF_SAMPLE_ID)
5100 perf_output_put(handle, data->id);
5101
5102 if (sample_type & PERF_SAMPLE_STREAM_ID)
5103 perf_output_put(handle, data->stream_id);
5104
5105 if (sample_type & PERF_SAMPLE_CPU)
5106 perf_output_put(handle, data->cpu_entry);
5107
5108 if (sample_type & PERF_SAMPLE_PERIOD)
5109 perf_output_put(handle, data->period);
5110
5111 if (sample_type & PERF_SAMPLE_READ)
cdd6c482 5112 perf_output_read(handle, event);
5622f295
MM
5113
5114 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5115 if (data->callchain) {
5116 int size = 1;
5117
5118 if (data->callchain)
5119 size += data->callchain->nr;
5120
5121 size *= sizeof(u64);
5122
76369139 5123 __output_copy(handle, data->callchain, size);
5622f295
MM
5124 } else {
5125 u64 nr = 0;
5126 perf_output_put(handle, nr);
5127 }
5128 }
5129
5130 if (sample_type & PERF_SAMPLE_RAW) {
5131 if (data->raw) {
5132 perf_output_put(handle, data->raw->size);
76369139
FW
5133 __output_copy(handle, data->raw->data,
5134 data->raw->size);
5622f295
MM
5135 } else {
5136 struct {
5137 u32 size;
5138 u32 data;
5139 } raw = {
5140 .size = sizeof(u32),
5141 .data = 0,
5142 };
5143 perf_output_put(handle, raw);
5144 }
5145 }
a7ac67ea 5146
bce38cd5
SE
5147 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5148 if (data->br_stack) {
5149 size_t size;
5150
5151 size = data->br_stack->nr
5152 * sizeof(struct perf_branch_entry);
5153
5154 perf_output_put(handle, data->br_stack->nr);
5155 perf_output_copy(handle, data->br_stack->entries, size);
5156 } else {
5157 /*
5158 * we always store at least the value of nr
5159 */
5160 u64 nr = 0;
5161 perf_output_put(handle, nr);
5162 }
5163 }
4018994f
JO
5164
5165 if (sample_type & PERF_SAMPLE_REGS_USER) {
5166 u64 abi = data->regs_user.abi;
5167
5168 /*
5169 * If there are no regs to dump, notice it through
5170 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
5171 */
5172 perf_output_put(handle, abi);
5173
5174 if (abi) {
5175 u64 mask = event->attr.sample_regs_user;
5176 perf_output_sample_regs(handle,
5177 data->regs_user.regs,
5178 mask);
5179 }
5180 }
c5ebcedb 5181
a5cdd40c 5182 if (sample_type & PERF_SAMPLE_STACK_USER) {
c5ebcedb
JO
5183 perf_output_sample_ustack(handle,
5184 data->stack_user_size,
5185 data->regs_user.regs);
a5cdd40c 5186 }
c3feedf2
AK
5187
5188 if (sample_type & PERF_SAMPLE_WEIGHT)
5189 perf_output_put(handle, data->weight);
d6be9ad6
SE
5190
5191 if (sample_type & PERF_SAMPLE_DATA_SRC)
5192 perf_output_put(handle, data->data_src.val);
a5cdd40c 5193
fdfbbd07
AK
5194 if (sample_type & PERF_SAMPLE_TRANSACTION)
5195 perf_output_put(handle, data->txn);
5196
60e2364e
SE
5197 if (sample_type & PERF_SAMPLE_REGS_INTR) {
5198 u64 abi = data->regs_intr.abi;
5199 /*
5200 * If there are no regs to dump, notice it through
5201 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
5202 */
5203 perf_output_put(handle, abi);
5204
5205 if (abi) {
5206 u64 mask = event->attr.sample_regs_intr;
5207
5208 perf_output_sample_regs(handle,
5209 data->regs_intr.regs,
5210 mask);
5211 }
5212 }
5213
a5cdd40c
PZ
5214 if (!event->attr.watermark) {
5215 int wakeup_events = event->attr.wakeup_events;
5216
5217 if (wakeup_events) {
5218 struct ring_buffer *rb = handle->rb;
5219 int events = local_inc_return(&rb->events);
5220
5221 if (events >= wakeup_events) {
5222 local_sub(wakeup_events, &rb->events);
5223 local_inc(&rb->wakeup);
5224 }
5225 }
5226 }
5622f295
MM
5227}
5228
5229void perf_prepare_sample(struct perf_event_header *header,
5230 struct perf_sample_data *data,
cdd6c482 5231 struct perf_event *event,
5622f295 5232 struct pt_regs *regs)
7b732a75 5233{
cdd6c482 5234 u64 sample_type = event->attr.sample_type;
7b732a75 5235
cdd6c482 5236 header->type = PERF_RECORD_SAMPLE;
c320c7b7 5237 header->size = sizeof(*header) + event->header_size;
5622f295
MM
5238
5239 header->misc = 0;
5240 header->misc |= perf_misc_flags(regs);
6fab0192 5241
c980d109 5242 __perf_event_header__init_id(header, data, event);
6844c09d 5243
c320c7b7 5244 if (sample_type & PERF_SAMPLE_IP)
5622f295
MM
5245 data->ip = perf_instruction_pointer(regs);
5246
b23f3325 5247 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5622f295 5248 int size = 1;
394ee076 5249
e6dab5ff 5250 data->callchain = perf_callchain(event, regs);
5622f295
MM
5251
5252 if (data->callchain)
5253 size += data->callchain->nr;
5254
5255 header->size += size * sizeof(u64);
394ee076
PZ
5256 }
5257
3a43ce68 5258 if (sample_type & PERF_SAMPLE_RAW) {
a044560c
PZ
5259 int size = sizeof(u32);
5260
5261 if (data->raw)
5262 size += data->raw->size;
5263 else
5264 size += sizeof(u32);
5265
5266 WARN_ON_ONCE(size & (sizeof(u64)-1));
5622f295 5267 header->size += size;
7f453c24 5268 }
bce38cd5
SE
5269
5270 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5271 int size = sizeof(u64); /* nr */
5272 if (data->br_stack) {
5273 size += data->br_stack->nr
5274 * sizeof(struct perf_branch_entry);
5275 }
5276 header->size += size;
5277 }
4018994f 5278
2565711f 5279 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
88a7c26a
AL
5280 perf_sample_regs_user(&data->regs_user, regs,
5281 &data->regs_user_copy);
2565711f 5282
4018994f
JO
5283 if (sample_type & PERF_SAMPLE_REGS_USER) {
5284 /* regs dump ABI info */
5285 int size = sizeof(u64);
5286
4018994f
JO
5287 if (data->regs_user.regs) {
5288 u64 mask = event->attr.sample_regs_user;
5289 size += hweight64(mask) * sizeof(u64);
5290 }
5291
5292 header->size += size;
5293 }
c5ebcedb
JO
5294
5295 if (sample_type & PERF_SAMPLE_STACK_USER) {
5296 /*
5297 * Either we need PERF_SAMPLE_STACK_USER bit to be allways
5298 * processed as the last one or have additional check added
5299 * in case new sample type is added, because we could eat
5300 * up the rest of the sample size.
5301 */
c5ebcedb
JO
5302 u16 stack_size = event->attr.sample_stack_user;
5303 u16 size = sizeof(u64);
5304
c5ebcedb 5305 stack_size = perf_sample_ustack_size(stack_size, header->size,
2565711f 5306 data->regs_user.regs);
c5ebcedb
JO
5307
5308 /*
5309 * If there is something to dump, add space for the dump
5310 * itself and for the field that tells the dynamic size,
5311 * which is how many have been actually dumped.
5312 */
5313 if (stack_size)
5314 size += sizeof(u64) + stack_size;
5315
5316 data->stack_user_size = stack_size;
5317 header->size += size;
5318 }
60e2364e
SE
5319
5320 if (sample_type & PERF_SAMPLE_REGS_INTR) {
5321 /* regs dump ABI info */
5322 int size = sizeof(u64);
5323
5324 perf_sample_regs_intr(&data->regs_intr, regs);
5325
5326 if (data->regs_intr.regs) {
5327 u64 mask = event->attr.sample_regs_intr;
5328
5329 size += hweight64(mask) * sizeof(u64);
5330 }
5331
5332 header->size += size;
5333 }
5622f295 5334}
7f453c24 5335
a8b0ca17 5336static void perf_event_output(struct perf_event *event,
5622f295
MM
5337 struct perf_sample_data *data,
5338 struct pt_regs *regs)
5339{
5340 struct perf_output_handle handle;
5341 struct perf_event_header header;
689802b2 5342
927c7a9e
FW
5343 /* protect the callchain buffers */
5344 rcu_read_lock();
5345
cdd6c482 5346 perf_prepare_sample(&header, data, event, regs);
5c148194 5347
a7ac67ea 5348 if (perf_output_begin(&handle, event, header.size))
927c7a9e 5349 goto exit;
0322cd6e 5350
cdd6c482 5351 perf_output_sample(&handle, &header, data, event);
f413cdb8 5352
8a057d84 5353 perf_output_end(&handle);
927c7a9e
FW
5354
5355exit:
5356 rcu_read_unlock();
0322cd6e
PZ
5357}
5358
38b200d6 5359/*
cdd6c482 5360 * read event_id
38b200d6
PZ
5361 */
5362
5363struct perf_read_event {
5364 struct perf_event_header header;
5365
5366 u32 pid;
5367 u32 tid;
38b200d6
PZ
5368};
5369
5370static void
cdd6c482 5371perf_event_read_event(struct perf_event *event,
38b200d6
PZ
5372 struct task_struct *task)
5373{
5374 struct perf_output_handle handle;
c980d109 5375 struct perf_sample_data sample;
dfc65094 5376 struct perf_read_event read_event = {
38b200d6 5377 .header = {
cdd6c482 5378 .type = PERF_RECORD_READ,
38b200d6 5379 .misc = 0,
c320c7b7 5380 .size = sizeof(read_event) + event->read_size,
38b200d6 5381 },
cdd6c482
IM
5382 .pid = perf_event_pid(event, task),
5383 .tid = perf_event_tid(event, task),
38b200d6 5384 };
3dab77fb 5385 int ret;
38b200d6 5386
c980d109 5387 perf_event_header__init_id(&read_event.header, &sample, event);
a7ac67ea 5388 ret = perf_output_begin(&handle, event, read_event.header.size);
38b200d6
PZ
5389 if (ret)
5390 return;
5391
dfc65094 5392 perf_output_put(&handle, read_event);
cdd6c482 5393 perf_output_read(&handle, event);
c980d109 5394 perf_event__output_id_sample(event, &handle, &sample);
3dab77fb 5395
38b200d6
PZ
5396 perf_output_end(&handle);
5397}
5398
52d857a8
JO
5399typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
5400
5401static void
5402perf_event_aux_ctx(struct perf_event_context *ctx,
52d857a8
JO
5403 perf_event_aux_output_cb output,
5404 void *data)
5405{
5406 struct perf_event *event;
5407
5408 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
5409 if (event->state < PERF_EVENT_STATE_INACTIVE)
5410 continue;
5411 if (!event_filter_match(event))
5412 continue;
67516844 5413 output(event, data);
52d857a8
JO
5414 }
5415}
5416
5417static void
67516844 5418perf_event_aux(perf_event_aux_output_cb output, void *data,
52d857a8
JO
5419 struct perf_event_context *task_ctx)
5420{
5421 struct perf_cpu_context *cpuctx;
5422 struct perf_event_context *ctx;
5423 struct pmu *pmu;
5424 int ctxn;
5425
5426 rcu_read_lock();
5427 list_for_each_entry_rcu(pmu, &pmus, entry) {
5428 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
5429 if (cpuctx->unique_pmu != pmu)
5430 goto next;
67516844 5431 perf_event_aux_ctx(&cpuctx->ctx, output, data);
52d857a8
JO
5432 if (task_ctx)
5433 goto next;
5434 ctxn = pmu->task_ctx_nr;
5435 if (ctxn < 0)
5436 goto next;
5437 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
5438 if (ctx)
67516844 5439 perf_event_aux_ctx(ctx, output, data);
52d857a8
JO
5440next:
5441 put_cpu_ptr(pmu->pmu_cpu_context);
5442 }
5443
5444 if (task_ctx) {
5445 preempt_disable();
67516844 5446 perf_event_aux_ctx(task_ctx, output, data);
52d857a8
JO
5447 preempt_enable();
5448 }
5449 rcu_read_unlock();
5450}
5451
60313ebe 5452/*
9f498cc5
PZ
5453 * task tracking -- fork/exit
5454 *
13d7a241 5455 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
60313ebe
PZ
5456 */
5457
9f498cc5 5458struct perf_task_event {
3a80b4a3 5459 struct task_struct *task;
cdd6c482 5460 struct perf_event_context *task_ctx;
60313ebe
PZ
5461
5462 struct {
5463 struct perf_event_header header;
5464
5465 u32 pid;
5466 u32 ppid;
9f498cc5
PZ
5467 u32 tid;
5468 u32 ptid;
393b2ad8 5469 u64 time;
cdd6c482 5470 } event_id;
60313ebe
PZ
5471};
5472
67516844
JO
5473static int perf_event_task_match(struct perf_event *event)
5474{
13d7a241
SE
5475 return event->attr.comm || event->attr.mmap ||
5476 event->attr.mmap2 || event->attr.mmap_data ||
5477 event->attr.task;
67516844
JO
5478}
5479
cdd6c482 5480static void perf_event_task_output(struct perf_event *event,
52d857a8 5481 void *data)
60313ebe 5482{
52d857a8 5483 struct perf_task_event *task_event = data;
60313ebe 5484 struct perf_output_handle handle;
c980d109 5485 struct perf_sample_data sample;
9f498cc5 5486 struct task_struct *task = task_event->task;
c980d109 5487 int ret, size = task_event->event_id.header.size;
8bb39f9a 5488
67516844
JO
5489 if (!perf_event_task_match(event))
5490 return;
5491
c980d109 5492 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
60313ebe 5493
c980d109 5494 ret = perf_output_begin(&handle, event,
a7ac67ea 5495 task_event->event_id.header.size);
ef60777c 5496 if (ret)
c980d109 5497 goto out;
60313ebe 5498
cdd6c482
IM
5499 task_event->event_id.pid = perf_event_pid(event, task);
5500 task_event->event_id.ppid = perf_event_pid(event, current);
60313ebe 5501
cdd6c482
IM
5502 task_event->event_id.tid = perf_event_tid(event, task);
5503 task_event->event_id.ptid = perf_event_tid(event, current);
9f498cc5 5504
34f43927
PZ
5505 task_event->event_id.time = perf_event_clock(event);
5506
cdd6c482 5507 perf_output_put(&handle, task_event->event_id);
393b2ad8 5508
c980d109
ACM
5509 perf_event__output_id_sample(event, &handle, &sample);
5510
60313ebe 5511 perf_output_end(&handle);
c980d109
ACM
5512out:
5513 task_event->event_id.header.size = size;
60313ebe
PZ
5514}
5515
cdd6c482
IM
5516static void perf_event_task(struct task_struct *task,
5517 struct perf_event_context *task_ctx,
3a80b4a3 5518 int new)
60313ebe 5519{
9f498cc5 5520 struct perf_task_event task_event;
60313ebe 5521
cdd6c482
IM
5522 if (!atomic_read(&nr_comm_events) &&
5523 !atomic_read(&nr_mmap_events) &&
5524 !atomic_read(&nr_task_events))
60313ebe
PZ
5525 return;
5526
9f498cc5 5527 task_event = (struct perf_task_event){
3a80b4a3
PZ
5528 .task = task,
5529 .task_ctx = task_ctx,
cdd6c482 5530 .event_id = {
60313ebe 5531 .header = {
cdd6c482 5532 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
573402db 5533 .misc = 0,
cdd6c482 5534 .size = sizeof(task_event.event_id),
60313ebe 5535 },
573402db
PZ
5536 /* .pid */
5537 /* .ppid */
9f498cc5
PZ
5538 /* .tid */
5539 /* .ptid */
34f43927 5540 /* .time */
60313ebe
PZ
5541 },
5542 };
5543
67516844 5544 perf_event_aux(perf_event_task_output,
52d857a8
JO
5545 &task_event,
5546 task_ctx);
9f498cc5
PZ
5547}
5548
cdd6c482 5549void perf_event_fork(struct task_struct *task)
9f498cc5 5550{
cdd6c482 5551 perf_event_task(task, NULL, 1);
60313ebe
PZ
5552}
5553
8d1b2d93
PZ
5554/*
5555 * comm tracking
5556 */
5557
5558struct perf_comm_event {
22a4f650
IM
5559 struct task_struct *task;
5560 char *comm;
8d1b2d93
PZ
5561 int comm_size;
5562
5563 struct {
5564 struct perf_event_header header;
5565
5566 u32 pid;
5567 u32 tid;
cdd6c482 5568 } event_id;
8d1b2d93
PZ
5569};
5570
67516844
JO
5571static int perf_event_comm_match(struct perf_event *event)
5572{
5573 return event->attr.comm;
5574}
5575
cdd6c482 5576static void perf_event_comm_output(struct perf_event *event,
52d857a8 5577 void *data)
8d1b2d93 5578{
52d857a8 5579 struct perf_comm_event *comm_event = data;
8d1b2d93 5580 struct perf_output_handle handle;
c980d109 5581 struct perf_sample_data sample;
cdd6c482 5582 int size = comm_event->event_id.header.size;
c980d109
ACM
5583 int ret;
5584
67516844
JO
5585 if (!perf_event_comm_match(event))
5586 return;
5587
c980d109
ACM
5588 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
5589 ret = perf_output_begin(&handle, event,
a7ac67ea 5590 comm_event->event_id.header.size);
8d1b2d93
PZ
5591
5592 if (ret)
c980d109 5593 goto out;
8d1b2d93 5594
cdd6c482
IM
5595 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
5596 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
709e50cf 5597
cdd6c482 5598 perf_output_put(&handle, comm_event->event_id);
76369139 5599 __output_copy(&handle, comm_event->comm,
8d1b2d93 5600 comm_event->comm_size);
c980d109
ACM
5601
5602 perf_event__output_id_sample(event, &handle, &sample);
5603
8d1b2d93 5604 perf_output_end(&handle);
c980d109
ACM
5605out:
5606 comm_event->event_id.header.size = size;
8d1b2d93
PZ
5607}
5608
cdd6c482 5609static void perf_event_comm_event(struct perf_comm_event *comm_event)
8d1b2d93 5610{
413ee3b4 5611 char comm[TASK_COMM_LEN];
8d1b2d93 5612 unsigned int size;
8d1b2d93 5613
413ee3b4 5614 memset(comm, 0, sizeof(comm));
96b02d78 5615 strlcpy(comm, comm_event->task->comm, sizeof(comm));
888fcee0 5616 size = ALIGN(strlen(comm)+1, sizeof(u64));
8d1b2d93
PZ
5617
5618 comm_event->comm = comm;
5619 comm_event->comm_size = size;
5620
cdd6c482 5621 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
8dc85d54 5622
67516844 5623 perf_event_aux(perf_event_comm_output,
52d857a8
JO
5624 comm_event,
5625 NULL);
8d1b2d93
PZ
5626}
5627
82b89778 5628void perf_event_comm(struct task_struct *task, bool exec)
8d1b2d93 5629{
9ee318a7
PZ
5630 struct perf_comm_event comm_event;
5631
cdd6c482 5632 if (!atomic_read(&nr_comm_events))
9ee318a7 5633 return;
a63eaf34 5634
9ee318a7 5635 comm_event = (struct perf_comm_event){
8d1b2d93 5636 .task = task,
573402db
PZ
5637 /* .comm */
5638 /* .comm_size */
cdd6c482 5639 .event_id = {
573402db 5640 .header = {
cdd6c482 5641 .type = PERF_RECORD_COMM,
82b89778 5642 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
573402db
PZ
5643 /* .size */
5644 },
5645 /* .pid */
5646 /* .tid */
8d1b2d93
PZ
5647 },
5648 };
5649
cdd6c482 5650 perf_event_comm_event(&comm_event);
8d1b2d93
PZ
5651}
5652
0a4a9391
PZ
5653/*
5654 * mmap tracking
5655 */
5656
5657struct perf_mmap_event {
089dd79d
PZ
5658 struct vm_area_struct *vma;
5659
5660 const char *file_name;
5661 int file_size;
13d7a241
SE
5662 int maj, min;
5663 u64 ino;
5664 u64 ino_generation;
f972eb63 5665 u32 prot, flags;
0a4a9391
PZ
5666
5667 struct {
5668 struct perf_event_header header;
5669
5670 u32 pid;
5671 u32 tid;
5672 u64 start;
5673 u64 len;
5674 u64 pgoff;
cdd6c482 5675 } event_id;
0a4a9391
PZ
5676};
5677
67516844
JO
5678static int perf_event_mmap_match(struct perf_event *event,
5679 void *data)
5680{
5681 struct perf_mmap_event *mmap_event = data;
5682 struct vm_area_struct *vma = mmap_event->vma;
5683 int executable = vma->vm_flags & VM_EXEC;
5684
5685 return (!executable && event->attr.mmap_data) ||
13d7a241 5686 (executable && (event->attr.mmap || event->attr.mmap2));
67516844
JO
5687}
5688
cdd6c482 5689static void perf_event_mmap_output(struct perf_event *event,
52d857a8 5690 void *data)
0a4a9391 5691{
52d857a8 5692 struct perf_mmap_event *mmap_event = data;
0a4a9391 5693 struct perf_output_handle handle;
c980d109 5694 struct perf_sample_data sample;
cdd6c482 5695 int size = mmap_event->event_id.header.size;
c980d109 5696 int ret;
0a4a9391 5697
67516844
JO
5698 if (!perf_event_mmap_match(event, data))
5699 return;
5700
13d7a241
SE
5701 if (event->attr.mmap2) {
5702 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
5703 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
5704 mmap_event->event_id.header.size += sizeof(mmap_event->min);
5705 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
d008d525 5706 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
f972eb63
PZ
5707 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
5708 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
13d7a241
SE
5709 }
5710
c980d109
ACM
5711 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
5712 ret = perf_output_begin(&handle, event,
a7ac67ea 5713 mmap_event->event_id.header.size);
0a4a9391 5714 if (ret)
c980d109 5715 goto out;
0a4a9391 5716
cdd6c482
IM
5717 mmap_event->event_id.pid = perf_event_pid(event, current);
5718 mmap_event->event_id.tid = perf_event_tid(event, current);
709e50cf 5719
cdd6c482 5720 perf_output_put(&handle, mmap_event->event_id);
13d7a241
SE
5721
5722 if (event->attr.mmap2) {
5723 perf_output_put(&handle, mmap_event->maj);
5724 perf_output_put(&handle, mmap_event->min);
5725 perf_output_put(&handle, mmap_event->ino);
5726 perf_output_put(&handle, mmap_event->ino_generation);
f972eb63
PZ
5727 perf_output_put(&handle, mmap_event->prot);
5728 perf_output_put(&handle, mmap_event->flags);
13d7a241
SE
5729 }
5730
76369139 5731 __output_copy(&handle, mmap_event->file_name,
0a4a9391 5732 mmap_event->file_size);
c980d109
ACM
5733
5734 perf_event__output_id_sample(event, &handle, &sample);
5735
78d613eb 5736 perf_output_end(&handle);
c980d109
ACM
5737out:
5738 mmap_event->event_id.header.size = size;
0a4a9391
PZ
5739}
5740
cdd6c482 5741static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
0a4a9391 5742{
089dd79d
PZ
5743 struct vm_area_struct *vma = mmap_event->vma;
5744 struct file *file = vma->vm_file;
13d7a241
SE
5745 int maj = 0, min = 0;
5746 u64 ino = 0, gen = 0;
f972eb63 5747 u32 prot = 0, flags = 0;
0a4a9391
PZ
5748 unsigned int size;
5749 char tmp[16];
5750 char *buf = NULL;
2c42cfbf 5751 char *name;
413ee3b4 5752
0a4a9391 5753 if (file) {
13d7a241
SE
5754 struct inode *inode;
5755 dev_t dev;
3ea2f2b9 5756
2c42cfbf 5757 buf = kmalloc(PATH_MAX, GFP_KERNEL);
0a4a9391 5758 if (!buf) {
c7e548b4
ON
5759 name = "//enomem";
5760 goto cpy_name;
0a4a9391 5761 }
413ee3b4 5762 /*
3ea2f2b9 5763 * d_path() works from the end of the rb backwards, so we
413ee3b4
AB
5764 * need to add enough zero bytes after the string to handle
5765 * the 64bit alignment we do later.
5766 */
3ea2f2b9 5767 name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
0a4a9391 5768 if (IS_ERR(name)) {
c7e548b4
ON
5769 name = "//toolong";
5770 goto cpy_name;
0a4a9391 5771 }
13d7a241
SE
5772 inode = file_inode(vma->vm_file);
5773 dev = inode->i_sb->s_dev;
5774 ino = inode->i_ino;
5775 gen = inode->i_generation;
5776 maj = MAJOR(dev);
5777 min = MINOR(dev);
f972eb63
PZ
5778
5779 if (vma->vm_flags & VM_READ)
5780 prot |= PROT_READ;
5781 if (vma->vm_flags & VM_WRITE)
5782 prot |= PROT_WRITE;
5783 if (vma->vm_flags & VM_EXEC)
5784 prot |= PROT_EXEC;
5785
5786 if (vma->vm_flags & VM_MAYSHARE)
5787 flags = MAP_SHARED;
5788 else
5789 flags = MAP_PRIVATE;
5790
5791 if (vma->vm_flags & VM_DENYWRITE)
5792 flags |= MAP_DENYWRITE;
5793 if (vma->vm_flags & VM_MAYEXEC)
5794 flags |= MAP_EXECUTABLE;
5795 if (vma->vm_flags & VM_LOCKED)
5796 flags |= MAP_LOCKED;
5797 if (vma->vm_flags & VM_HUGETLB)
5798 flags |= MAP_HUGETLB;
5799
c7e548b4 5800 goto got_name;
0a4a9391 5801 } else {
fbe26abe
JO
5802 if (vma->vm_ops && vma->vm_ops->name) {
5803 name = (char *) vma->vm_ops->name(vma);
5804 if (name)
5805 goto cpy_name;
5806 }
5807
2c42cfbf 5808 name = (char *)arch_vma_name(vma);
c7e548b4
ON
5809 if (name)
5810 goto cpy_name;
089dd79d 5811
32c5fb7e 5812 if (vma->vm_start <= vma->vm_mm->start_brk &&
3af9e859 5813 vma->vm_end >= vma->vm_mm->brk) {
c7e548b4
ON
5814 name = "[heap]";
5815 goto cpy_name;
32c5fb7e
ON
5816 }
5817 if (vma->vm_start <= vma->vm_mm->start_stack &&
3af9e859 5818 vma->vm_end >= vma->vm_mm->start_stack) {
c7e548b4
ON
5819 name = "[stack]";
5820 goto cpy_name;
089dd79d
PZ
5821 }
5822
c7e548b4
ON
5823 name = "//anon";
5824 goto cpy_name;
0a4a9391
PZ
5825 }
5826
c7e548b4
ON
5827cpy_name:
5828 strlcpy(tmp, name, sizeof(tmp));
5829 name = tmp;
0a4a9391 5830got_name:
2c42cfbf
PZ
5831 /*
5832 * Since our buffer works in 8 byte units we need to align our string
5833 * size to a multiple of 8. However, we must guarantee the tail end is
5834 * zero'd out to avoid leaking random bits to userspace.
5835 */
5836 size = strlen(name)+1;
5837 while (!IS_ALIGNED(size, sizeof(u64)))
5838 name[size++] = '\0';
0a4a9391
PZ
5839
5840 mmap_event->file_name = name;
5841 mmap_event->file_size = size;
13d7a241
SE
5842 mmap_event->maj = maj;
5843 mmap_event->min = min;
5844 mmap_event->ino = ino;
5845 mmap_event->ino_generation = gen;
f972eb63
PZ
5846 mmap_event->prot = prot;
5847 mmap_event->flags = flags;
0a4a9391 5848
2fe85427
SE
5849 if (!(vma->vm_flags & VM_EXEC))
5850 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
5851
cdd6c482 5852 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
0a4a9391 5853
67516844 5854 perf_event_aux(perf_event_mmap_output,
52d857a8
JO
5855 mmap_event,
5856 NULL);
665c2142 5857
0a4a9391
PZ
5858 kfree(buf);
5859}
5860
3af9e859 5861void perf_event_mmap(struct vm_area_struct *vma)
0a4a9391 5862{
9ee318a7
PZ
5863 struct perf_mmap_event mmap_event;
5864
cdd6c482 5865 if (!atomic_read(&nr_mmap_events))
9ee318a7
PZ
5866 return;
5867
5868 mmap_event = (struct perf_mmap_event){
089dd79d 5869 .vma = vma,
573402db
PZ
5870 /* .file_name */
5871 /* .file_size */
cdd6c482 5872 .event_id = {
573402db 5873 .header = {
cdd6c482 5874 .type = PERF_RECORD_MMAP,
39447b38 5875 .misc = PERF_RECORD_MISC_USER,
573402db
PZ
5876 /* .size */
5877 },
5878 /* .pid */
5879 /* .tid */
089dd79d
PZ
5880 .start = vma->vm_start,
5881 .len = vma->vm_end - vma->vm_start,
3a0304e9 5882 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
0a4a9391 5883 },
13d7a241
SE
5884 /* .maj (attr_mmap2 only) */
5885 /* .min (attr_mmap2 only) */
5886 /* .ino (attr_mmap2 only) */
5887 /* .ino_generation (attr_mmap2 only) */
f972eb63
PZ
5888 /* .prot (attr_mmap2 only) */
5889 /* .flags (attr_mmap2 only) */
0a4a9391
PZ
5890 };
5891
cdd6c482 5892 perf_event_mmap_event(&mmap_event);
0a4a9391
PZ
5893}
5894
68db7e98
AS
5895void perf_event_aux_event(struct perf_event *event, unsigned long head,
5896 unsigned long size, u64 flags)
5897{
5898 struct perf_output_handle handle;
5899 struct perf_sample_data sample;
5900 struct perf_aux_event {
5901 struct perf_event_header header;
5902 u64 offset;
5903 u64 size;
5904 u64 flags;
5905 } rec = {
5906 .header = {
5907 .type = PERF_RECORD_AUX,
5908 .misc = 0,
5909 .size = sizeof(rec),
5910 },
5911 .offset = head,
5912 .size = size,
5913 .flags = flags,
5914 };
5915 int ret;
5916
5917 perf_event_header__init_id(&rec.header, &sample, event);
5918 ret = perf_output_begin(&handle, event, rec.header.size);
5919
5920 if (ret)
5921 return;
5922
5923 perf_output_put(&handle, rec);
5924 perf_event__output_id_sample(event, &handle, &sample);
5925
5926 perf_output_end(&handle);
5927}
5928
a78ac325
PZ
5929/*
5930 * IRQ throttle logging
5931 */
5932
cdd6c482 5933static void perf_log_throttle(struct perf_event *event, int enable)
a78ac325
PZ
5934{
5935 struct perf_output_handle handle;
c980d109 5936 struct perf_sample_data sample;
a78ac325
PZ
5937 int ret;
5938
5939 struct {
5940 struct perf_event_header header;
5941 u64 time;
cca3f454 5942 u64 id;
7f453c24 5943 u64 stream_id;
a78ac325
PZ
5944 } throttle_event = {
5945 .header = {
cdd6c482 5946 .type = PERF_RECORD_THROTTLE,
a78ac325
PZ
5947 .misc = 0,
5948 .size = sizeof(throttle_event),
5949 },
34f43927 5950 .time = perf_event_clock(event),
cdd6c482
IM
5951 .id = primary_event_id(event),
5952 .stream_id = event->id,
a78ac325
PZ
5953 };
5954
966ee4d6 5955 if (enable)
cdd6c482 5956 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
966ee4d6 5957
c980d109
ACM
5958 perf_event_header__init_id(&throttle_event.header, &sample, event);
5959
5960 ret = perf_output_begin(&handle, event,
a7ac67ea 5961 throttle_event.header.size);
a78ac325
PZ
5962 if (ret)
5963 return;
5964
5965 perf_output_put(&handle, throttle_event);
c980d109 5966 perf_event__output_id_sample(event, &handle, &sample);
a78ac325
PZ
5967 perf_output_end(&handle);
5968}
5969
ec0d7729
AS
5970static void perf_log_itrace_start(struct perf_event *event)
5971{
5972 struct perf_output_handle handle;
5973 struct perf_sample_data sample;
5974 struct perf_aux_event {
5975 struct perf_event_header header;
5976 u32 pid;
5977 u32 tid;
5978 } rec;
5979 int ret;
5980
5981 if (event->parent)
5982 event = event->parent;
5983
5984 if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
5985 event->hw.itrace_started)
5986 return;
5987
5988 event->hw.itrace_started = 1;
5989
5990 rec.header.type = PERF_RECORD_ITRACE_START;
5991 rec.header.misc = 0;
5992 rec.header.size = sizeof(rec);
5993 rec.pid = perf_event_pid(event, current);
5994 rec.tid = perf_event_tid(event, current);
5995
5996 perf_event_header__init_id(&rec.header, &sample, event);
5997 ret = perf_output_begin(&handle, event, rec.header.size);
5998
5999 if (ret)
6000 return;
6001
6002 perf_output_put(&handle, rec);
6003 perf_event__output_id_sample(event, &handle, &sample);
6004
6005 perf_output_end(&handle);
6006}
6007
f6c7d5fe 6008/*
cdd6c482 6009 * Generic event overflow handling, sampling.
f6c7d5fe
PZ
6010 */
6011
a8b0ca17 6012static int __perf_event_overflow(struct perf_event *event,
5622f295
MM
6013 int throttle, struct perf_sample_data *data,
6014 struct pt_regs *regs)
f6c7d5fe 6015{
cdd6c482
IM
6016 int events = atomic_read(&event->event_limit);
6017 struct hw_perf_event *hwc = &event->hw;
e050e3f0 6018 u64 seq;
79f14641
PZ
6019 int ret = 0;
6020
96398826
PZ
6021 /*
6022 * Non-sampling counters might still use the PMI to fold short
6023 * hardware counters, ignore those.
6024 */
6025 if (unlikely(!is_sampling_event(event)))
6026 return 0;
6027
e050e3f0
SE
6028 seq = __this_cpu_read(perf_throttled_seq);
6029 if (seq != hwc->interrupts_seq) {
6030 hwc->interrupts_seq = seq;
6031 hwc->interrupts = 1;
6032 } else {
6033 hwc->interrupts++;
6034 if (unlikely(throttle
6035 && hwc->interrupts >= max_samples_per_tick)) {
6036 __this_cpu_inc(perf_throttled_count);
163ec435
PZ
6037 hwc->interrupts = MAX_INTERRUPTS;
6038 perf_log_throttle(event, 0);
d84153d6 6039 tick_nohz_full_kick();
a78ac325
PZ
6040 ret = 1;
6041 }
e050e3f0 6042 }
60db5e09 6043
cdd6c482 6044 if (event->attr.freq) {
def0a9b2 6045 u64 now = perf_clock();
abd50713 6046 s64 delta = now - hwc->freq_time_stamp;
bd2b5b12 6047
abd50713 6048 hwc->freq_time_stamp = now;
bd2b5b12 6049
abd50713 6050 if (delta > 0 && delta < 2*TICK_NSEC)
f39d47ff 6051 perf_adjust_period(event, delta, hwc->last_period, true);
bd2b5b12
PZ
6052 }
6053
2023b359
PZ
6054 /*
6055 * XXX event_limit might not quite work as expected on inherited
cdd6c482 6056 * events
2023b359
PZ
6057 */
6058
cdd6c482
IM
6059 event->pending_kill = POLL_IN;
6060 if (events && atomic_dec_and_test(&event->event_limit)) {
79f14641 6061 ret = 1;
cdd6c482 6062 event->pending_kill = POLL_HUP;
a8b0ca17
PZ
6063 event->pending_disable = 1;
6064 irq_work_queue(&event->pending);
79f14641
PZ
6065 }
6066
453f19ee 6067 if (event->overflow_handler)
a8b0ca17 6068 event->overflow_handler(event, data, regs);
453f19ee 6069 else
a8b0ca17 6070 perf_event_output(event, data, regs);
453f19ee 6071
f506b3dc 6072 if (event->fasync && event->pending_kill) {
a8b0ca17
PZ
6073 event->pending_wakeup = 1;
6074 irq_work_queue(&event->pending);
f506b3dc
PZ
6075 }
6076
79f14641 6077 return ret;
f6c7d5fe
PZ
6078}
6079
a8b0ca17 6080int perf_event_overflow(struct perf_event *event,
5622f295
MM
6081 struct perf_sample_data *data,
6082 struct pt_regs *regs)
850bc73f 6083{
a8b0ca17 6084 return __perf_event_overflow(event, 1, data, regs);
850bc73f
PZ
6085}
6086
15dbf27c 6087/*
cdd6c482 6088 * Generic software event infrastructure
15dbf27c
PZ
6089 */
6090
b28ab83c
PZ
6091struct swevent_htable {
6092 struct swevent_hlist *swevent_hlist;
6093 struct mutex hlist_mutex;
6094 int hlist_refcount;
6095
6096 /* Recursion avoidance in each contexts */
6097 int recursion[PERF_NR_CONTEXTS];
39af6b16
JO
6098
6099 /* Keeps track of cpu being initialized/exited */
6100 bool online;
b28ab83c
PZ
6101};
6102
6103static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
6104
7b4b6658 6105/*
cdd6c482
IM
6106 * We directly increment event->count and keep a second value in
6107 * event->hw.period_left to count intervals. This period event
7b4b6658
PZ
6108 * is kept in the range [-sample_period, 0] so that we can use the
6109 * sign as trigger.
6110 */
6111
ab573844 6112u64 perf_swevent_set_period(struct perf_event *event)
15dbf27c 6113{
cdd6c482 6114 struct hw_perf_event *hwc = &event->hw;
7b4b6658
PZ
6115 u64 period = hwc->last_period;
6116 u64 nr, offset;
6117 s64 old, val;
6118
6119 hwc->last_period = hwc->sample_period;
15dbf27c
PZ
6120
6121again:
e7850595 6122 old = val = local64_read(&hwc->period_left);
7b4b6658
PZ
6123 if (val < 0)
6124 return 0;
15dbf27c 6125
7b4b6658
PZ
6126 nr = div64_u64(period + val, period);
6127 offset = nr * period;
6128 val -= offset;
e7850595 6129 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
7b4b6658 6130 goto again;
15dbf27c 6131
7b4b6658 6132 return nr;
15dbf27c
PZ
6133}
6134
0cff784a 6135static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
a8b0ca17 6136 struct perf_sample_data *data,
5622f295 6137 struct pt_regs *regs)
15dbf27c 6138{
cdd6c482 6139 struct hw_perf_event *hwc = &event->hw;
850bc73f 6140 int throttle = 0;
15dbf27c 6141
0cff784a
PZ
6142 if (!overflow)
6143 overflow = perf_swevent_set_period(event);
15dbf27c 6144
7b4b6658
PZ
6145 if (hwc->interrupts == MAX_INTERRUPTS)
6146 return;
15dbf27c 6147
7b4b6658 6148 for (; overflow; overflow--) {
a8b0ca17 6149 if (__perf_event_overflow(event, throttle,
5622f295 6150 data, regs)) {
7b4b6658
PZ
6151 /*
6152 * We inhibit the overflow from happening when
6153 * hwc->interrupts == MAX_INTERRUPTS.
6154 */
6155 break;
6156 }
cf450a73 6157 throttle = 1;
7b4b6658 6158 }
15dbf27c
PZ
6159}
6160
a4eaf7f1 6161static void perf_swevent_event(struct perf_event *event, u64 nr,
a8b0ca17 6162 struct perf_sample_data *data,
5622f295 6163 struct pt_regs *regs)
7b4b6658 6164{
cdd6c482 6165 struct hw_perf_event *hwc = &event->hw;
d6d020e9 6166
e7850595 6167 local64_add(nr, &event->count);
d6d020e9 6168
0cff784a
PZ
6169 if (!regs)
6170 return;
6171
6c7e550f 6172 if (!is_sampling_event(event))
7b4b6658 6173 return;
d6d020e9 6174
5d81e5cf
AV
6175 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
6176 data->period = nr;
6177 return perf_swevent_overflow(event, 1, data, regs);
6178 } else
6179 data->period = event->hw.last_period;
6180
0cff784a 6181 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
a8b0ca17 6182 return perf_swevent_overflow(event, 1, data, regs);
0cff784a 6183
e7850595 6184 if (local64_add_negative(nr, &hwc->period_left))
7b4b6658 6185 return;
df1a132b 6186
a8b0ca17 6187 perf_swevent_overflow(event, 0, data, regs);
d6d020e9
PZ
6188}
6189
f5ffe02e
FW
6190static int perf_exclude_event(struct perf_event *event,
6191 struct pt_regs *regs)
6192{
a4eaf7f1 6193 if (event->hw.state & PERF_HES_STOPPED)
91b2f482 6194 return 1;
a4eaf7f1 6195
f5ffe02e
FW
6196 if (regs) {
6197 if (event->attr.exclude_user && user_mode(regs))
6198 return 1;
6199
6200 if (event->attr.exclude_kernel && !user_mode(regs))
6201 return 1;
6202 }
6203
6204 return 0;
6205}
6206
cdd6c482 6207static int perf_swevent_match(struct perf_event *event,
1c432d89 6208 enum perf_type_id type,
6fb2915d
LZ
6209 u32 event_id,
6210 struct perf_sample_data *data,
6211 struct pt_regs *regs)
15dbf27c 6212{
cdd6c482 6213 if (event->attr.type != type)
a21ca2ca 6214 return 0;
f5ffe02e 6215
cdd6c482 6216 if (event->attr.config != event_id)
15dbf27c
PZ
6217 return 0;
6218
f5ffe02e
FW
6219 if (perf_exclude_event(event, regs))
6220 return 0;
15dbf27c
PZ
6221
6222 return 1;
6223}
6224
76e1d904
FW
6225static inline u64 swevent_hash(u64 type, u32 event_id)
6226{
6227 u64 val = event_id | (type << 32);
6228
6229 return hash_64(val, SWEVENT_HLIST_BITS);
6230}
6231
49f135ed
FW
6232static inline struct hlist_head *
6233__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
76e1d904 6234{
49f135ed
FW
6235 u64 hash = swevent_hash(type, event_id);
6236
6237 return &hlist->heads[hash];
6238}
76e1d904 6239
49f135ed
FW
6240/* For the read side: events when they trigger */
6241static inline struct hlist_head *
b28ab83c 6242find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
49f135ed
FW
6243{
6244 struct swevent_hlist *hlist;
76e1d904 6245
b28ab83c 6246 hlist = rcu_dereference(swhash->swevent_hlist);
76e1d904
FW
6247 if (!hlist)
6248 return NULL;
6249
49f135ed
FW
6250 return __find_swevent_head(hlist, type, event_id);
6251}
6252
6253/* For the event head insertion and removal in the hlist */
6254static inline struct hlist_head *
b28ab83c 6255find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
49f135ed
FW
6256{
6257 struct swevent_hlist *hlist;
6258 u32 event_id = event->attr.config;
6259 u64 type = event->attr.type;
6260
6261 /*
6262 * Event scheduling is always serialized against hlist allocation
6263 * and release. Which makes the protected version suitable here.
6264 * The context lock guarantees that.
6265 */
b28ab83c 6266 hlist = rcu_dereference_protected(swhash->swevent_hlist,
49f135ed
FW
6267 lockdep_is_held(&event->ctx->lock));
6268 if (!hlist)
6269 return NULL;
6270
6271 return __find_swevent_head(hlist, type, event_id);
76e1d904
FW
6272}
6273
6274static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
a8b0ca17 6275 u64 nr,
76e1d904
FW
6276 struct perf_sample_data *data,
6277 struct pt_regs *regs)
15dbf27c 6278{
4a32fea9 6279 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
cdd6c482 6280 struct perf_event *event;
76e1d904 6281 struct hlist_head *head;
15dbf27c 6282
76e1d904 6283 rcu_read_lock();
b28ab83c 6284 head = find_swevent_head_rcu(swhash, type, event_id);
76e1d904
FW
6285 if (!head)
6286 goto end;
6287
b67bfe0d 6288 hlist_for_each_entry_rcu(event, head, hlist_entry) {
6fb2915d 6289 if (perf_swevent_match(event, type, event_id, data, regs))
a8b0ca17 6290 perf_swevent_event(event, nr, data, regs);
15dbf27c 6291 }
76e1d904
FW
6292end:
6293 rcu_read_unlock();
15dbf27c
PZ
6294}
6295
86038c5e
PZI
6296DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
6297
4ed7c92d 6298int perf_swevent_get_recursion_context(void)
96f6d444 6299{
4a32fea9 6300 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
96f6d444 6301
b28ab83c 6302 return get_recursion_context(swhash->recursion);
96f6d444 6303}
645e8cc0 6304EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
96f6d444 6305
fa9f90be 6306inline void perf_swevent_put_recursion_context(int rctx)
15dbf27c 6307{
4a32fea9 6308 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
927c7a9e 6309
b28ab83c 6310 put_recursion_context(swhash->recursion, rctx);
ce71b9df 6311}
15dbf27c 6312
86038c5e 6313void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
b8e83514 6314{
a4234bfc 6315 struct perf_sample_data data;
4ed7c92d 6316
86038c5e 6317 if (WARN_ON_ONCE(!regs))
4ed7c92d 6318 return;
a4234bfc 6319
fd0d000b 6320 perf_sample_data_init(&data, addr, 0);
a8b0ca17 6321 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
86038c5e
PZI
6322}
6323
6324void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
6325{
6326 int rctx;
6327
6328 preempt_disable_notrace();
6329 rctx = perf_swevent_get_recursion_context();
6330 if (unlikely(rctx < 0))
6331 goto fail;
6332
6333 ___perf_sw_event(event_id, nr, regs, addr);
4ed7c92d
PZ
6334
6335 perf_swevent_put_recursion_context(rctx);
86038c5e 6336fail:
1c024eca 6337 preempt_enable_notrace();
b8e83514
PZ
6338}
6339
cdd6c482 6340static void perf_swevent_read(struct perf_event *event)
15dbf27c 6341{
15dbf27c
PZ
6342}
6343
a4eaf7f1 6344static int perf_swevent_add(struct perf_event *event, int flags)
15dbf27c 6345{
4a32fea9 6346 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
cdd6c482 6347 struct hw_perf_event *hwc = &event->hw;
76e1d904
FW
6348 struct hlist_head *head;
6349
6c7e550f 6350 if (is_sampling_event(event)) {
7b4b6658 6351 hwc->last_period = hwc->sample_period;
cdd6c482 6352 perf_swevent_set_period(event);
7b4b6658 6353 }
76e1d904 6354
a4eaf7f1
PZ
6355 hwc->state = !(flags & PERF_EF_START);
6356
b28ab83c 6357 head = find_swevent_head(swhash, event);
39af6b16
JO
6358 if (!head) {
6359 /*
6360 * We can race with cpu hotplug code. Do not
6361 * WARN if the cpu just got unplugged.
6362 */
6363 WARN_ON_ONCE(swhash->online);
76e1d904 6364 return -EINVAL;
39af6b16 6365 }
76e1d904
FW
6366
6367 hlist_add_head_rcu(&event->hlist_entry, head);
6a694a60 6368 perf_event_update_userpage(event);
76e1d904 6369
15dbf27c
PZ
6370 return 0;
6371}
6372
a4eaf7f1 6373static void perf_swevent_del(struct perf_event *event, int flags)
15dbf27c 6374{
76e1d904 6375 hlist_del_rcu(&event->hlist_entry);
15dbf27c
PZ
6376}
6377
a4eaf7f1 6378static void perf_swevent_start(struct perf_event *event, int flags)
5c92d124 6379{
a4eaf7f1 6380 event->hw.state = 0;
d6d020e9 6381}
aa9c4c0f 6382
a4eaf7f1 6383static void perf_swevent_stop(struct perf_event *event, int flags)
d6d020e9 6384{
a4eaf7f1 6385 event->hw.state = PERF_HES_STOPPED;
bae43c99
IM
6386}
6387
49f135ed
FW
6388/* Deref the hlist from the update side */
6389static inline struct swevent_hlist *
b28ab83c 6390swevent_hlist_deref(struct swevent_htable *swhash)
49f135ed 6391{
b28ab83c
PZ
6392 return rcu_dereference_protected(swhash->swevent_hlist,
6393 lockdep_is_held(&swhash->hlist_mutex));
49f135ed
FW
6394}
6395
b28ab83c 6396static void swevent_hlist_release(struct swevent_htable *swhash)
76e1d904 6397{
b28ab83c 6398 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
76e1d904 6399
49f135ed 6400 if (!hlist)
76e1d904
FW
6401 return;
6402
70691d4a 6403 RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
fa4bbc4c 6404 kfree_rcu(hlist, rcu_head);
76e1d904
FW
6405}
6406
6407static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
6408{
b28ab83c 6409 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
76e1d904 6410
b28ab83c 6411 mutex_lock(&swhash->hlist_mutex);
76e1d904 6412
b28ab83c
PZ
6413 if (!--swhash->hlist_refcount)
6414 swevent_hlist_release(swhash);
76e1d904 6415
b28ab83c 6416 mutex_unlock(&swhash->hlist_mutex);
76e1d904
FW
6417}
6418
6419static void swevent_hlist_put(struct perf_event *event)
6420{
6421 int cpu;
6422
76e1d904
FW
6423 for_each_possible_cpu(cpu)
6424 swevent_hlist_put_cpu(event, cpu);
6425}
6426
6427static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
6428{
b28ab83c 6429 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
76e1d904
FW
6430 int err = 0;
6431
b28ab83c 6432 mutex_lock(&swhash->hlist_mutex);
76e1d904 6433
b28ab83c 6434 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
76e1d904
FW
6435 struct swevent_hlist *hlist;
6436
6437 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
6438 if (!hlist) {
6439 err = -ENOMEM;
6440 goto exit;
6441 }
b28ab83c 6442 rcu_assign_pointer(swhash->swevent_hlist, hlist);
76e1d904 6443 }
b28ab83c 6444 swhash->hlist_refcount++;
9ed6060d 6445exit:
b28ab83c 6446 mutex_unlock(&swhash->hlist_mutex);
76e1d904
FW
6447
6448 return err;
6449}
6450
6451static int swevent_hlist_get(struct perf_event *event)
6452{
6453 int err;
6454 int cpu, failed_cpu;
6455
76e1d904
FW
6456 get_online_cpus();
6457 for_each_possible_cpu(cpu) {
6458 err = swevent_hlist_get_cpu(event, cpu);
6459 if (err) {
6460 failed_cpu = cpu;
6461 goto fail;
6462 }
6463 }
6464 put_online_cpus();
6465
6466 return 0;
9ed6060d 6467fail:
76e1d904
FW
6468 for_each_possible_cpu(cpu) {
6469 if (cpu == failed_cpu)
6470 break;
6471 swevent_hlist_put_cpu(event, cpu);
6472 }
6473
6474 put_online_cpus();
6475 return err;
6476}
6477
c5905afb 6478struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
95476b64 6479
b0a873eb
PZ
6480static void sw_perf_event_destroy(struct perf_event *event)
6481{
6482 u64 event_id = event->attr.config;
95476b64 6483
b0a873eb
PZ
6484 WARN_ON(event->parent);
6485
c5905afb 6486 static_key_slow_dec(&perf_swevent_enabled[event_id]);
b0a873eb
PZ
6487 swevent_hlist_put(event);
6488}
6489
6490static int perf_swevent_init(struct perf_event *event)
6491{
8176cced 6492 u64 event_id = event->attr.config;
b0a873eb
PZ
6493
6494 if (event->attr.type != PERF_TYPE_SOFTWARE)
6495 return -ENOENT;
6496
2481c5fa
SE
6497 /*
6498 * no branch sampling for software events
6499 */
6500 if (has_branch_stack(event))
6501 return -EOPNOTSUPP;
6502
b0a873eb
PZ
6503 switch (event_id) {
6504 case PERF_COUNT_SW_CPU_CLOCK:
6505 case PERF_COUNT_SW_TASK_CLOCK:
6506 return -ENOENT;
6507
6508 default:
6509 break;
6510 }
6511
ce677831 6512 if (event_id >= PERF_COUNT_SW_MAX)
b0a873eb
PZ
6513 return -ENOENT;
6514
6515 if (!event->parent) {
6516 int err;
6517
6518 err = swevent_hlist_get(event);
6519 if (err)
6520 return err;
6521
c5905afb 6522 static_key_slow_inc(&perf_swevent_enabled[event_id]);
b0a873eb
PZ
6523 event->destroy = sw_perf_event_destroy;
6524 }
6525
6526 return 0;
6527}
6528
6529static struct pmu perf_swevent = {
89a1e187 6530 .task_ctx_nr = perf_sw_context,
95476b64 6531
34f43927
PZ
6532 .capabilities = PERF_PMU_CAP_NO_NMI,
6533
b0a873eb 6534 .event_init = perf_swevent_init,
a4eaf7f1
PZ
6535 .add = perf_swevent_add,
6536 .del = perf_swevent_del,
6537 .start = perf_swevent_start,
6538 .stop = perf_swevent_stop,
1c024eca 6539 .read = perf_swevent_read,
1c024eca
PZ
6540};
6541
b0a873eb
PZ
6542#ifdef CONFIG_EVENT_TRACING
6543
1c024eca
PZ
6544static int perf_tp_filter_match(struct perf_event *event,
6545 struct perf_sample_data *data)
6546{
6547 void *record = data->raw->data;
6548
6549 if (likely(!event->filter) || filter_match_preds(event->filter, record))
6550 return 1;
6551 return 0;
6552}
6553
6554static int perf_tp_event_match(struct perf_event *event,
6555 struct perf_sample_data *data,
6556 struct pt_regs *regs)
6557{
a0f7d0f7
FW
6558 if (event->hw.state & PERF_HES_STOPPED)
6559 return 0;
580d607c
PZ
6560 /*
6561 * All tracepoints are from kernel-space.
6562 */
6563 if (event->attr.exclude_kernel)
1c024eca
PZ
6564 return 0;
6565
6566 if (!perf_tp_filter_match(event, data))
6567 return 0;
6568
6569 return 1;
6570}
6571
6572void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
e6dab5ff
AV
6573 struct pt_regs *regs, struct hlist_head *head, int rctx,
6574 struct task_struct *task)
95476b64
FW
6575{
6576 struct perf_sample_data data;
1c024eca 6577 struct perf_event *event;
1c024eca 6578
95476b64
FW
6579 struct perf_raw_record raw = {
6580 .size = entry_size,
6581 .data = record,
6582 };
6583
fd0d000b 6584 perf_sample_data_init(&data, addr, 0);
95476b64
FW
6585 data.raw = &raw;
6586
b67bfe0d 6587 hlist_for_each_entry_rcu(event, head, hlist_entry) {
1c024eca 6588 if (perf_tp_event_match(event, &data, regs))
a8b0ca17 6589 perf_swevent_event(event, count, &data, regs);
4f41c013 6590 }
ecc55f84 6591
e6dab5ff
AV
6592 /*
6593 * If we got specified a target task, also iterate its context and
6594 * deliver this event there too.
6595 */
6596 if (task && task != current) {
6597 struct perf_event_context *ctx;
6598 struct trace_entry *entry = record;
6599
6600 rcu_read_lock();
6601 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
6602 if (!ctx)
6603 goto unlock;
6604
6605 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
6606 if (event->attr.type != PERF_TYPE_TRACEPOINT)
6607 continue;
6608 if (event->attr.config != entry->type)
6609 continue;
6610 if (perf_tp_event_match(event, &data, regs))
6611 perf_swevent_event(event, count, &data, regs);
6612 }
6613unlock:
6614 rcu_read_unlock();
6615 }
6616
ecc55f84 6617 perf_swevent_put_recursion_context(rctx);
95476b64
FW
6618}
6619EXPORT_SYMBOL_GPL(perf_tp_event);
6620
cdd6c482 6621static void tp_perf_event_destroy(struct perf_event *event)
e077df4f 6622{
1c024eca 6623 perf_trace_destroy(event);
e077df4f
PZ
6624}
6625
b0a873eb 6626static int perf_tp_event_init(struct perf_event *event)
e077df4f 6627{
76e1d904
FW
6628 int err;
6629
b0a873eb
PZ
6630 if (event->attr.type != PERF_TYPE_TRACEPOINT)
6631 return -ENOENT;
6632
2481c5fa
SE
6633 /*
6634 * no branch sampling for tracepoint events
6635 */
6636 if (has_branch_stack(event))
6637 return -EOPNOTSUPP;
6638
1c024eca
PZ
6639 err = perf_trace_init(event);
6640 if (err)
b0a873eb 6641 return err;
e077df4f 6642
cdd6c482 6643 event->destroy = tp_perf_event_destroy;
e077df4f 6644
b0a873eb
PZ
6645 return 0;
6646}
6647
6648static struct pmu perf_tracepoint = {
89a1e187
PZ
6649 .task_ctx_nr = perf_sw_context,
6650
b0a873eb 6651 .event_init = perf_tp_event_init,
a4eaf7f1
PZ
6652 .add = perf_trace_add,
6653 .del = perf_trace_del,
6654 .start = perf_swevent_start,
6655 .stop = perf_swevent_stop,
b0a873eb 6656 .read = perf_swevent_read,
b0a873eb
PZ
6657};
6658
6659static inline void perf_tp_register(void)
6660{
2e80a82a 6661 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
e077df4f 6662}
6fb2915d
LZ
6663
6664static int perf_event_set_filter(struct perf_event *event, void __user *arg)
6665{
6666 char *filter_str;
6667 int ret;
6668
6669 if (event->attr.type != PERF_TYPE_TRACEPOINT)
6670 return -EINVAL;
6671
6672 filter_str = strndup_user(arg, PAGE_SIZE);
6673 if (IS_ERR(filter_str))
6674 return PTR_ERR(filter_str);
6675
6676 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
6677
6678 kfree(filter_str);
6679 return ret;
6680}
6681
6682static void perf_event_free_filter(struct perf_event *event)
6683{
6684 ftrace_profile_free_filter(event);
6685}
6686
2541517c
AS
6687static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
6688{
6689 struct bpf_prog *prog;
6690
6691 if (event->attr.type != PERF_TYPE_TRACEPOINT)
6692 return -EINVAL;
6693
6694 if (event->tp_event->prog)
6695 return -EEXIST;
6696
6697 if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
6698 /* bpf programs can only be attached to kprobes */
6699 return -EINVAL;
6700
6701 prog = bpf_prog_get(prog_fd);
6702 if (IS_ERR(prog))
6703 return PTR_ERR(prog);
6704
6c373ca8 6705 if (prog->type != BPF_PROG_TYPE_KPROBE) {
2541517c
AS
6706 /* valid fd, but invalid bpf program type */
6707 bpf_prog_put(prog);
6708 return -EINVAL;
6709 }
6710
6711 event->tp_event->prog = prog;
6712
6713 return 0;
6714}
6715
6716static void perf_event_free_bpf_prog(struct perf_event *event)
6717{
6718 struct bpf_prog *prog;
6719
6720 if (!event->tp_event)
6721 return;
6722
6723 prog = event->tp_event->prog;
6724 if (prog) {
6725 event->tp_event->prog = NULL;
6726 bpf_prog_put(prog);
6727 }
6728}
6729
e077df4f 6730#else
6fb2915d 6731
b0a873eb 6732static inline void perf_tp_register(void)
e077df4f 6733{
e077df4f 6734}
6fb2915d
LZ
6735
6736static int perf_event_set_filter(struct perf_event *event, void __user *arg)
6737{
6738 return -ENOENT;
6739}
6740
6741static void perf_event_free_filter(struct perf_event *event)
6742{
6743}
6744
2541517c
AS
6745static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
6746{
6747 return -ENOENT;
6748}
6749
6750static void perf_event_free_bpf_prog(struct perf_event *event)
6751{
6752}
07b139c8 6753#endif /* CONFIG_EVENT_TRACING */
e077df4f 6754
24f1e32c 6755#ifdef CONFIG_HAVE_HW_BREAKPOINT
f5ffe02e 6756void perf_bp_event(struct perf_event *bp, void *data)
24f1e32c 6757{
f5ffe02e
FW
6758 struct perf_sample_data sample;
6759 struct pt_regs *regs = data;
6760
fd0d000b 6761 perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
f5ffe02e 6762
a4eaf7f1 6763 if (!bp->hw.state && !perf_exclude_event(bp, regs))
a8b0ca17 6764 perf_swevent_event(bp, 1, &sample, regs);
24f1e32c
FW
6765}
6766#endif
6767
b0a873eb
PZ
6768/*
6769 * hrtimer based swevent callback
6770 */
f29ac756 6771
b0a873eb 6772static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
f29ac756 6773{
b0a873eb
PZ
6774 enum hrtimer_restart ret = HRTIMER_RESTART;
6775 struct perf_sample_data data;
6776 struct pt_regs *regs;
6777 struct perf_event *event;
6778 u64 period;
f29ac756 6779
b0a873eb 6780 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
ba3dd36c
PZ
6781
6782 if (event->state != PERF_EVENT_STATE_ACTIVE)
6783 return HRTIMER_NORESTART;
6784
b0a873eb 6785 event->pmu->read(event);
f344011c 6786
fd0d000b 6787 perf_sample_data_init(&data, 0, event->hw.last_period);
b0a873eb
PZ
6788 regs = get_irq_regs();
6789
6790 if (regs && !perf_exclude_event(event, regs)) {
77aeeebd 6791 if (!(event->attr.exclude_idle && is_idle_task(current)))
33b07b8b 6792 if (__perf_event_overflow(event, 1, &data, regs))
b0a873eb
PZ
6793 ret = HRTIMER_NORESTART;
6794 }
24f1e32c 6795
b0a873eb
PZ
6796 period = max_t(u64, 10000, event->hw.sample_period);
6797 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
24f1e32c 6798
b0a873eb 6799 return ret;
f29ac756
PZ
6800}
6801
b0a873eb 6802static void perf_swevent_start_hrtimer(struct perf_event *event)
5c92d124 6803{
b0a873eb 6804 struct hw_perf_event *hwc = &event->hw;
5d508e82
FBH
6805 s64 period;
6806
6807 if (!is_sampling_event(event))
6808 return;
f5ffe02e 6809
5d508e82
FBH
6810 period = local64_read(&hwc->period_left);
6811 if (period) {
6812 if (period < 0)
6813 period = 10000;
fa407f35 6814
5d508e82
FBH
6815 local64_set(&hwc->period_left, 0);
6816 } else {
6817 period = max_t(u64, 10000, hwc->sample_period);
6818 }
3497d206
TG
6819 hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
6820 HRTIMER_MODE_REL_PINNED);
24f1e32c 6821}
b0a873eb
PZ
6822
6823static void perf_swevent_cancel_hrtimer(struct perf_event *event)
24f1e32c 6824{
b0a873eb
PZ
6825 struct hw_perf_event *hwc = &event->hw;
6826
6c7e550f 6827 if (is_sampling_event(event)) {
b0a873eb 6828 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
fa407f35 6829 local64_set(&hwc->period_left, ktime_to_ns(remaining));
b0a873eb
PZ
6830
6831 hrtimer_cancel(&hwc->hrtimer);
6832 }
24f1e32c
FW
6833}
6834
ba3dd36c
PZ
6835static void perf_swevent_init_hrtimer(struct perf_event *event)
6836{
6837 struct hw_perf_event *hwc = &event->hw;
6838
6839 if (!is_sampling_event(event))
6840 return;
6841
6842 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
6843 hwc->hrtimer.function = perf_swevent_hrtimer;
6844
6845 /*
6846 * Since hrtimers have a fixed rate, we can do a static freq->period
6847 * mapping and avoid the whole period adjust feedback stuff.
6848 */
6849 if (event->attr.freq) {
6850 long freq = event->attr.sample_freq;
6851
6852 event->attr.sample_period = NSEC_PER_SEC / freq;
6853 hwc->sample_period = event->attr.sample_period;
6854 local64_set(&hwc->period_left, hwc->sample_period);
778141e3 6855 hwc->last_period = hwc->sample_period;
ba3dd36c
PZ
6856 event->attr.freq = 0;
6857 }
6858}
6859
b0a873eb
PZ
6860/*
6861 * Software event: cpu wall time clock
6862 */
6863
6864static void cpu_clock_event_update(struct perf_event *event)
24f1e32c 6865{
b0a873eb
PZ
6866 s64 prev;
6867 u64 now;
6868
a4eaf7f1 6869 now = local_clock();
b0a873eb
PZ
6870 prev = local64_xchg(&event->hw.prev_count, now);
6871 local64_add(now - prev, &event->count);
24f1e32c 6872}
24f1e32c 6873
a4eaf7f1 6874static void cpu_clock_event_start(struct perf_event *event, int flags)
b0a873eb 6875{
a4eaf7f1 6876 local64_set(&event->hw.prev_count, local_clock());
b0a873eb 6877 perf_swevent_start_hrtimer(event);
b0a873eb
PZ
6878}
6879
a4eaf7f1 6880static void cpu_clock_event_stop(struct perf_event *event, int flags)
f29ac756 6881{
b0a873eb
PZ
6882 perf_swevent_cancel_hrtimer(event);
6883 cpu_clock_event_update(event);
6884}
f29ac756 6885
a4eaf7f1
PZ
6886static int cpu_clock_event_add(struct perf_event *event, int flags)
6887{
6888 if (flags & PERF_EF_START)
6889 cpu_clock_event_start(event, flags);
6a694a60 6890 perf_event_update_userpage(event);
a4eaf7f1
PZ
6891
6892 return 0;
6893}
6894
6895static void cpu_clock_event_del(struct perf_event *event, int flags)
6896{
6897 cpu_clock_event_stop(event, flags);
6898}
6899
b0a873eb
PZ
6900static void cpu_clock_event_read(struct perf_event *event)
6901{
6902 cpu_clock_event_update(event);
6903}
f344011c 6904
b0a873eb
PZ
6905static int cpu_clock_event_init(struct perf_event *event)
6906{
6907 if (event->attr.type != PERF_TYPE_SOFTWARE)
6908 return -ENOENT;
6909
6910 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
6911 return -ENOENT;
6912
2481c5fa
SE
6913 /*
6914 * no branch sampling for software events
6915 */
6916 if (has_branch_stack(event))
6917 return -EOPNOTSUPP;
6918
ba3dd36c
PZ
6919 perf_swevent_init_hrtimer(event);
6920
b0a873eb 6921 return 0;
f29ac756
PZ
6922}
6923
b0a873eb 6924static struct pmu perf_cpu_clock = {
89a1e187
PZ
6925 .task_ctx_nr = perf_sw_context,
6926
34f43927
PZ
6927 .capabilities = PERF_PMU_CAP_NO_NMI,
6928
b0a873eb 6929 .event_init = cpu_clock_event_init,
a4eaf7f1
PZ
6930 .add = cpu_clock_event_add,
6931 .del = cpu_clock_event_del,
6932 .start = cpu_clock_event_start,
6933 .stop = cpu_clock_event_stop,
b0a873eb
PZ
6934 .read = cpu_clock_event_read,
6935};
6936
6937/*
6938 * Software event: task time clock
6939 */
6940
6941static void task_clock_event_update(struct perf_event *event, u64 now)
5c92d124 6942{
b0a873eb
PZ
6943 u64 prev;
6944 s64 delta;
5c92d124 6945
b0a873eb
PZ
6946 prev = local64_xchg(&event->hw.prev_count, now);
6947 delta = now - prev;
6948 local64_add(delta, &event->count);
6949}
5c92d124 6950
a4eaf7f1 6951static void task_clock_event_start(struct perf_event *event, int flags)
b0a873eb 6952{
a4eaf7f1 6953 local64_set(&event->hw.prev_count, event->ctx->time);
b0a873eb 6954 perf_swevent_start_hrtimer(event);
b0a873eb
PZ
6955}
6956
a4eaf7f1 6957static void task_clock_event_stop(struct perf_event *event, int flags)
b0a873eb
PZ
6958{
6959 perf_swevent_cancel_hrtimer(event);
6960 task_clock_event_update(event, event->ctx->time);
a4eaf7f1
PZ
6961}
6962
6963static int task_clock_event_add(struct perf_event *event, int flags)
6964{
6965 if (flags & PERF_EF_START)
6966 task_clock_event_start(event, flags);
6a694a60 6967 perf_event_update_userpage(event);
b0a873eb 6968
a4eaf7f1
PZ
6969 return 0;
6970}
6971
6972static void task_clock_event_del(struct perf_event *event, int flags)
6973{
6974 task_clock_event_stop(event, PERF_EF_UPDATE);
b0a873eb
PZ
6975}
6976
6977static void task_clock_event_read(struct perf_event *event)
6978{
768a06e2
PZ
6979 u64 now = perf_clock();
6980 u64 delta = now - event->ctx->timestamp;
6981 u64 time = event->ctx->time + delta;
b0a873eb
PZ
6982
6983 task_clock_event_update(event, time);
6984}
6985
6986static int task_clock_event_init(struct perf_event *event)
6fb2915d 6987{
b0a873eb
PZ
6988 if (event->attr.type != PERF_TYPE_SOFTWARE)
6989 return -ENOENT;
6990
6991 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
6992 return -ENOENT;
6993
2481c5fa
SE
6994 /*
6995 * no branch sampling for software events
6996 */
6997 if (has_branch_stack(event))
6998 return -EOPNOTSUPP;
6999
ba3dd36c
PZ
7000 perf_swevent_init_hrtimer(event);
7001
b0a873eb 7002 return 0;
6fb2915d
LZ
7003}
7004
b0a873eb 7005static struct pmu perf_task_clock = {
89a1e187
PZ
7006 .task_ctx_nr = perf_sw_context,
7007
34f43927
PZ
7008 .capabilities = PERF_PMU_CAP_NO_NMI,
7009
b0a873eb 7010 .event_init = task_clock_event_init,
a4eaf7f1
PZ
7011 .add = task_clock_event_add,
7012 .del = task_clock_event_del,
7013 .start = task_clock_event_start,
7014 .stop = task_clock_event_stop,
b0a873eb
PZ
7015 .read = task_clock_event_read,
7016};
6fb2915d 7017
ad5133b7 7018static void perf_pmu_nop_void(struct pmu *pmu)
e077df4f 7019{
e077df4f 7020}
6fb2915d 7021
ad5133b7 7022static int perf_pmu_nop_int(struct pmu *pmu)
6fb2915d 7023{
ad5133b7 7024 return 0;
6fb2915d
LZ
7025}
7026
ad5133b7 7027static void perf_pmu_start_txn(struct pmu *pmu)
6fb2915d 7028{
ad5133b7 7029 perf_pmu_disable(pmu);
6fb2915d
LZ
7030}
7031
ad5133b7
PZ
7032static int perf_pmu_commit_txn(struct pmu *pmu)
7033{
7034 perf_pmu_enable(pmu);
7035 return 0;
7036}
e077df4f 7037
ad5133b7 7038static void perf_pmu_cancel_txn(struct pmu *pmu)
24f1e32c 7039{
ad5133b7 7040 perf_pmu_enable(pmu);
24f1e32c
FW
7041}
7042
35edc2a5
PZ
7043static int perf_event_idx_default(struct perf_event *event)
7044{
c719f560 7045 return 0;
35edc2a5
PZ
7046}
7047
8dc85d54
PZ
7048/*
7049 * Ensures all contexts with the same task_ctx_nr have the same
7050 * pmu_cpu_context too.
7051 */
9e317041 7052static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
24f1e32c 7053{
8dc85d54 7054 struct pmu *pmu;
b326e956 7055
8dc85d54
PZ
7056 if (ctxn < 0)
7057 return NULL;
24f1e32c 7058
8dc85d54
PZ
7059 list_for_each_entry(pmu, &pmus, entry) {
7060 if (pmu->task_ctx_nr == ctxn)
7061 return pmu->pmu_cpu_context;
7062 }
24f1e32c 7063
8dc85d54 7064 return NULL;
24f1e32c
FW
7065}
7066
51676957 7067static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
24f1e32c 7068{
51676957
PZ
7069 int cpu;
7070
7071 for_each_possible_cpu(cpu) {
7072 struct perf_cpu_context *cpuctx;
7073
7074 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
7075
3f1f3320
PZ
7076 if (cpuctx->unique_pmu == old_pmu)
7077 cpuctx->unique_pmu = pmu;
51676957
PZ
7078 }
7079}
7080
7081static void free_pmu_context(struct pmu *pmu)
7082{
7083 struct pmu *i;
f5ffe02e 7084
8dc85d54 7085 mutex_lock(&pmus_lock);
0475f9ea 7086 /*
8dc85d54 7087 * Like a real lame refcount.
0475f9ea 7088 */
51676957
PZ
7089 list_for_each_entry(i, &pmus, entry) {
7090 if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
7091 update_pmu_context(i, pmu);
8dc85d54 7092 goto out;
51676957 7093 }
8dc85d54 7094 }
d6d020e9 7095
51676957 7096 free_percpu(pmu->pmu_cpu_context);
8dc85d54
PZ
7097out:
7098 mutex_unlock(&pmus_lock);
24f1e32c 7099}
2e80a82a 7100static struct idr pmu_idr;
d6d020e9 7101
abe43400
PZ
7102static ssize_t
7103type_show(struct device *dev, struct device_attribute *attr, char *page)
7104{
7105 struct pmu *pmu = dev_get_drvdata(dev);
7106
7107 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
7108}
90826ca7 7109static DEVICE_ATTR_RO(type);
abe43400 7110
62b85639
SE
7111static ssize_t
7112perf_event_mux_interval_ms_show(struct device *dev,
7113 struct device_attribute *attr,
7114 char *page)
7115{
7116 struct pmu *pmu = dev_get_drvdata(dev);
7117
7118 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
7119}
7120
272325c4
PZ
7121static DEFINE_MUTEX(mux_interval_mutex);
7122
62b85639
SE
7123static ssize_t
7124perf_event_mux_interval_ms_store(struct device *dev,
7125 struct device_attribute *attr,
7126 const char *buf, size_t count)
7127{
7128 struct pmu *pmu = dev_get_drvdata(dev);
7129 int timer, cpu, ret;
7130
7131 ret = kstrtoint(buf, 0, &timer);
7132 if (ret)
7133 return ret;
7134
7135 if (timer < 1)
7136 return -EINVAL;
7137
7138 /* same value, noting to do */
7139 if (timer == pmu->hrtimer_interval_ms)
7140 return count;
7141
272325c4 7142 mutex_lock(&mux_interval_mutex);
62b85639
SE
7143 pmu->hrtimer_interval_ms = timer;
7144
7145 /* update all cpuctx for this PMU */
272325c4
PZ
7146 get_online_cpus();
7147 for_each_online_cpu(cpu) {
62b85639
SE
7148 struct perf_cpu_context *cpuctx;
7149 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
7150 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
7151
272325c4
PZ
7152 cpu_function_call(cpu,
7153 (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
62b85639 7154 }
272325c4
PZ
7155 put_online_cpus();
7156 mutex_unlock(&mux_interval_mutex);
62b85639
SE
7157
7158 return count;
7159}
90826ca7 7160static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
62b85639 7161
90826ca7
GKH
7162static struct attribute *pmu_dev_attrs[] = {
7163 &dev_attr_type.attr,
7164 &dev_attr_perf_event_mux_interval_ms.attr,
7165 NULL,
abe43400 7166};
90826ca7 7167ATTRIBUTE_GROUPS(pmu_dev);
abe43400
PZ
7168
7169static int pmu_bus_running;
7170static struct bus_type pmu_bus = {
7171 .name = "event_source",
90826ca7 7172 .dev_groups = pmu_dev_groups,
abe43400
PZ
7173};
7174
7175static void pmu_dev_release(struct device *dev)
7176{
7177 kfree(dev);
7178}
7179
7180static int pmu_dev_alloc(struct pmu *pmu)
7181{
7182 int ret = -ENOMEM;
7183
7184 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
7185 if (!pmu->dev)
7186 goto out;
7187
0c9d42ed 7188 pmu->dev->groups = pmu->attr_groups;
abe43400
PZ
7189 device_initialize(pmu->dev);
7190 ret = dev_set_name(pmu->dev, "%s", pmu->name);
7191 if (ret)
7192 goto free_dev;
7193
7194 dev_set_drvdata(pmu->dev, pmu);
7195 pmu->dev->bus = &pmu_bus;
7196 pmu->dev->release = pmu_dev_release;
7197 ret = device_add(pmu->dev);
7198 if (ret)
7199 goto free_dev;
7200
7201out:
7202 return ret;
7203
7204free_dev:
7205 put_device(pmu->dev);
7206 goto out;
7207}
7208
547e9fd7 7209static struct lock_class_key cpuctx_mutex;
facc4307 7210static struct lock_class_key cpuctx_lock;
547e9fd7 7211
03d8e80b 7212int perf_pmu_register(struct pmu *pmu, const char *name, int type)
24f1e32c 7213{
108b02cf 7214 int cpu, ret;
24f1e32c 7215
b0a873eb 7216 mutex_lock(&pmus_lock);
33696fc0
PZ
7217 ret = -ENOMEM;
7218 pmu->pmu_disable_count = alloc_percpu(int);
7219 if (!pmu->pmu_disable_count)
7220 goto unlock;
f29ac756 7221
2e80a82a
PZ
7222 pmu->type = -1;
7223 if (!name)
7224 goto skip_type;
7225 pmu->name = name;
7226
7227 if (type < 0) {
0e9c3be2
TH
7228 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
7229 if (type < 0) {
7230 ret = type;
2e80a82a
PZ
7231 goto free_pdc;
7232 }
7233 }
7234 pmu->type = type;
7235
abe43400
PZ
7236 if (pmu_bus_running) {
7237 ret = pmu_dev_alloc(pmu);
7238 if (ret)
7239 goto free_idr;
7240 }
7241
2e80a82a 7242skip_type:
8dc85d54
PZ
7243 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
7244 if (pmu->pmu_cpu_context)
7245 goto got_cpu_context;
f29ac756 7246
c4814202 7247 ret = -ENOMEM;
108b02cf
PZ
7248 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
7249 if (!pmu->pmu_cpu_context)
abe43400 7250 goto free_dev;
f344011c 7251
108b02cf
PZ
7252 for_each_possible_cpu(cpu) {
7253 struct perf_cpu_context *cpuctx;
7254
7255 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
eb184479 7256 __perf_event_init_context(&cpuctx->ctx);
547e9fd7 7257 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
facc4307 7258 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
108b02cf 7259 cpuctx->ctx.pmu = pmu;
9e630205 7260
272325c4 7261 __perf_mux_hrtimer_init(cpuctx, cpu);
9e630205 7262
3f1f3320 7263 cpuctx->unique_pmu = pmu;
108b02cf 7264 }
76e1d904 7265
8dc85d54 7266got_cpu_context:
ad5133b7
PZ
7267 if (!pmu->start_txn) {
7268 if (pmu->pmu_enable) {
7269 /*
7270 * If we have pmu_enable/pmu_disable calls, install
7271 * transaction stubs that use that to try and batch
7272 * hardware accesses.
7273 */
7274 pmu->start_txn = perf_pmu_start_txn;
7275 pmu->commit_txn = perf_pmu_commit_txn;
7276 pmu->cancel_txn = perf_pmu_cancel_txn;
7277 } else {
7278 pmu->start_txn = perf_pmu_nop_void;
7279 pmu->commit_txn = perf_pmu_nop_int;
7280 pmu->cancel_txn = perf_pmu_nop_void;
f344011c 7281 }
5c92d124 7282 }
15dbf27c 7283
ad5133b7
PZ
7284 if (!pmu->pmu_enable) {
7285 pmu->pmu_enable = perf_pmu_nop_void;
7286 pmu->pmu_disable = perf_pmu_nop_void;
7287 }
7288
35edc2a5
PZ
7289 if (!pmu->event_idx)
7290 pmu->event_idx = perf_event_idx_default;
7291
b0a873eb 7292 list_add_rcu(&pmu->entry, &pmus);
bed5b25a 7293 atomic_set(&pmu->exclusive_cnt, 0);
33696fc0
PZ
7294 ret = 0;
7295unlock:
b0a873eb
PZ
7296 mutex_unlock(&pmus_lock);
7297
33696fc0 7298 return ret;
108b02cf 7299
abe43400
PZ
7300free_dev:
7301 device_del(pmu->dev);
7302 put_device(pmu->dev);
7303
2e80a82a
PZ
7304free_idr:
7305 if (pmu->type >= PERF_TYPE_MAX)
7306 idr_remove(&pmu_idr, pmu->type);
7307
108b02cf
PZ
7308free_pdc:
7309 free_percpu(pmu->pmu_disable_count);
7310 goto unlock;
f29ac756 7311}
c464c76e 7312EXPORT_SYMBOL_GPL(perf_pmu_register);
f29ac756 7313
b0a873eb 7314void perf_pmu_unregister(struct pmu *pmu)
5c92d124 7315{
b0a873eb
PZ
7316 mutex_lock(&pmus_lock);
7317 list_del_rcu(&pmu->entry);
7318 mutex_unlock(&pmus_lock);
5c92d124 7319
0475f9ea 7320 /*
cde8e884
PZ
7321 * We dereference the pmu list under both SRCU and regular RCU, so
7322 * synchronize against both of those.
0475f9ea 7323 */
b0a873eb 7324 synchronize_srcu(&pmus_srcu);
cde8e884 7325 synchronize_rcu();
d6d020e9 7326
33696fc0 7327 free_percpu(pmu->pmu_disable_count);
2e80a82a
PZ
7328 if (pmu->type >= PERF_TYPE_MAX)
7329 idr_remove(&pmu_idr, pmu->type);
abe43400
PZ
7330 device_del(pmu->dev);
7331 put_device(pmu->dev);
51676957 7332 free_pmu_context(pmu);
b0a873eb 7333}
c464c76e 7334EXPORT_SYMBOL_GPL(perf_pmu_unregister);
d6d020e9 7335
cc34b98b
MR
7336static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
7337{
ccd41c86 7338 struct perf_event_context *ctx = NULL;
cc34b98b
MR
7339 int ret;
7340
7341 if (!try_module_get(pmu->module))
7342 return -ENODEV;
ccd41c86
PZ
7343
7344 if (event->group_leader != event) {
7345 ctx = perf_event_ctx_lock(event->group_leader);
7346 BUG_ON(!ctx);
7347 }
7348
cc34b98b
MR
7349 event->pmu = pmu;
7350 ret = pmu->event_init(event);
ccd41c86
PZ
7351
7352 if (ctx)
7353 perf_event_ctx_unlock(event->group_leader, ctx);
7354
cc34b98b
MR
7355 if (ret)
7356 module_put(pmu->module);
7357
7358 return ret;
7359}
7360
b0a873eb
PZ
7361struct pmu *perf_init_event(struct perf_event *event)
7362{
7363 struct pmu *pmu = NULL;
7364 int idx;
940c5b29 7365 int ret;
b0a873eb
PZ
7366
7367 idx = srcu_read_lock(&pmus_srcu);
2e80a82a
PZ
7368
7369 rcu_read_lock();
7370 pmu = idr_find(&pmu_idr, event->attr.type);
7371 rcu_read_unlock();
940c5b29 7372 if (pmu) {
cc34b98b 7373 ret = perf_try_init_event(pmu, event);
940c5b29
LM
7374 if (ret)
7375 pmu = ERR_PTR(ret);
2e80a82a 7376 goto unlock;
940c5b29 7377 }
2e80a82a 7378
b0a873eb 7379 list_for_each_entry_rcu(pmu, &pmus, entry) {
cc34b98b 7380 ret = perf_try_init_event(pmu, event);
b0a873eb 7381 if (!ret)
e5f4d339 7382 goto unlock;
76e1d904 7383
b0a873eb
PZ
7384 if (ret != -ENOENT) {
7385 pmu = ERR_PTR(ret);
e5f4d339 7386 goto unlock;
f344011c 7387 }
5c92d124 7388 }
e5f4d339
PZ
7389 pmu = ERR_PTR(-ENOENT);
7390unlock:
b0a873eb 7391 srcu_read_unlock(&pmus_srcu, idx);
15dbf27c 7392
4aeb0b42 7393 return pmu;
5c92d124
IM
7394}
7395
4beb31f3
FW
7396static void account_event_cpu(struct perf_event *event, int cpu)
7397{
7398 if (event->parent)
7399 return;
7400
4beb31f3
FW
7401 if (is_cgroup_event(event))
7402 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
7403}
7404
766d6c07
FW
7405static void account_event(struct perf_event *event)
7406{
4beb31f3
FW
7407 if (event->parent)
7408 return;
7409
766d6c07
FW
7410 if (event->attach_state & PERF_ATTACH_TASK)
7411 static_key_slow_inc(&perf_sched_events.key);
7412 if (event->attr.mmap || event->attr.mmap_data)
7413 atomic_inc(&nr_mmap_events);
7414 if (event->attr.comm)
7415 atomic_inc(&nr_comm_events);
7416 if (event->attr.task)
7417 atomic_inc(&nr_task_events);
948b26b6
FW
7418 if (event->attr.freq) {
7419 if (atomic_inc_return(&nr_freq_events) == 1)
7420 tick_nohz_full_kick_all();
7421 }
4beb31f3 7422 if (has_branch_stack(event))
766d6c07 7423 static_key_slow_inc(&perf_sched_events.key);
4beb31f3 7424 if (is_cgroup_event(event))
766d6c07 7425 static_key_slow_inc(&perf_sched_events.key);
4beb31f3
FW
7426
7427 account_event_cpu(event, event->cpu);
766d6c07
FW
7428}
7429
0793a61d 7430/*
cdd6c482 7431 * Allocate and initialize a event structure
0793a61d 7432 */
cdd6c482 7433static struct perf_event *
c3f00c70 7434perf_event_alloc(struct perf_event_attr *attr, int cpu,
d580ff86
PZ
7435 struct task_struct *task,
7436 struct perf_event *group_leader,
7437 struct perf_event *parent_event,
4dc0da86 7438 perf_overflow_handler_t overflow_handler,
79dff51e 7439 void *context, int cgroup_fd)
0793a61d 7440{
51b0fe39 7441 struct pmu *pmu;
cdd6c482
IM
7442 struct perf_event *event;
7443 struct hw_perf_event *hwc;
90983b16 7444 long err = -EINVAL;
0793a61d 7445
66832eb4
ON
7446 if ((unsigned)cpu >= nr_cpu_ids) {
7447 if (!task || cpu != -1)
7448 return ERR_PTR(-EINVAL);
7449 }
7450
c3f00c70 7451 event = kzalloc(sizeof(*event), GFP_KERNEL);
cdd6c482 7452 if (!event)
d5d2bc0d 7453 return ERR_PTR(-ENOMEM);
0793a61d 7454
04289bb9 7455 /*
cdd6c482 7456 * Single events are their own group leaders, with an
04289bb9
IM
7457 * empty sibling list:
7458 */
7459 if (!group_leader)
cdd6c482 7460 group_leader = event;
04289bb9 7461
cdd6c482
IM
7462 mutex_init(&event->child_mutex);
7463 INIT_LIST_HEAD(&event->child_list);
fccc714b 7464
cdd6c482
IM
7465 INIT_LIST_HEAD(&event->group_entry);
7466 INIT_LIST_HEAD(&event->event_entry);
7467 INIT_LIST_HEAD(&event->sibling_list);
10c6db11 7468 INIT_LIST_HEAD(&event->rb_entry);
71ad88ef 7469 INIT_LIST_HEAD(&event->active_entry);
f3ae75de
SE
7470 INIT_HLIST_NODE(&event->hlist_entry);
7471
10c6db11 7472
cdd6c482 7473 init_waitqueue_head(&event->waitq);
e360adbe 7474 init_irq_work(&event->pending, perf_pending_event);
0793a61d 7475
cdd6c482 7476 mutex_init(&event->mmap_mutex);
7b732a75 7477
a6fa941d 7478 atomic_long_set(&event->refcount, 1);
cdd6c482
IM
7479 event->cpu = cpu;
7480 event->attr = *attr;
7481 event->group_leader = group_leader;
7482 event->pmu = NULL;
cdd6c482 7483 event->oncpu = -1;
a96bbc16 7484
cdd6c482 7485 event->parent = parent_event;
b84fbc9f 7486
17cf22c3 7487 event->ns = get_pid_ns(task_active_pid_ns(current));
cdd6c482 7488 event->id = atomic64_inc_return(&perf_event_id);
a96bbc16 7489
cdd6c482 7490 event->state = PERF_EVENT_STATE_INACTIVE;
329d876d 7491
d580ff86
PZ
7492 if (task) {
7493 event->attach_state = PERF_ATTACH_TASK;
d580ff86 7494 /*
50f16a8b
PZ
7495 * XXX pmu::event_init needs to know what task to account to
7496 * and we cannot use the ctx information because we need the
7497 * pmu before we get a ctx.
d580ff86 7498 */
50f16a8b 7499 event->hw.target = task;
d580ff86
PZ
7500 }
7501
34f43927
PZ
7502 event->clock = &local_clock;
7503 if (parent_event)
7504 event->clock = parent_event->clock;
7505
4dc0da86 7506 if (!overflow_handler && parent_event) {
b326e956 7507 overflow_handler = parent_event->overflow_handler;
4dc0da86
AK
7508 context = parent_event->overflow_handler_context;
7509 }
66832eb4 7510
b326e956 7511 event->overflow_handler = overflow_handler;
4dc0da86 7512 event->overflow_handler_context = context;
97eaf530 7513
0231bb53 7514 perf_event__state_init(event);
a86ed508 7515
4aeb0b42 7516 pmu = NULL;
b8e83514 7517
cdd6c482 7518 hwc = &event->hw;
bd2b5b12 7519 hwc->sample_period = attr->sample_period;
0d48696f 7520 if (attr->freq && attr->sample_freq)
bd2b5b12 7521 hwc->sample_period = 1;
eced1dfc 7522 hwc->last_period = hwc->sample_period;
bd2b5b12 7523
e7850595 7524 local64_set(&hwc->period_left, hwc->sample_period);
60db5e09 7525
2023b359 7526 /*
cdd6c482 7527 * we currently do not support PERF_FORMAT_GROUP on inherited events
2023b359 7528 */
3dab77fb 7529 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
90983b16 7530 goto err_ns;
a46a2300
YZ
7531
7532 if (!has_branch_stack(event))
7533 event->attr.branch_sample_type = 0;
2023b359 7534
79dff51e
MF
7535 if (cgroup_fd != -1) {
7536 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
7537 if (err)
7538 goto err_ns;
7539 }
7540
b0a873eb 7541 pmu = perf_init_event(event);
4aeb0b42 7542 if (!pmu)
90983b16
FW
7543 goto err_ns;
7544 else if (IS_ERR(pmu)) {
4aeb0b42 7545 err = PTR_ERR(pmu);
90983b16 7546 goto err_ns;
621a01ea 7547 }
d5d2bc0d 7548
bed5b25a
AS
7549 err = exclusive_event_init(event);
7550 if (err)
7551 goto err_pmu;
7552
cdd6c482 7553 if (!event->parent) {
927c7a9e
FW
7554 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
7555 err = get_callchain_buffers();
90983b16 7556 if (err)
bed5b25a 7557 goto err_per_task;
d010b332 7558 }
f344011c 7559 }
9ee318a7 7560
cdd6c482 7561 return event;
90983b16 7562
bed5b25a
AS
7563err_per_task:
7564 exclusive_event_destroy(event);
7565
90983b16
FW
7566err_pmu:
7567 if (event->destroy)
7568 event->destroy(event);
c464c76e 7569 module_put(pmu->module);
90983b16 7570err_ns:
79dff51e
MF
7571 if (is_cgroup_event(event))
7572 perf_detach_cgroup(event);
90983b16
FW
7573 if (event->ns)
7574 put_pid_ns(event->ns);
7575 kfree(event);
7576
7577 return ERR_PTR(err);
0793a61d
TG
7578}
7579
cdd6c482
IM
7580static int perf_copy_attr(struct perf_event_attr __user *uattr,
7581 struct perf_event_attr *attr)
974802ea 7582{
974802ea 7583 u32 size;
cdf8073d 7584 int ret;
974802ea
PZ
7585
7586 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
7587 return -EFAULT;
7588
7589 /*
7590 * zero the full structure, so that a short copy will be nice.
7591 */
7592 memset(attr, 0, sizeof(*attr));
7593
7594 ret = get_user(size, &uattr->size);
7595 if (ret)
7596 return ret;
7597
7598 if (size > PAGE_SIZE) /* silly large */
7599 goto err_size;
7600
7601 if (!size) /* abi compat */
7602 size = PERF_ATTR_SIZE_VER0;
7603
7604 if (size < PERF_ATTR_SIZE_VER0)
7605 goto err_size;
7606
7607 /*
7608 * If we're handed a bigger struct than we know of,
cdf8073d
IS
7609 * ensure all the unknown bits are 0 - i.e. new
7610 * user-space does not rely on any kernel feature
7611 * extensions we dont know about yet.
974802ea
PZ
7612 */
7613 if (size > sizeof(*attr)) {
cdf8073d
IS
7614 unsigned char __user *addr;
7615 unsigned char __user *end;
7616 unsigned char val;
974802ea 7617
cdf8073d
IS
7618 addr = (void __user *)uattr + sizeof(*attr);
7619 end = (void __user *)uattr + size;
974802ea 7620
cdf8073d 7621 for (; addr < end; addr++) {
974802ea
PZ
7622 ret = get_user(val, addr);
7623 if (ret)
7624 return ret;
7625 if (val)
7626 goto err_size;
7627 }
b3e62e35 7628 size = sizeof(*attr);
974802ea
PZ
7629 }
7630
7631 ret = copy_from_user(attr, uattr, size);
7632 if (ret)
7633 return -EFAULT;
7634
cd757645 7635 if (attr->__reserved_1)
974802ea
PZ
7636 return -EINVAL;
7637
7638 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
7639 return -EINVAL;
7640
7641 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
7642 return -EINVAL;
7643
bce38cd5
SE
7644 if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
7645 u64 mask = attr->branch_sample_type;
7646
7647 /* only using defined bits */
7648 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
7649 return -EINVAL;
7650
7651 /* at least one branch bit must be set */
7652 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
7653 return -EINVAL;
7654
bce38cd5
SE
7655 /* propagate priv level, when not set for branch */
7656 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
7657
7658 /* exclude_kernel checked on syscall entry */
7659 if (!attr->exclude_kernel)
7660 mask |= PERF_SAMPLE_BRANCH_KERNEL;
7661
7662 if (!attr->exclude_user)
7663 mask |= PERF_SAMPLE_BRANCH_USER;
7664
7665 if (!attr->exclude_hv)
7666 mask |= PERF_SAMPLE_BRANCH_HV;
7667 /*
7668 * adjust user setting (for HW filter setup)
7669 */
7670 attr->branch_sample_type = mask;
7671 }
e712209a
SE
7672 /* privileged levels capture (kernel, hv): check permissions */
7673 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
2b923c8f
SE
7674 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
7675 return -EACCES;
bce38cd5 7676 }
4018994f 7677
c5ebcedb 7678 if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
4018994f 7679 ret = perf_reg_validate(attr->sample_regs_user);
c5ebcedb
JO
7680 if (ret)
7681 return ret;
7682 }
7683
7684 if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
7685 if (!arch_perf_have_user_stack_dump())
7686 return -ENOSYS;
7687
7688 /*
7689 * We have __u32 type for the size, but so far
7690 * we can only use __u16 as maximum due to the
7691 * __u16 sample size limit.
7692 */
7693 if (attr->sample_stack_user >= USHRT_MAX)
7694 ret = -EINVAL;
7695 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
7696 ret = -EINVAL;
7697 }
4018994f 7698
60e2364e
SE
7699 if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
7700 ret = perf_reg_validate(attr->sample_regs_intr);
974802ea
PZ
7701out:
7702 return ret;
7703
7704err_size:
7705 put_user(sizeof(*attr), &uattr->size);
7706 ret = -E2BIG;
7707 goto out;
7708}
7709
ac9721f3
PZ
7710static int
7711perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
a4be7c27 7712{
b69cf536 7713 struct ring_buffer *rb = NULL;
a4be7c27
PZ
7714 int ret = -EINVAL;
7715
ac9721f3 7716 if (!output_event)
a4be7c27
PZ
7717 goto set;
7718
ac9721f3
PZ
7719 /* don't allow circular references */
7720 if (event == output_event)
a4be7c27
PZ
7721 goto out;
7722
0f139300
PZ
7723 /*
7724 * Don't allow cross-cpu buffers
7725 */
7726 if (output_event->cpu != event->cpu)
7727 goto out;
7728
7729 /*
76369139 7730 * If its not a per-cpu rb, it must be the same task.
0f139300
PZ
7731 */
7732 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
7733 goto out;
7734
34f43927
PZ
7735 /*
7736 * Mixing clocks in the same buffer is trouble you don't need.
7737 */
7738 if (output_event->clock != event->clock)
7739 goto out;
7740
45bfb2e5
PZ
7741 /*
7742 * If both events generate aux data, they must be on the same PMU
7743 */
7744 if (has_aux(event) && has_aux(output_event) &&
7745 event->pmu != output_event->pmu)
7746 goto out;
7747
a4be7c27 7748set:
cdd6c482 7749 mutex_lock(&event->mmap_mutex);
ac9721f3
PZ
7750 /* Can't redirect output if we've got an active mmap() */
7751 if (atomic_read(&event->mmap_count))
7752 goto unlock;
a4be7c27 7753
ac9721f3 7754 if (output_event) {
76369139
FW
7755 /* get the rb we want to redirect to */
7756 rb = ring_buffer_get(output_event);
7757 if (!rb)
ac9721f3 7758 goto unlock;
a4be7c27
PZ
7759 }
7760
b69cf536 7761 ring_buffer_attach(event, rb);
9bb5d40c 7762
a4be7c27 7763 ret = 0;
ac9721f3
PZ
7764unlock:
7765 mutex_unlock(&event->mmap_mutex);
7766
a4be7c27 7767out:
a4be7c27
PZ
7768 return ret;
7769}
7770
f63a8daa
PZ
7771static void mutex_lock_double(struct mutex *a, struct mutex *b)
7772{
7773 if (b < a)
7774 swap(a, b);
7775
7776 mutex_lock(a);
7777 mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
7778}
7779
34f43927
PZ
7780static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
7781{
7782 bool nmi_safe = false;
7783
7784 switch (clk_id) {
7785 case CLOCK_MONOTONIC:
7786 event->clock = &ktime_get_mono_fast_ns;
7787 nmi_safe = true;
7788 break;
7789
7790 case CLOCK_MONOTONIC_RAW:
7791 event->clock = &ktime_get_raw_fast_ns;
7792 nmi_safe = true;
7793 break;
7794
7795 case CLOCK_REALTIME:
7796 event->clock = &ktime_get_real_ns;
7797 break;
7798
7799 case CLOCK_BOOTTIME:
7800 event->clock = &ktime_get_boot_ns;
7801 break;
7802
7803 case CLOCK_TAI:
7804 event->clock = &ktime_get_tai_ns;
7805 break;
7806
7807 default:
7808 return -EINVAL;
7809 }
7810
7811 if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
7812 return -EINVAL;
7813
7814 return 0;
7815}
7816
0793a61d 7817/**
cdd6c482 7818 * sys_perf_event_open - open a performance event, associate it to a task/cpu
9f66a381 7819 *
cdd6c482 7820 * @attr_uptr: event_id type attributes for monitoring/sampling
0793a61d 7821 * @pid: target pid
9f66a381 7822 * @cpu: target cpu
cdd6c482 7823 * @group_fd: group leader event fd
0793a61d 7824 */
cdd6c482
IM
7825SYSCALL_DEFINE5(perf_event_open,
7826 struct perf_event_attr __user *, attr_uptr,
2743a5b0 7827 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
0793a61d 7828{
b04243ef
PZ
7829 struct perf_event *group_leader = NULL, *output_event = NULL;
7830 struct perf_event *event, *sibling;
cdd6c482 7831 struct perf_event_attr attr;
f63a8daa 7832 struct perf_event_context *ctx, *uninitialized_var(gctx);
cdd6c482 7833 struct file *event_file = NULL;
2903ff01 7834 struct fd group = {NULL, 0};
38a81da2 7835 struct task_struct *task = NULL;
89a1e187 7836 struct pmu *pmu;
ea635c64 7837 int event_fd;
b04243ef 7838 int move_group = 0;
dc86cabe 7839 int err;
a21b0b35 7840 int f_flags = O_RDWR;
79dff51e 7841 int cgroup_fd = -1;
0793a61d 7842
2743a5b0 7843 /* for future expandability... */
e5d1367f 7844 if (flags & ~PERF_FLAG_ALL)
2743a5b0
PM
7845 return -EINVAL;
7846
dc86cabe
IM
7847 err = perf_copy_attr(attr_uptr, &attr);
7848 if (err)
7849 return err;
eab656ae 7850
0764771d
PZ
7851 if (!attr.exclude_kernel) {
7852 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
7853 return -EACCES;
7854 }
7855
df58ab24 7856 if (attr.freq) {
cdd6c482 7857 if (attr.sample_freq > sysctl_perf_event_sample_rate)
df58ab24 7858 return -EINVAL;
0819b2e3
PZ
7859 } else {
7860 if (attr.sample_period & (1ULL << 63))
7861 return -EINVAL;
df58ab24
PZ
7862 }
7863
e5d1367f
SE
7864 /*
7865 * In cgroup mode, the pid argument is used to pass the fd
7866 * opened to the cgroup directory in cgroupfs. The cpu argument
7867 * designates the cpu on which to monitor threads from that
7868 * cgroup.
7869 */
7870 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
7871 return -EINVAL;
7872
a21b0b35
YD
7873 if (flags & PERF_FLAG_FD_CLOEXEC)
7874 f_flags |= O_CLOEXEC;
7875
7876 event_fd = get_unused_fd_flags(f_flags);
ea635c64
AV
7877 if (event_fd < 0)
7878 return event_fd;
7879
ac9721f3 7880 if (group_fd != -1) {
2903ff01
AV
7881 err = perf_fget_light(group_fd, &group);
7882 if (err)
d14b12d7 7883 goto err_fd;
2903ff01 7884 group_leader = group.file->private_data;
ac9721f3
PZ
7885 if (flags & PERF_FLAG_FD_OUTPUT)
7886 output_event = group_leader;
7887 if (flags & PERF_FLAG_FD_NO_GROUP)
7888 group_leader = NULL;
7889 }
7890
e5d1367f 7891 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
c6be5a5c
PZ
7892 task = find_lively_task_by_vpid(pid);
7893 if (IS_ERR(task)) {
7894 err = PTR_ERR(task);
7895 goto err_group_fd;
7896 }
7897 }
7898
1f4ee503
PZ
7899 if (task && group_leader &&
7900 group_leader->attr.inherit != attr.inherit) {
7901 err = -EINVAL;
7902 goto err_task;
7903 }
7904
fbfc623f
YZ
7905 get_online_cpus();
7906
79dff51e
MF
7907 if (flags & PERF_FLAG_PID_CGROUP)
7908 cgroup_fd = pid;
7909
4dc0da86 7910 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
79dff51e 7911 NULL, NULL, cgroup_fd);
d14b12d7
SE
7912 if (IS_ERR(event)) {
7913 err = PTR_ERR(event);
1f4ee503 7914 goto err_cpus;
d14b12d7
SE
7915 }
7916
53b25335
VW
7917 if (is_sampling_event(event)) {
7918 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
7919 err = -ENOTSUPP;
7920 goto err_alloc;
7921 }
7922 }
7923
766d6c07
FW
7924 account_event(event);
7925
89a1e187
PZ
7926 /*
7927 * Special case software events and allow them to be part of
7928 * any hardware group.
7929 */
7930 pmu = event->pmu;
b04243ef 7931
34f43927
PZ
7932 if (attr.use_clockid) {
7933 err = perf_event_set_clock(event, attr.clockid);
7934 if (err)
7935 goto err_alloc;
7936 }
7937
b04243ef
PZ
7938 if (group_leader &&
7939 (is_software_event(event) != is_software_event(group_leader))) {
7940 if (is_software_event(event)) {
7941 /*
7942 * If event and group_leader are not both a software
7943 * event, and event is, then group leader is not.
7944 *
7945 * Allow the addition of software events to !software
7946 * groups, this is safe because software events never
7947 * fail to schedule.
7948 */
7949 pmu = group_leader->pmu;
7950 } else if (is_software_event(group_leader) &&
7951 (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
7952 /*
7953 * In case the group is a pure software group, and we
7954 * try to add a hardware event, move the whole group to
7955 * the hardware context.
7956 */
7957 move_group = 1;
7958 }
7959 }
89a1e187
PZ
7960
7961 /*
7962 * Get the target context (task or percpu):
7963 */
4af57ef2 7964 ctx = find_get_context(pmu, task, event);
89a1e187
PZ
7965 if (IS_ERR(ctx)) {
7966 err = PTR_ERR(ctx);
c6be5a5c 7967 goto err_alloc;
89a1e187
PZ
7968 }
7969
bed5b25a
AS
7970 if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
7971 err = -EBUSY;
7972 goto err_context;
7973 }
7974
fd1edb3a
PZ
7975 if (task) {
7976 put_task_struct(task);
7977 task = NULL;
7978 }
7979
ccff286d 7980 /*
cdd6c482 7981 * Look up the group leader (we will attach this event to it):
04289bb9 7982 */
ac9721f3 7983 if (group_leader) {
dc86cabe 7984 err = -EINVAL;
04289bb9 7985
04289bb9 7986 /*
ccff286d
IM
7987 * Do not allow a recursive hierarchy (this new sibling
7988 * becoming part of another group-sibling):
7989 */
7990 if (group_leader->group_leader != group_leader)
c3f00c70 7991 goto err_context;
34f43927
PZ
7992
7993 /* All events in a group should have the same clock */
7994 if (group_leader->clock != event->clock)
7995 goto err_context;
7996
ccff286d
IM
7997 /*
7998 * Do not allow to attach to a group in a different
7999 * task or CPU context:
04289bb9 8000 */
b04243ef 8001 if (move_group) {
c3c87e77
PZ
8002 /*
8003 * Make sure we're both on the same task, or both
8004 * per-cpu events.
8005 */
8006 if (group_leader->ctx->task != ctx->task)
8007 goto err_context;
8008
8009 /*
8010 * Make sure we're both events for the same CPU;
8011 * grouping events for different CPUs is broken; since
8012 * you can never concurrently schedule them anyhow.
8013 */
8014 if (group_leader->cpu != event->cpu)
b04243ef
PZ
8015 goto err_context;
8016 } else {
8017 if (group_leader->ctx != ctx)
8018 goto err_context;
8019 }
8020
3b6f9e5c
PM
8021 /*
8022 * Only a group leader can be exclusive or pinned
8023 */
0d48696f 8024 if (attr.exclusive || attr.pinned)
c3f00c70 8025 goto err_context;
ac9721f3
PZ
8026 }
8027
8028 if (output_event) {
8029 err = perf_event_set_output(event, output_event);
8030 if (err)
c3f00c70 8031 goto err_context;
ac9721f3 8032 }
0793a61d 8033
a21b0b35
YD
8034 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
8035 f_flags);
ea635c64
AV
8036 if (IS_ERR(event_file)) {
8037 err = PTR_ERR(event_file);
c3f00c70 8038 goto err_context;
ea635c64 8039 }
9b51f66d 8040
b04243ef 8041 if (move_group) {
f63a8daa
PZ
8042 gctx = group_leader->ctx;
8043
8044 /*
8045 * See perf_event_ctx_lock() for comments on the details
8046 * of swizzling perf_event::ctx.
8047 */
8048 mutex_lock_double(&gctx->mutex, &ctx->mutex);
b04243ef 8049
46ce0fe9 8050 perf_remove_from_context(group_leader, false);
0231bb53 8051
b04243ef
PZ
8052 list_for_each_entry(sibling, &group_leader->sibling_list,
8053 group_entry) {
46ce0fe9 8054 perf_remove_from_context(sibling, false);
b04243ef
PZ
8055 put_ctx(gctx);
8056 }
f63a8daa
PZ
8057 } else {
8058 mutex_lock(&ctx->mutex);
ea635c64 8059 }
9b51f66d 8060
ad3a37de 8061 WARN_ON_ONCE(ctx->parent_ctx);
b04243ef
PZ
8062
8063 if (move_group) {
f63a8daa
PZ
8064 /*
8065 * Wait for everybody to stop referencing the events through
8066 * the old lists, before installing it on new lists.
8067 */
0cda4c02 8068 synchronize_rcu();
f63a8daa 8069
8f95b435
PZI
8070 /*
8071 * Install the group siblings before the group leader.
8072 *
8073 * Because a group leader will try and install the entire group
8074 * (through the sibling list, which is still in-tact), we can
8075 * end up with siblings installed in the wrong context.
8076 *
8077 * By installing siblings first we NO-OP because they're not
8078 * reachable through the group lists.
8079 */
b04243ef
PZ
8080 list_for_each_entry(sibling, &group_leader->sibling_list,
8081 group_entry) {
8f95b435 8082 perf_event__state_init(sibling);
9fc81d87 8083 perf_install_in_context(ctx, sibling, sibling->cpu);
b04243ef
PZ
8084 get_ctx(ctx);
8085 }
8f95b435
PZI
8086
8087 /*
8088 * Removing from the context ends up with disabled
8089 * event. What we want here is event in the initial
8090 * startup state, ready to be add into new context.
8091 */
8092 perf_event__state_init(group_leader);
8093 perf_install_in_context(ctx, group_leader, group_leader->cpu);
8094 get_ctx(ctx);
b04243ef
PZ
8095 }
8096
bed5b25a
AS
8097 if (!exclusive_event_installable(event, ctx)) {
8098 err = -EBUSY;
8099 mutex_unlock(&ctx->mutex);
8100 fput(event_file);
8101 goto err_context;
8102 }
8103
e2d37cd2 8104 perf_install_in_context(ctx, event, event->cpu);
fe4b04fa 8105 perf_unpin_context(ctx);
f63a8daa
PZ
8106
8107 if (move_group) {
8108 mutex_unlock(&gctx->mutex);
8109 put_ctx(gctx);
8110 }
d859e29f 8111 mutex_unlock(&ctx->mutex);
9b51f66d 8112
fbfc623f
YZ
8113 put_online_cpus();
8114
cdd6c482 8115 event->owner = current;
8882135b 8116
cdd6c482
IM
8117 mutex_lock(&current->perf_event_mutex);
8118 list_add_tail(&event->owner_entry, &current->perf_event_list);
8119 mutex_unlock(&current->perf_event_mutex);
082ff5a2 8120
c320c7b7
ACM
8121 /*
8122 * Precalculate sample_data sizes
8123 */
8124 perf_event__header_size(event);
6844c09d 8125 perf_event__id_header_size(event);
c320c7b7 8126
8a49542c
PZ
8127 /*
8128 * Drop the reference on the group_event after placing the
8129 * new event on the sibling_list. This ensures destruction
8130 * of the group leader will find the pointer to itself in
8131 * perf_group_detach().
8132 */
2903ff01 8133 fdput(group);
ea635c64
AV
8134 fd_install(event_fd, event_file);
8135 return event_fd;
0793a61d 8136
c3f00c70 8137err_context:
fe4b04fa 8138 perf_unpin_context(ctx);
ea635c64 8139 put_ctx(ctx);
c6be5a5c 8140err_alloc:
ea635c64 8141 free_event(event);
1f4ee503 8142err_cpus:
fbfc623f 8143 put_online_cpus();
1f4ee503 8144err_task:
e7d0bc04
PZ
8145 if (task)
8146 put_task_struct(task);
89a1e187 8147err_group_fd:
2903ff01 8148 fdput(group);
ea635c64
AV
8149err_fd:
8150 put_unused_fd(event_fd);
dc86cabe 8151 return err;
0793a61d
TG
8152}
8153
fb0459d7
AV
8154/**
8155 * perf_event_create_kernel_counter
8156 *
8157 * @attr: attributes of the counter to create
8158 * @cpu: cpu in which the counter is bound
38a81da2 8159 * @task: task to profile (NULL for percpu)
fb0459d7
AV
8160 */
8161struct perf_event *
8162perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
38a81da2 8163 struct task_struct *task,
4dc0da86
AK
8164 perf_overflow_handler_t overflow_handler,
8165 void *context)
fb0459d7 8166{
fb0459d7 8167 struct perf_event_context *ctx;
c3f00c70 8168 struct perf_event *event;
fb0459d7 8169 int err;
d859e29f 8170
fb0459d7
AV
8171 /*
8172 * Get the target context (task or percpu):
8173 */
d859e29f 8174
4dc0da86 8175 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
79dff51e 8176 overflow_handler, context, -1);
c3f00c70
PZ
8177 if (IS_ERR(event)) {
8178 err = PTR_ERR(event);
8179 goto err;
8180 }
d859e29f 8181
f8697762
JO
8182 /* Mark owner so we could distinguish it from user events. */
8183 event->owner = EVENT_OWNER_KERNEL;
8184
766d6c07
FW
8185 account_event(event);
8186
4af57ef2 8187 ctx = find_get_context(event->pmu, task, event);
c6567f64
FW
8188 if (IS_ERR(ctx)) {
8189 err = PTR_ERR(ctx);
c3f00c70 8190 goto err_free;
d859e29f 8191 }
fb0459d7 8192
fb0459d7
AV
8193 WARN_ON_ONCE(ctx->parent_ctx);
8194 mutex_lock(&ctx->mutex);
bed5b25a
AS
8195 if (!exclusive_event_installable(event, ctx)) {
8196 mutex_unlock(&ctx->mutex);
8197 perf_unpin_context(ctx);
8198 put_ctx(ctx);
8199 err = -EBUSY;
8200 goto err_free;
8201 }
8202
fb0459d7 8203 perf_install_in_context(ctx, event, cpu);
fe4b04fa 8204 perf_unpin_context(ctx);
fb0459d7
AV
8205 mutex_unlock(&ctx->mutex);
8206
fb0459d7
AV
8207 return event;
8208
c3f00c70
PZ
8209err_free:
8210 free_event(event);
8211err:
c6567f64 8212 return ERR_PTR(err);
9b51f66d 8213}
fb0459d7 8214EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
9b51f66d 8215
0cda4c02
YZ
8216void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
8217{
8218 struct perf_event_context *src_ctx;
8219 struct perf_event_context *dst_ctx;
8220 struct perf_event *event, *tmp;
8221 LIST_HEAD(events);
8222
8223 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
8224 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
8225
f63a8daa
PZ
8226 /*
8227 * See perf_event_ctx_lock() for comments on the details
8228 * of swizzling perf_event::ctx.
8229 */
8230 mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
0cda4c02
YZ
8231 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
8232 event_entry) {
46ce0fe9 8233 perf_remove_from_context(event, false);
9a545de0 8234 unaccount_event_cpu(event, src_cpu);
0cda4c02 8235 put_ctx(src_ctx);
9886167d 8236 list_add(&event->migrate_entry, &events);
0cda4c02 8237 }
0cda4c02 8238
8f95b435
PZI
8239 /*
8240 * Wait for the events to quiesce before re-instating them.
8241 */
0cda4c02
YZ
8242 synchronize_rcu();
8243
8f95b435
PZI
8244 /*
8245 * Re-instate events in 2 passes.
8246 *
8247 * Skip over group leaders and only install siblings on this first
8248 * pass, siblings will not get enabled without a leader, however a
8249 * leader will enable its siblings, even if those are still on the old
8250 * context.
8251 */
8252 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
8253 if (event->group_leader == event)
8254 continue;
8255
8256 list_del(&event->migrate_entry);
8257 if (event->state >= PERF_EVENT_STATE_OFF)
8258 event->state = PERF_EVENT_STATE_INACTIVE;
8259 account_event_cpu(event, dst_cpu);
8260 perf_install_in_context(dst_ctx, event, dst_cpu);
8261 get_ctx(dst_ctx);
8262 }
8263
8264 /*
8265 * Once all the siblings are setup properly, install the group leaders
8266 * to make it go.
8267 */
9886167d
PZ
8268 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
8269 list_del(&event->migrate_entry);
0cda4c02
YZ
8270 if (event->state >= PERF_EVENT_STATE_OFF)
8271 event->state = PERF_EVENT_STATE_INACTIVE;
9a545de0 8272 account_event_cpu(event, dst_cpu);
0cda4c02
YZ
8273 perf_install_in_context(dst_ctx, event, dst_cpu);
8274 get_ctx(dst_ctx);
8275 }
8276 mutex_unlock(&dst_ctx->mutex);
f63a8daa 8277 mutex_unlock(&src_ctx->mutex);
0cda4c02
YZ
8278}
8279EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
8280
cdd6c482 8281static void sync_child_event(struct perf_event *child_event,
38b200d6 8282 struct task_struct *child)
d859e29f 8283{
cdd6c482 8284 struct perf_event *parent_event = child_event->parent;
8bc20959 8285 u64 child_val;
d859e29f 8286
cdd6c482
IM
8287 if (child_event->attr.inherit_stat)
8288 perf_event_read_event(child_event, child);
38b200d6 8289
b5e58793 8290 child_val = perf_event_count(child_event);
d859e29f
PM
8291
8292 /*
8293 * Add back the child's count to the parent's count:
8294 */
a6e6dea6 8295 atomic64_add(child_val, &parent_event->child_count);
cdd6c482
IM
8296 atomic64_add(child_event->total_time_enabled,
8297 &parent_event->child_total_time_enabled);
8298 atomic64_add(child_event->total_time_running,
8299 &parent_event->child_total_time_running);
d859e29f
PM
8300
8301 /*
cdd6c482 8302 * Remove this event from the parent's list
d859e29f 8303 */
cdd6c482
IM
8304 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
8305 mutex_lock(&parent_event->child_mutex);
8306 list_del_init(&child_event->child_list);
8307 mutex_unlock(&parent_event->child_mutex);
d859e29f 8308
dc633982
JO
8309 /*
8310 * Make sure user/parent get notified, that we just
8311 * lost one event.
8312 */
8313 perf_event_wakeup(parent_event);
8314
d859e29f 8315 /*
cdd6c482 8316 * Release the parent event, if this was the last
d859e29f
PM
8317 * reference to it.
8318 */
a6fa941d 8319 put_event(parent_event);
d859e29f
PM
8320}
8321
9b51f66d 8322static void
cdd6c482
IM
8323__perf_event_exit_task(struct perf_event *child_event,
8324 struct perf_event_context *child_ctx,
38b200d6 8325 struct task_struct *child)
9b51f66d 8326{
1903d50c
PZ
8327 /*
8328 * Do not destroy the 'original' grouping; because of the context
8329 * switch optimization the original events could've ended up in a
8330 * random child task.
8331 *
8332 * If we were to destroy the original group, all group related
8333 * operations would cease to function properly after this random
8334 * child dies.
8335 *
8336 * Do destroy all inherited groups, we don't care about those
8337 * and being thorough is better.
8338 */
8339 perf_remove_from_context(child_event, !!child_event->parent);
0cc0c027 8340
9b51f66d 8341 /*
38b435b1 8342 * It can happen that the parent exits first, and has events
9b51f66d 8343 * that are still around due to the child reference. These
38b435b1 8344 * events need to be zapped.
9b51f66d 8345 */
38b435b1 8346 if (child_event->parent) {
cdd6c482
IM
8347 sync_child_event(child_event, child);
8348 free_event(child_event);
179033b3
JO
8349 } else {
8350 child_event->state = PERF_EVENT_STATE_EXIT;
8351 perf_event_wakeup(child_event);
4bcf349a 8352 }
9b51f66d
IM
8353}
8354
8dc85d54 8355static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
9b51f66d 8356{
ebf905fc 8357 struct perf_event *child_event, *next;
211de6eb 8358 struct perf_event_context *child_ctx, *clone_ctx = NULL;
a63eaf34 8359 unsigned long flags;
9b51f66d 8360
8dc85d54 8361 if (likely(!child->perf_event_ctxp[ctxn])) {
cdd6c482 8362 perf_event_task(child, NULL, 0);
9b51f66d 8363 return;
9f498cc5 8364 }
9b51f66d 8365
a63eaf34 8366 local_irq_save(flags);
ad3a37de
PM
8367 /*
8368 * We can't reschedule here because interrupts are disabled,
8369 * and either child is current or it is a task that can't be
8370 * scheduled, so we are now safe from rescheduling changing
8371 * our context.
8372 */
806839b2 8373 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
c93f7669
PM
8374
8375 /*
8376 * Take the context lock here so that if find_get_context is
cdd6c482 8377 * reading child->perf_event_ctxp, we wait until it has
c93f7669
PM
8378 * incremented the context's refcount before we do put_ctx below.
8379 */
e625cce1 8380 raw_spin_lock(&child_ctx->lock);
04dc2dbb 8381 task_ctx_sched_out(child_ctx);
8dc85d54 8382 child->perf_event_ctxp[ctxn] = NULL;
4a1c0f26 8383
71a851b4
PZ
8384 /*
8385 * If this context is a clone; unclone it so it can't get
8386 * swapped to another process while we're removing all
cdd6c482 8387 * the events from it.
71a851b4 8388 */
211de6eb 8389 clone_ctx = unclone_ctx(child_ctx);
5e942bb3 8390 update_context_time(child_ctx);
e625cce1 8391 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
9f498cc5 8392
211de6eb
PZ
8393 if (clone_ctx)
8394 put_ctx(clone_ctx);
4a1c0f26 8395
9f498cc5 8396 /*
cdd6c482
IM
8397 * Report the task dead after unscheduling the events so that we
8398 * won't get any samples after PERF_RECORD_EXIT. We can however still
8399 * get a few PERF_RECORD_READ events.
9f498cc5 8400 */
cdd6c482 8401 perf_event_task(child, child_ctx, 0);
a63eaf34 8402
66fff224
PZ
8403 /*
8404 * We can recurse on the same lock type through:
8405 *
cdd6c482
IM
8406 * __perf_event_exit_task()
8407 * sync_child_event()
a6fa941d
AV
8408 * put_event()
8409 * mutex_lock(&ctx->mutex)
66fff224
PZ
8410 *
8411 * But since its the parent context it won't be the same instance.
8412 */
a0507c84 8413 mutex_lock(&child_ctx->mutex);
a63eaf34 8414
ebf905fc 8415 list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
cdd6c482 8416 __perf_event_exit_task(child_event, child_ctx, child);
8bc20959 8417
a63eaf34
PM
8418 mutex_unlock(&child_ctx->mutex);
8419
8420 put_ctx(child_ctx);
9b51f66d
IM
8421}
8422
8dc85d54
PZ
8423/*
8424 * When a child task exits, feed back event values to parent events.
8425 */
8426void perf_event_exit_task(struct task_struct *child)
8427{
8882135b 8428 struct perf_event *event, *tmp;
8dc85d54
PZ
8429 int ctxn;
8430
8882135b
PZ
8431 mutex_lock(&child->perf_event_mutex);
8432 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
8433 owner_entry) {
8434 list_del_init(&event->owner_entry);
8435
8436 /*
8437 * Ensure the list deletion is visible before we clear
8438 * the owner, closes a race against perf_release() where
8439 * we need to serialize on the owner->perf_event_mutex.
8440 */
8441 smp_wmb();
8442 event->owner = NULL;
8443 }
8444 mutex_unlock(&child->perf_event_mutex);
8445
8dc85d54
PZ
8446 for_each_task_context_nr(ctxn)
8447 perf_event_exit_task_context(child, ctxn);
8448}
8449
889ff015
FW
8450static void perf_free_event(struct perf_event *event,
8451 struct perf_event_context *ctx)
8452{
8453 struct perf_event *parent = event->parent;
8454
8455 if (WARN_ON_ONCE(!parent))
8456 return;
8457
8458 mutex_lock(&parent->child_mutex);
8459 list_del_init(&event->child_list);
8460 mutex_unlock(&parent->child_mutex);
8461
a6fa941d 8462 put_event(parent);
889ff015 8463
652884fe 8464 raw_spin_lock_irq(&ctx->lock);
8a49542c 8465 perf_group_detach(event);
889ff015 8466 list_del_event(event, ctx);
652884fe 8467 raw_spin_unlock_irq(&ctx->lock);
889ff015
FW
8468 free_event(event);
8469}
8470
bbbee908 8471/*
652884fe 8472 * Free an unexposed, unused context as created by inheritance by
8dc85d54 8473 * perf_event_init_task below, used by fork() in case of fail.
652884fe
PZ
8474 *
8475 * Not all locks are strictly required, but take them anyway to be nice and
8476 * help out with the lockdep assertions.
bbbee908 8477 */
cdd6c482 8478void perf_event_free_task(struct task_struct *task)
bbbee908 8479{
8dc85d54 8480 struct perf_event_context *ctx;
cdd6c482 8481 struct perf_event *event, *tmp;
8dc85d54 8482 int ctxn;
bbbee908 8483
8dc85d54
PZ
8484 for_each_task_context_nr(ctxn) {
8485 ctx = task->perf_event_ctxp[ctxn];
8486 if (!ctx)
8487 continue;
bbbee908 8488
8dc85d54 8489 mutex_lock(&ctx->mutex);
bbbee908 8490again:
8dc85d54
PZ
8491 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
8492 group_entry)
8493 perf_free_event(event, ctx);
bbbee908 8494
8dc85d54
PZ
8495 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
8496 group_entry)
8497 perf_free_event(event, ctx);
bbbee908 8498
8dc85d54
PZ
8499 if (!list_empty(&ctx->pinned_groups) ||
8500 !list_empty(&ctx->flexible_groups))
8501 goto again;
bbbee908 8502
8dc85d54 8503 mutex_unlock(&ctx->mutex);
bbbee908 8504
8dc85d54
PZ
8505 put_ctx(ctx);
8506 }
889ff015
FW
8507}
8508
4e231c79
PZ
8509void perf_event_delayed_put(struct task_struct *task)
8510{
8511 int ctxn;
8512
8513 for_each_task_context_nr(ctxn)
8514 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
8515}
8516
97dee4f3
PZ
8517/*
8518 * inherit a event from parent task to child task:
8519 */
8520static struct perf_event *
8521inherit_event(struct perf_event *parent_event,
8522 struct task_struct *parent,
8523 struct perf_event_context *parent_ctx,
8524 struct task_struct *child,
8525 struct perf_event *group_leader,
8526 struct perf_event_context *child_ctx)
8527{
1929def9 8528 enum perf_event_active_state parent_state = parent_event->state;
97dee4f3 8529 struct perf_event *child_event;
cee010ec 8530 unsigned long flags;
97dee4f3
PZ
8531
8532 /*
8533 * Instead of creating recursive hierarchies of events,
8534 * we link inherited events back to the original parent,
8535 * which has a filp for sure, which we use as the reference
8536 * count:
8537 */
8538 if (parent_event->parent)
8539 parent_event = parent_event->parent;
8540
8541 child_event = perf_event_alloc(&parent_event->attr,
8542 parent_event->cpu,
d580ff86 8543 child,
97dee4f3 8544 group_leader, parent_event,
79dff51e 8545 NULL, NULL, -1);
97dee4f3
PZ
8546 if (IS_ERR(child_event))
8547 return child_event;
a6fa941d 8548
fadfe7be
JO
8549 if (is_orphaned_event(parent_event) ||
8550 !atomic_long_inc_not_zero(&parent_event->refcount)) {
a6fa941d
AV
8551 free_event(child_event);
8552 return NULL;
8553 }
8554
97dee4f3
PZ
8555 get_ctx(child_ctx);
8556
8557 /*
8558 * Make the child state follow the state of the parent event,
8559 * not its attr.disabled bit. We hold the parent's mutex,
8560 * so we won't race with perf_event_{en, dis}able_family.
8561 */
1929def9 8562 if (parent_state >= PERF_EVENT_STATE_INACTIVE)
97dee4f3
PZ
8563 child_event->state = PERF_EVENT_STATE_INACTIVE;
8564 else
8565 child_event->state = PERF_EVENT_STATE_OFF;
8566
8567 if (parent_event->attr.freq) {
8568 u64 sample_period = parent_event->hw.sample_period;
8569 struct hw_perf_event *hwc = &child_event->hw;
8570
8571 hwc->sample_period = sample_period;
8572 hwc->last_period = sample_period;
8573
8574 local64_set(&hwc->period_left, sample_period);
8575 }
8576
8577 child_event->ctx = child_ctx;
8578 child_event->overflow_handler = parent_event->overflow_handler;
4dc0da86
AK
8579 child_event->overflow_handler_context
8580 = parent_event->overflow_handler_context;
97dee4f3 8581
614b6780
TG
8582 /*
8583 * Precalculate sample_data sizes
8584 */
8585 perf_event__header_size(child_event);
6844c09d 8586 perf_event__id_header_size(child_event);
614b6780 8587
97dee4f3
PZ
8588 /*
8589 * Link it up in the child's context:
8590 */
cee010ec 8591 raw_spin_lock_irqsave(&child_ctx->lock, flags);
97dee4f3 8592 add_event_to_ctx(child_event, child_ctx);
cee010ec 8593 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
97dee4f3 8594
97dee4f3
PZ
8595 /*
8596 * Link this into the parent event's child list
8597 */
8598 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
8599 mutex_lock(&parent_event->child_mutex);
8600 list_add_tail(&child_event->child_list, &parent_event->child_list);
8601 mutex_unlock(&parent_event->child_mutex);
8602
8603 return child_event;
8604}
8605
8606static int inherit_group(struct perf_event *parent_event,
8607 struct task_struct *parent,
8608 struct perf_event_context *parent_ctx,
8609 struct task_struct *child,
8610 struct perf_event_context *child_ctx)
8611{
8612 struct perf_event *leader;
8613 struct perf_event *sub;
8614 struct perf_event *child_ctr;
8615
8616 leader = inherit_event(parent_event, parent, parent_ctx,
8617 child, NULL, child_ctx);
8618 if (IS_ERR(leader))
8619 return PTR_ERR(leader);
8620 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
8621 child_ctr = inherit_event(sub, parent, parent_ctx,
8622 child, leader, child_ctx);
8623 if (IS_ERR(child_ctr))
8624 return PTR_ERR(child_ctr);
8625 }
8626 return 0;
889ff015
FW
8627}
8628
8629static int
8630inherit_task_group(struct perf_event *event, struct task_struct *parent,
8631 struct perf_event_context *parent_ctx,
8dc85d54 8632 struct task_struct *child, int ctxn,
889ff015
FW
8633 int *inherited_all)
8634{
8635 int ret;
8dc85d54 8636 struct perf_event_context *child_ctx;
889ff015
FW
8637
8638 if (!event->attr.inherit) {
8639 *inherited_all = 0;
8640 return 0;
bbbee908
PZ
8641 }
8642
fe4b04fa 8643 child_ctx = child->perf_event_ctxp[ctxn];
889ff015
FW
8644 if (!child_ctx) {
8645 /*
8646 * This is executed from the parent task context, so
8647 * inherit events that have been marked for cloning.
8648 * First allocate and initialize a context for the
8649 * child.
8650 */
bbbee908 8651
734df5ab 8652 child_ctx = alloc_perf_context(parent_ctx->pmu, child);
889ff015
FW
8653 if (!child_ctx)
8654 return -ENOMEM;
bbbee908 8655
8dc85d54 8656 child->perf_event_ctxp[ctxn] = child_ctx;
889ff015
FW
8657 }
8658
8659 ret = inherit_group(event, parent, parent_ctx,
8660 child, child_ctx);
8661
8662 if (ret)
8663 *inherited_all = 0;
8664
8665 return ret;
bbbee908
PZ
8666}
8667
9b51f66d 8668/*
cdd6c482 8669 * Initialize the perf_event context in task_struct
9b51f66d 8670 */
985c8dcb 8671static int perf_event_init_context(struct task_struct *child, int ctxn)
9b51f66d 8672{
889ff015 8673 struct perf_event_context *child_ctx, *parent_ctx;
cdd6c482
IM
8674 struct perf_event_context *cloned_ctx;
8675 struct perf_event *event;
9b51f66d 8676 struct task_struct *parent = current;
564c2b21 8677 int inherited_all = 1;
dddd3379 8678 unsigned long flags;
6ab423e0 8679 int ret = 0;
9b51f66d 8680
8dc85d54 8681 if (likely(!parent->perf_event_ctxp[ctxn]))
6ab423e0
PZ
8682 return 0;
8683
ad3a37de 8684 /*
25346b93
PM
8685 * If the parent's context is a clone, pin it so it won't get
8686 * swapped under us.
ad3a37de 8687 */
8dc85d54 8688 parent_ctx = perf_pin_task_context(parent, ctxn);
ffb4ef21
PZ
8689 if (!parent_ctx)
8690 return 0;
25346b93 8691
ad3a37de
PM
8692 /*
8693 * No need to check if parent_ctx != NULL here; since we saw
8694 * it non-NULL earlier, the only reason for it to become NULL
8695 * is if we exit, and since we're currently in the middle of
8696 * a fork we can't be exiting at the same time.
8697 */
ad3a37de 8698
9b51f66d
IM
8699 /*
8700 * Lock the parent list. No need to lock the child - not PID
8701 * hashed yet and not running, so nobody can access it.
8702 */
d859e29f 8703 mutex_lock(&parent_ctx->mutex);
9b51f66d
IM
8704
8705 /*
8706 * We dont have to disable NMIs - we are only looking at
8707 * the list, not manipulating it:
8708 */
889ff015 8709 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
8dc85d54
PZ
8710 ret = inherit_task_group(event, parent, parent_ctx,
8711 child, ctxn, &inherited_all);
889ff015
FW
8712 if (ret)
8713 break;
8714 }
b93f7978 8715
dddd3379
TG
8716 /*
8717 * We can't hold ctx->lock when iterating the ->flexible_group list due
8718 * to allocations, but we need to prevent rotation because
8719 * rotate_ctx() will change the list from interrupt context.
8720 */
8721 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
8722 parent_ctx->rotate_disable = 1;
8723 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
8724
889ff015 8725 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
8dc85d54
PZ
8726 ret = inherit_task_group(event, parent, parent_ctx,
8727 child, ctxn, &inherited_all);
889ff015 8728 if (ret)
9b51f66d 8729 break;
564c2b21
PM
8730 }
8731
dddd3379
TG
8732 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
8733 parent_ctx->rotate_disable = 0;
dddd3379 8734
8dc85d54 8735 child_ctx = child->perf_event_ctxp[ctxn];
889ff015 8736
05cbaa28 8737 if (child_ctx && inherited_all) {
564c2b21
PM
8738 /*
8739 * Mark the child context as a clone of the parent
8740 * context, or of whatever the parent is a clone of.
c5ed5145
PZ
8741 *
8742 * Note that if the parent is a clone, the holding of
8743 * parent_ctx->lock avoids it from being uncloned.
564c2b21 8744 */
c5ed5145 8745 cloned_ctx = parent_ctx->parent_ctx;
ad3a37de
PM
8746 if (cloned_ctx) {
8747 child_ctx->parent_ctx = cloned_ctx;
25346b93 8748 child_ctx->parent_gen = parent_ctx->parent_gen;
564c2b21
PM
8749 } else {
8750 child_ctx->parent_ctx = parent_ctx;
8751 child_ctx->parent_gen = parent_ctx->generation;
8752 }
8753 get_ctx(child_ctx->parent_ctx);
9b51f66d
IM
8754 }
8755
c5ed5145 8756 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
d859e29f 8757 mutex_unlock(&parent_ctx->mutex);
6ab423e0 8758
25346b93 8759 perf_unpin_context(parent_ctx);
fe4b04fa 8760 put_ctx(parent_ctx);
ad3a37de 8761
6ab423e0 8762 return ret;
9b51f66d
IM
8763}
8764
8dc85d54
PZ
8765/*
8766 * Initialize the perf_event context in task_struct
8767 */
8768int perf_event_init_task(struct task_struct *child)
8769{
8770 int ctxn, ret;
8771
8550d7cb
ON
8772 memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
8773 mutex_init(&child->perf_event_mutex);
8774 INIT_LIST_HEAD(&child->perf_event_list);
8775
8dc85d54
PZ
8776 for_each_task_context_nr(ctxn) {
8777 ret = perf_event_init_context(child, ctxn);
6c72e350
PZ
8778 if (ret) {
8779 perf_event_free_task(child);
8dc85d54 8780 return ret;
6c72e350 8781 }
8dc85d54
PZ
8782 }
8783
8784 return 0;
8785}
8786
220b140b
PM
8787static void __init perf_event_init_all_cpus(void)
8788{
b28ab83c 8789 struct swevent_htable *swhash;
220b140b 8790 int cpu;
220b140b
PM
8791
8792 for_each_possible_cpu(cpu) {
b28ab83c
PZ
8793 swhash = &per_cpu(swevent_htable, cpu);
8794 mutex_init(&swhash->hlist_mutex);
2fde4f94 8795 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
220b140b
PM
8796 }
8797}
8798
0db0628d 8799static void perf_event_init_cpu(int cpu)
0793a61d 8800{
108b02cf 8801 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
0793a61d 8802
b28ab83c 8803 mutex_lock(&swhash->hlist_mutex);
39af6b16 8804 swhash->online = true;
4536e4d1 8805 if (swhash->hlist_refcount > 0) {
76e1d904
FW
8806 struct swevent_hlist *hlist;
8807
b28ab83c
PZ
8808 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
8809 WARN_ON(!hlist);
8810 rcu_assign_pointer(swhash->swevent_hlist, hlist);
76e1d904 8811 }
b28ab83c 8812 mutex_unlock(&swhash->hlist_mutex);
0793a61d
TG
8813}
8814
c277443c 8815#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
108b02cf 8816static void __perf_event_exit_context(void *__info)
0793a61d 8817{
226424ee 8818 struct remove_event re = { .detach_group = true };
108b02cf 8819 struct perf_event_context *ctx = __info;
0793a61d 8820
e3703f8c 8821 rcu_read_lock();
46ce0fe9
PZ
8822 list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
8823 __perf_remove_from_context(&re);
e3703f8c 8824 rcu_read_unlock();
0793a61d 8825}
108b02cf
PZ
8826
8827static void perf_event_exit_cpu_context(int cpu)
8828{
8829 struct perf_event_context *ctx;
8830 struct pmu *pmu;
8831 int idx;
8832
8833 idx = srcu_read_lock(&pmus_srcu);
8834 list_for_each_entry_rcu(pmu, &pmus, entry) {
917bdd1c 8835 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
108b02cf
PZ
8836
8837 mutex_lock(&ctx->mutex);
8838 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
8839 mutex_unlock(&ctx->mutex);
8840 }
8841 srcu_read_unlock(&pmus_srcu, idx);
108b02cf
PZ
8842}
8843
cdd6c482 8844static void perf_event_exit_cpu(int cpu)
0793a61d 8845{
b28ab83c 8846 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
d859e29f 8847
e3703f8c
PZ
8848 perf_event_exit_cpu_context(cpu);
8849
b28ab83c 8850 mutex_lock(&swhash->hlist_mutex);
39af6b16 8851 swhash->online = false;
b28ab83c
PZ
8852 swevent_hlist_release(swhash);
8853 mutex_unlock(&swhash->hlist_mutex);
0793a61d
TG
8854}
8855#else
cdd6c482 8856static inline void perf_event_exit_cpu(int cpu) { }
0793a61d
TG
8857#endif
8858
c277443c
PZ
8859static int
8860perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
8861{
8862 int cpu;
8863
8864 for_each_online_cpu(cpu)
8865 perf_event_exit_cpu(cpu);
8866
8867 return NOTIFY_OK;
8868}
8869
8870/*
8871 * Run the perf reboot notifier at the very last possible moment so that
8872 * the generic watchdog code runs as long as possible.
8873 */
8874static struct notifier_block perf_reboot_notifier = {
8875 .notifier_call = perf_reboot,
8876 .priority = INT_MIN,
8877};
8878
0db0628d 8879static int
0793a61d
TG
8880perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
8881{
8882 unsigned int cpu = (long)hcpu;
8883
4536e4d1 8884 switch (action & ~CPU_TASKS_FROZEN) {
0793a61d
TG
8885
8886 case CPU_UP_PREPARE:
5e11637e 8887 case CPU_DOWN_FAILED:
cdd6c482 8888 perf_event_init_cpu(cpu);
0793a61d
TG
8889 break;
8890
5e11637e 8891 case CPU_UP_CANCELED:
0793a61d 8892 case CPU_DOWN_PREPARE:
cdd6c482 8893 perf_event_exit_cpu(cpu);
0793a61d 8894 break;
0793a61d
TG
8895 default:
8896 break;
8897 }
8898
8899 return NOTIFY_OK;
8900}
8901
cdd6c482 8902void __init perf_event_init(void)
0793a61d 8903{
3c502e7a
JW
8904 int ret;
8905
2e80a82a
PZ
8906 idr_init(&pmu_idr);
8907
220b140b 8908 perf_event_init_all_cpus();
b0a873eb 8909 init_srcu_struct(&pmus_srcu);
2e80a82a
PZ
8910 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
8911 perf_pmu_register(&perf_cpu_clock, NULL, -1);
8912 perf_pmu_register(&perf_task_clock, NULL, -1);
b0a873eb
PZ
8913 perf_tp_register();
8914 perf_cpu_notifier(perf_cpu_notify);
c277443c 8915 register_reboot_notifier(&perf_reboot_notifier);
3c502e7a
JW
8916
8917 ret = init_hw_breakpoint();
8918 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
b2029520
GN
8919
8920 /* do not patch jump label more than once per second */
8921 jump_label_rate_limit(&perf_sched_events, HZ);
b01c3a00
JO
8922
8923 /*
8924 * Build time assertion that we keep the data_head at the intended
8925 * location. IOW, validation we got the __reserved[] size right.
8926 */
8927 BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
8928 != 1024);
0793a61d 8929}
abe43400 8930
fd979c01
CS
8931ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
8932 char *page)
8933{
8934 struct perf_pmu_events_attr *pmu_attr =
8935 container_of(attr, struct perf_pmu_events_attr, attr);
8936
8937 if (pmu_attr->event_str)
8938 return sprintf(page, "%s\n", pmu_attr->event_str);
8939
8940 return 0;
8941}
8942
abe43400
PZ
8943static int __init perf_event_sysfs_init(void)
8944{
8945 struct pmu *pmu;
8946 int ret;
8947
8948 mutex_lock(&pmus_lock);
8949
8950 ret = bus_register(&pmu_bus);
8951 if (ret)
8952 goto unlock;
8953
8954 list_for_each_entry(pmu, &pmus, entry) {
8955 if (!pmu->name || pmu->type < 0)
8956 continue;
8957
8958 ret = pmu_dev_alloc(pmu);
8959 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
8960 }
8961 pmu_bus_running = 1;
8962 ret = 0;
8963
8964unlock:
8965 mutex_unlock(&pmus_lock);
8966
8967 return ret;
8968}
8969device_initcall(perf_event_sysfs_init);
e5d1367f
SE
8970
8971#ifdef CONFIG_CGROUP_PERF
eb95419b
TH
8972static struct cgroup_subsys_state *
8973perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
e5d1367f
SE
8974{
8975 struct perf_cgroup *jc;
e5d1367f 8976
1b15d055 8977 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
e5d1367f
SE
8978 if (!jc)
8979 return ERR_PTR(-ENOMEM);
8980
e5d1367f
SE
8981 jc->info = alloc_percpu(struct perf_cgroup_info);
8982 if (!jc->info) {
8983 kfree(jc);
8984 return ERR_PTR(-ENOMEM);
8985 }
8986
e5d1367f
SE
8987 return &jc->css;
8988}
8989
eb95419b 8990static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
e5d1367f 8991{
eb95419b
TH
8992 struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
8993
e5d1367f
SE
8994 free_percpu(jc->info);
8995 kfree(jc);
8996}
8997
8998static int __perf_cgroup_move(void *info)
8999{
9000 struct task_struct *task = info;
9001 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
9002 return 0;
9003}
9004
eb95419b
TH
9005static void perf_cgroup_attach(struct cgroup_subsys_state *css,
9006 struct cgroup_taskset *tset)
e5d1367f 9007{
bb9d97b6
TH
9008 struct task_struct *task;
9009
924f0d9a 9010 cgroup_taskset_for_each(task, tset)
bb9d97b6 9011 task_function_call(task, __perf_cgroup_move, task);
e5d1367f
SE
9012}
9013
eb95419b
TH
9014static void perf_cgroup_exit(struct cgroup_subsys_state *css,
9015 struct cgroup_subsys_state *old_css,
761b3ef5 9016 struct task_struct *task)
e5d1367f
SE
9017{
9018 /*
9019 * cgroup_exit() is called in the copy_process() failure path.
9020 * Ignore this case since the task hasn't ran yet, this avoids
9021 * trying to poke a half freed task state from generic code.
9022 */
9023 if (!(task->flags & PF_EXITING))
9024 return;
9025
bb9d97b6 9026 task_function_call(task, __perf_cgroup_move, task);
e5d1367f
SE
9027}
9028
073219e9 9029struct cgroup_subsys perf_event_cgrp_subsys = {
92fb9748
TH
9030 .css_alloc = perf_cgroup_css_alloc,
9031 .css_free = perf_cgroup_css_free,
e7e7ee2e 9032 .exit = perf_cgroup_exit,
bb9d97b6 9033 .attach = perf_cgroup_attach,
e5d1367f
SE
9034};
9035#endif /* CONFIG_CGROUP_PERF */