perf_counter: optimize mmap/comm tracking
[linux-2.6-block.git] / kernel / perf_counter.c
0793a61d
TG
1/*
2 * Performance counter core code
3 *
4 * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
6 *
7b732a75
PZ
7 *
8 * For licensing details see kernel-base/COPYING
0793a61d
TG
9 */
10
11#include <linux/fs.h>
b9cacc7b 12#include <linux/mm.h>
0793a61d
TG
13#include <linux/cpu.h>
14#include <linux/smp.h>
04289bb9 15#include <linux/file.h>
0793a61d
TG
16#include <linux/poll.h>
17#include <linux/sysfs.h>
18#include <linux/ptrace.h>
19#include <linux/percpu.h>
b9cacc7b
PZ
20#include <linux/vmstat.h>
21#include <linux/hardirq.h>
22#include <linux/rculist.h>
0793a61d
TG
23#include <linux/uaccess.h>
24#include <linux/syscalls.h>
25#include <linux/anon_inodes.h>
aa9c4c0f 26#include <linux/kernel_stat.h>
0793a61d 27#include <linux/perf_counter.h>
0a4a9391 28#include <linux/dcache.h>
0793a61d 29
4e193bd4
TB
30#include <asm/irq_regs.h>
31
0793a61d
TG
32/*
33 * Each CPU has a list of per CPU counters:
34 */
35DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
36
088e2852 37int perf_max_counters __read_mostly = 1;
0793a61d
TG
38static int perf_reserved_percpu __read_mostly;
39static int perf_overcommit __read_mostly = 1;
40
9ee318a7
PZ
41static atomic_t nr_mmap_tracking __read_mostly;
42static atomic_t nr_munmap_tracking __read_mostly;
43static atomic_t nr_comm_tracking __read_mostly;
44
0793a61d
TG
45/*
46 * Mutex for (sysadmin-configurable) counter reservations:
47 */
48static DEFINE_MUTEX(perf_resource_mutex);
49
50/*
51 * Architecture provided APIs - weak aliases:
52 */
5c92d124 53extern __weak const struct hw_perf_counter_ops *
621a01ea 54hw_perf_counter_init(struct perf_counter *counter)
0793a61d 55{
ff6f0541 56 return NULL;
0793a61d
TG
57}
58
01b2838c 59u64 __weak hw_perf_save_disable(void) { return 0; }
01ea1cca 60void __weak hw_perf_restore(u64 ctrl) { barrier(); }
01d0287f 61void __weak hw_perf_counter_setup(int cpu) { barrier(); }
3cbed429
PM
62int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
63 struct perf_cpu_context *cpuctx,
64 struct perf_counter_context *ctx, int cpu)
65{
66 return 0;
67}
0793a61d 68
4eb96fcf
PM
69void __weak perf_counter_print_debug(void) { }
70
04289bb9
IM
71static void
72list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
73{
74 struct perf_counter *group_leader = counter->group_leader;
75
76 /*
77 * Depending on whether it is a standalone or sibling counter,
78 * add it straight to the context's counter list, or to the group
79 * leader's sibling list:
80 */
81 if (counter->group_leader == counter)
82 list_add_tail(&counter->list_entry, &ctx->counter_list);
5c148194 83 else {
04289bb9 84 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
5c148194
PZ
85 group_leader->nr_siblings++;
86 }
592903cd
PZ
87
88 list_add_rcu(&counter->event_entry, &ctx->event_list);
04289bb9
IM
89}
90
91static void
92list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
93{
94 struct perf_counter *sibling, *tmp;
95
96 list_del_init(&counter->list_entry);
592903cd 97 list_del_rcu(&counter->event_entry);
04289bb9 98
5c148194
PZ
99 if (counter->group_leader != counter)
100 counter->group_leader->nr_siblings--;
101
04289bb9
IM
102 /*
103 * If this was a group counter with sibling counters then
104 * upgrade the siblings to singleton counters by adding them
105 * to the context list directly:
106 */
107 list_for_each_entry_safe(sibling, tmp,
108 &counter->sibling_list, list_entry) {
109
75564232 110 list_move_tail(&sibling->list_entry, &ctx->counter_list);
04289bb9
IM
111 sibling->group_leader = sibling;
112 }
113}
114
3b6f9e5c
PM
115static void
116counter_sched_out(struct perf_counter *counter,
117 struct perf_cpu_context *cpuctx,
118 struct perf_counter_context *ctx)
119{
120 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
121 return;
122
123 counter->state = PERF_COUNTER_STATE_INACTIVE;
4af4998b 124 counter->tstamp_stopped = ctx->time;
3b6f9e5c
PM
125 counter->hw_ops->disable(counter);
126 counter->oncpu = -1;
127
128 if (!is_software_counter(counter))
129 cpuctx->active_oncpu--;
130 ctx->nr_active--;
131 if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
132 cpuctx->exclusive = 0;
133}
134
d859e29f
PM
135static void
136group_sched_out(struct perf_counter *group_counter,
137 struct perf_cpu_context *cpuctx,
138 struct perf_counter_context *ctx)
139{
140 struct perf_counter *counter;
141
142 if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
143 return;
144
145 counter_sched_out(group_counter, cpuctx, ctx);
146
147 /*
148 * Schedule out siblings (if any):
149 */
150 list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
151 counter_sched_out(counter, cpuctx, ctx);
152
153 if (group_counter->hw_event.exclusive)
154 cpuctx->exclusive = 0;
155}
156
0793a61d
TG
157/*
158 * Cross CPU call to remove a performance counter
159 *
160 * We disable the counter on the hardware level first. After that we
161 * remove it from the context list.
162 */
04289bb9 163static void __perf_counter_remove_from_context(void *info)
0793a61d
TG
164{
165 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
166 struct perf_counter *counter = info;
167 struct perf_counter_context *ctx = counter->ctx;
9b51f66d 168 unsigned long flags;
5c92d124 169 u64 perf_flags;
0793a61d
TG
170
171 /*
172 * If this is a task context, we need to check whether it is
173 * the current task context of this cpu. If not it has been
174 * scheduled out before the smp call arrived.
175 */
176 if (ctx->task && cpuctx->task_ctx != ctx)
177 return;
178
849691a6 179 spin_lock_irqsave(&ctx->lock, flags);
0793a61d 180
3b6f9e5c
PM
181 counter_sched_out(counter, cpuctx, ctx);
182
183 counter->task = NULL;
0793a61d
TG
184 ctx->nr_counters--;
185
186 /*
187 * Protect the list operation against NMI by disabling the
188 * counters on a global level. NOP for non NMI based counters.
189 */
01b2838c 190 perf_flags = hw_perf_save_disable();
04289bb9 191 list_del_counter(counter, ctx);
01b2838c 192 hw_perf_restore(perf_flags);
0793a61d
TG
193
194 if (!ctx->task) {
195 /*
196 * Allow more per task counters with respect to the
197 * reservation:
198 */
199 cpuctx->max_pertask =
200 min(perf_max_counters - ctx->nr_counters,
201 perf_max_counters - perf_reserved_percpu);
202 }
203
849691a6 204 spin_unlock_irqrestore(&ctx->lock, flags);
0793a61d
TG
205}
206
207
208/*
209 * Remove the counter from a task's (or a CPU's) list of counters.
210 *
d859e29f 211 * Must be called with counter->mutex and ctx->mutex held.
0793a61d
TG
212 *
213 * CPU counters are removed with a smp call. For task counters we only
214 * call when the task is on a CPU.
215 */
04289bb9 216static void perf_counter_remove_from_context(struct perf_counter *counter)
0793a61d
TG
217{
218 struct perf_counter_context *ctx = counter->ctx;
219 struct task_struct *task = ctx->task;
220
221 if (!task) {
222 /*
223 * Per cpu counters are removed via an smp call and
224 * the removal is always successful.
225 */
226 smp_call_function_single(counter->cpu,
04289bb9 227 __perf_counter_remove_from_context,
0793a61d
TG
228 counter, 1);
229 return;
230 }
231
232retry:
04289bb9 233 task_oncpu_function_call(task, __perf_counter_remove_from_context,
0793a61d
TG
234 counter);
235
236 spin_lock_irq(&ctx->lock);
237 /*
238 * If the context is active we need to retry the smp call.
239 */
04289bb9 240 if (ctx->nr_active && !list_empty(&counter->list_entry)) {
0793a61d
TG
241 spin_unlock_irq(&ctx->lock);
242 goto retry;
243 }
244
245 /*
246 * The lock prevents this context from being scheduled in, so we
04289bb9 247 * can remove the counter safely if the call above did not
0793a61d
TG
248 * succeed.
249 */
04289bb9 250 if (!list_empty(&counter->list_entry)) {
0793a61d 251 ctx->nr_counters--;
04289bb9 252 list_del_counter(counter, ctx);
0793a61d
TG
253 counter->task = NULL;
254 }
255 spin_unlock_irq(&ctx->lock);
256}
257
4af4998b 258static inline u64 perf_clock(void)
53cfbf59 259{
4af4998b 260 return cpu_clock(smp_processor_id());
53cfbf59
PM
261}
262
263/*
264 * Update the record of the current time in a context.
265 */
4af4998b 266static void update_context_time(struct perf_counter_context *ctx)
53cfbf59 267{
4af4998b
PZ
268 u64 now = perf_clock();
269
270 ctx->time += now - ctx->timestamp;
271 ctx->timestamp = now;
53cfbf59
PM
272}
273
274/*
275 * Update the total_time_enabled and total_time_running fields for a counter.
276 */
277static void update_counter_times(struct perf_counter *counter)
278{
279 struct perf_counter_context *ctx = counter->ctx;
280 u64 run_end;
281
4af4998b
PZ
282 if (counter->state < PERF_COUNTER_STATE_INACTIVE)
283 return;
284
285 counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
286
287 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
288 run_end = counter->tstamp_stopped;
289 else
290 run_end = ctx->time;
291
292 counter->total_time_running = run_end - counter->tstamp_running;
53cfbf59
PM
293}
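/*
 * Editor's note: a worked example of the accounting above (not part of
 * the original file). With ctx->time = 1000, tstamp_enabled = 200,
 * tstamp_running = 300 and an INACTIVE counter whose tstamp_stopped is
 * 800, the update yields:
 *
 *	total_time_enabled = 1000 - 200 = 800
 *	total_time_running =  800 - 300 = 500
 *
 * i.e. the counter was enabled for 800 time units but was on hardware
 * for only 500 of them; user space can scale the raw count by
 * enabled/running to estimate the un-multiplexed value.
 */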
294
295/*
296 * Update total_time_enabled and total_time_running for all counters in a group.
297 */
298static void update_group_times(struct perf_counter *leader)
299{
300 struct perf_counter *counter;
301
302 update_counter_times(leader);
303 list_for_each_entry(counter, &leader->sibling_list, list_entry)
304 update_counter_times(counter);
305}
306
d859e29f
PM
307/*
308 * Cross CPU call to disable a performance counter
309 */
310static void __perf_counter_disable(void *info)
311{
312 struct perf_counter *counter = info;
313 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
314 struct perf_counter_context *ctx = counter->ctx;
315 unsigned long flags;
316
317 /*
318 * If this is a per-task counter, need to check whether this
319 * counter's task is the current task on this cpu.
320 */
321 if (ctx->task && cpuctx->task_ctx != ctx)
322 return;
323
849691a6 324 spin_lock_irqsave(&ctx->lock, flags);
d859e29f
PM
325
326 /*
327 * If the counter is on, turn it off.
328 * If it is in error state, leave it in error state.
329 */
330 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
4af4998b 331 update_context_time(ctx);
53cfbf59 332 update_counter_times(counter);
d859e29f
PM
333 if (counter == counter->group_leader)
334 group_sched_out(counter, cpuctx, ctx);
335 else
336 counter_sched_out(counter, cpuctx, ctx);
337 counter->state = PERF_COUNTER_STATE_OFF;
338 }
339
849691a6 340 spin_unlock_irqrestore(&ctx->lock, flags);
d859e29f
PM
341}
342
343/*
344 * Disable a counter.
345 */
346static void perf_counter_disable(struct perf_counter *counter)
347{
348 struct perf_counter_context *ctx = counter->ctx;
349 struct task_struct *task = ctx->task;
350
351 if (!task) {
352 /*
353 * Disable the counter on the cpu that it's on
354 */
355 smp_call_function_single(counter->cpu, __perf_counter_disable,
356 counter, 1);
357 return;
358 }
359
360 retry:
361 task_oncpu_function_call(task, __perf_counter_disable, counter);
362
363 spin_lock_irq(&ctx->lock);
364 /*
365 * If the counter is still active, we need to retry the cross-call.
366 */
367 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
368 spin_unlock_irq(&ctx->lock);
369 goto retry;
370 }
371
372 /*
373 * Since we have the lock this context can't be scheduled
374 * in, so we can change the state safely.
375 */
53cfbf59
PM
376 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
377 update_counter_times(counter);
d859e29f 378 counter->state = PERF_COUNTER_STATE_OFF;
53cfbf59 379 }
d859e29f
PM
380
381 spin_unlock_irq(&ctx->lock);
382}
383
384/*
385 * Disable a counter and all its children.
386 */
387static void perf_counter_disable_family(struct perf_counter *counter)
388{
389 struct perf_counter *child;
390
391 perf_counter_disable(counter);
392
393 /*
394 * Lock the mutex to protect the list of children
395 */
396 mutex_lock(&counter->mutex);
397 list_for_each_entry(child, &counter->child_list, child_list)
398 perf_counter_disable(child);
399 mutex_unlock(&counter->mutex);
400}
401
235c7fc7
IM
402static int
403counter_sched_in(struct perf_counter *counter,
404 struct perf_cpu_context *cpuctx,
405 struct perf_counter_context *ctx,
406 int cpu)
407{
3b6f9e5c 408 if (counter->state <= PERF_COUNTER_STATE_OFF)
235c7fc7
IM
409 return 0;
410
411 counter->state = PERF_COUNTER_STATE_ACTIVE;
412 counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
413 /*
414 * The new state must be visible before we turn it on in the hardware:
415 */
416 smp_wmb();
417
418 if (counter->hw_ops->enable(counter)) {
419 counter->state = PERF_COUNTER_STATE_INACTIVE;
420 counter->oncpu = -1;
421 return -EAGAIN;
422 }
423
4af4998b 424 counter->tstamp_running += ctx->time - counter->tstamp_stopped;
53cfbf59 425
3b6f9e5c
PM
426 if (!is_software_counter(counter))
427 cpuctx->active_oncpu++;
235c7fc7
IM
428 ctx->nr_active++;
429
3b6f9e5c
PM
430 if (counter->hw_event.exclusive)
431 cpuctx->exclusive = 1;
432
235c7fc7
IM
433 return 0;
434}
435
3b6f9e5c
PM
436/*
437 * Return 1 for a group consisting entirely of software counters,
438 * 0 if the group contains any hardware counters.
439 */
440static int is_software_only_group(struct perf_counter *leader)
441{
442 struct perf_counter *counter;
443
444 if (!is_software_counter(leader))
445 return 0;
5c148194 446
3b6f9e5c
PM
447 list_for_each_entry(counter, &leader->sibling_list, list_entry)
448 if (!is_software_counter(counter))
449 return 0;
5c148194 450
3b6f9e5c
PM
451 return 1;
452}
453
454/*
455 * Work out whether we can put this counter group on the CPU now.
456 */
457static int group_can_go_on(struct perf_counter *counter,
458 struct perf_cpu_context *cpuctx,
459 int can_add_hw)
460{
461 /*
462 * Groups consisting entirely of software counters can always go on.
463 */
464 if (is_software_only_group(counter))
465 return 1;
466 /*
467 * If an exclusive group is already on, no other hardware
468 * counters can go on.
469 */
470 if (cpuctx->exclusive)
471 return 0;
472 /*
473 * If this group is exclusive and there are already
474 * counters on the CPU, it can't go on.
475 */
476 if (counter->hw_event.exclusive && cpuctx->active_oncpu)
477 return 0;
478 /*
479 * Otherwise, try to add it if all previous groups were able
480 * to go on.
481 */
482 return can_add_hw;
483}
484
53cfbf59
PM
485static void add_counter_to_ctx(struct perf_counter *counter,
486 struct perf_counter_context *ctx)
487{
488 list_add_counter(counter, ctx);
489 ctx->nr_counters++;
490 counter->prev_state = PERF_COUNTER_STATE_OFF;
4af4998b
PZ
491 counter->tstamp_enabled = ctx->time;
492 counter->tstamp_running = ctx->time;
493 counter->tstamp_stopped = ctx->time;
53cfbf59
PM
494}
495
0793a61d 496/*
235c7fc7 497 * Cross CPU call to install and enable a performance counter
0793a61d
TG
498 */
499static void __perf_install_in_context(void *info)
500{
501 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
502 struct perf_counter *counter = info;
503 struct perf_counter_context *ctx = counter->ctx;
d859e29f 504 struct perf_counter *leader = counter->group_leader;
0793a61d 505 int cpu = smp_processor_id();
9b51f66d 506 unsigned long flags;
5c92d124 507 u64 perf_flags;
3b6f9e5c 508 int err;
0793a61d
TG
509
510 /*
511 * If this is a task context, we need to check whether it is
512 * the current task context of this cpu. If not it has been
513 * scheduled out before the smp call arrived.
514 */
515 if (ctx->task && cpuctx->task_ctx != ctx)
516 return;
517
849691a6 518 spin_lock_irqsave(&ctx->lock, flags);
4af4998b 519 update_context_time(ctx);
0793a61d
TG
520
521 /*
522 * Protect the list operation against NMI by disabling the
523 * counters on a global level. NOP for non NMI based counters.
524 */
01b2838c 525 perf_flags = hw_perf_save_disable();
0793a61d 526
53cfbf59 527 add_counter_to_ctx(counter, ctx);
0793a61d 528
d859e29f
PM
529 /*
530 * Don't put the counter on if it is disabled or if
531 * it is in a group and the group isn't on.
532 */
533 if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
534 (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
535 goto unlock;
536
3b6f9e5c
PM
537 /*
538 * An exclusive counter can't go on if there are already active
539 * hardware counters, and no hardware counter can go on if there
540 * is already an exclusive counter on.
541 */
d859e29f 542 if (!group_can_go_on(counter, cpuctx, 1))
3b6f9e5c
PM
543 err = -EEXIST;
544 else
545 err = counter_sched_in(counter, cpuctx, ctx, cpu);
546
d859e29f
PM
547 if (err) {
548 /*
549 * This counter couldn't go on. If it is in a group
550 * then we have to pull the whole group off.
551 * If the counter group is pinned then put it in error state.
552 */
553 if (leader != counter)
554 group_sched_out(leader, cpuctx, ctx);
53cfbf59
PM
555 if (leader->hw_event.pinned) {
556 update_group_times(leader);
d859e29f 557 leader->state = PERF_COUNTER_STATE_ERROR;
53cfbf59 558 }
d859e29f 559 }
0793a61d 560
3b6f9e5c 561 if (!err && !ctx->task && cpuctx->max_pertask)
0793a61d
TG
562 cpuctx->max_pertask--;
563
d859e29f 564 unlock:
235c7fc7
IM
565 hw_perf_restore(perf_flags);
566
849691a6 567 spin_unlock_irqrestore(&ctx->lock, flags);
0793a61d
TG
568}
569
570/*
571 * Attach a performance counter to a context
572 *
573 * First we add the counter to the list with the hardware enable bit
574 * in counter->hw_config cleared.
575 *
576 * If the counter is attached to a task which is on a CPU we use a smp
577 * call to enable it in the task context. The task might have been
578 * scheduled away, but we check this in the smp call again.
d859e29f
PM
579 *
580 * Must be called with ctx->mutex held.
0793a61d
TG
581 */
582static void
583perf_install_in_context(struct perf_counter_context *ctx,
584 struct perf_counter *counter,
585 int cpu)
586{
587 struct task_struct *task = ctx->task;
588
0793a61d
TG
589 if (!task) {
590 /*
591 * Per cpu counters are installed via an smp call and
592 * the install is always successful.
593 */
594 smp_call_function_single(cpu, __perf_install_in_context,
595 counter, 1);
596 return;
597 }
598
599 counter->task = task;
600retry:
601 task_oncpu_function_call(task, __perf_install_in_context,
602 counter);
603
604 spin_lock_irq(&ctx->lock);
605 /*
0793a61d
TG
606 * If the context is active we need to retry the smp call.
607 */
d859e29f 608 if (ctx->is_active && list_empty(&counter->list_entry)) {
0793a61d
TG
609 spin_unlock_irq(&ctx->lock);
610 goto retry;
611 }
612
613 /*
614 * The lock prevents this context from being scheduled in, so we
615 * can add the counter safely if the call above did not
616 * succeed.
617 */
53cfbf59
PM
618 if (list_empty(&counter->list_entry))
619 add_counter_to_ctx(counter, ctx);
0793a61d
TG
620 spin_unlock_irq(&ctx->lock);
621}
622
d859e29f
PM
623/*
624 * Cross CPU call to enable a performance counter
625 */
626static void __perf_counter_enable(void *info)
04289bb9 627{
d859e29f
PM
628 struct perf_counter *counter = info;
629 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
630 struct perf_counter_context *ctx = counter->ctx;
631 struct perf_counter *leader = counter->group_leader;
632 unsigned long flags;
633 int err;
04289bb9 634
d859e29f
PM
635 /*
636 * If this is a per-task counter, need to check whether this
637 * counter's task is the current task on this cpu.
638 */
639 if (ctx->task && cpuctx->task_ctx != ctx)
3cbed429
PM
640 return;
641
849691a6 642 spin_lock_irqsave(&ctx->lock, flags);
4af4998b 643 update_context_time(ctx);
d859e29f 644
c07c99b6 645 counter->prev_state = counter->state;
d859e29f
PM
646 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
647 goto unlock;
648 counter->state = PERF_COUNTER_STATE_INACTIVE;
4af4998b 649 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
04289bb9
IM
650
651 /*
d859e29f
PM
652 * If the counter is in a group and isn't the group leader,
653 * then don't put it on unless the group is on.
04289bb9 654 */
d859e29f
PM
655 if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
656 goto unlock;
3b6f9e5c 657
d859e29f
PM
658 if (!group_can_go_on(counter, cpuctx, 1))
659 err = -EEXIST;
660 else
661 err = counter_sched_in(counter, cpuctx, ctx,
662 smp_processor_id());
663
664 if (err) {
665 /*
666 * If this counter can't go on and it's part of a
667 * group, then the whole group has to come off.
668 */
669 if (leader != counter)
670 group_sched_out(leader, cpuctx, ctx);
53cfbf59
PM
671 if (leader->hw_event.pinned) {
672 update_group_times(leader);
d859e29f 673 leader->state = PERF_COUNTER_STATE_ERROR;
53cfbf59 674 }
d859e29f
PM
675 }
676
677 unlock:
849691a6 678 spin_unlock_irqrestore(&ctx->lock, flags);
d859e29f
PM
679}
680
681/*
682 * Enable a counter.
683 */
684static void perf_counter_enable(struct perf_counter *counter)
685{
686 struct perf_counter_context *ctx = counter->ctx;
687 struct task_struct *task = ctx->task;
688
689 if (!task) {
690 /*
691 * Enable the counter on the cpu that it's on
692 */
693 smp_call_function_single(counter->cpu, __perf_counter_enable,
694 counter, 1);
695 return;
696 }
697
698 spin_lock_irq(&ctx->lock);
699 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
700 goto out;
701
702 /*
703 * If the counter is in error state, clear that first.
704 * That way, if we see the counter in error state below, we
705 * know that it has gone back into error state, as distinct
706 * from the task having been scheduled away before the
707 * cross-call arrived.
708 */
709 if (counter->state == PERF_COUNTER_STATE_ERROR)
710 counter->state = PERF_COUNTER_STATE_OFF;
711
712 retry:
713 spin_unlock_irq(&ctx->lock);
714 task_oncpu_function_call(task, __perf_counter_enable, counter);
715
716 spin_lock_irq(&ctx->lock);
717
718 /*
719 * If the context is active and the counter is still off,
720 * we need to retry the cross-call.
721 */
722 if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
723 goto retry;
724
725 /*
726 * Since we have the lock this context can't be scheduled
727 * in, so we can change the state safely.
728 */
53cfbf59 729 if (counter->state == PERF_COUNTER_STATE_OFF) {
d859e29f 730 counter->state = PERF_COUNTER_STATE_INACTIVE;
4af4998b
PZ
731 counter->tstamp_enabled =
732 ctx->time - counter->total_time_enabled;
53cfbf59 733 }
d859e29f
PM
734 out:
735 spin_unlock_irq(&ctx->lock);
736}
737
79f14641
PZ
738static void perf_counter_refresh(struct perf_counter *counter, int refresh)
739{
740 atomic_add(refresh, &counter->event_limit);
741 perf_counter_enable(counter);
742}
743
d859e29f
PM
744/*
745 * Enable a counter and all its children.
746 */
747static void perf_counter_enable_family(struct perf_counter *counter)
748{
749 struct perf_counter *child;
750
751 perf_counter_enable(counter);
752
753 /*
754 * Lock the mutex to protect the list of children
755 */
756 mutex_lock(&counter->mutex);
757 list_for_each_entry(child, &counter->child_list, child_list)
758 perf_counter_enable(child);
759 mutex_unlock(&counter->mutex);
04289bb9
IM
760}
761
235c7fc7
IM
762void __perf_counter_sched_out(struct perf_counter_context *ctx,
763 struct perf_cpu_context *cpuctx)
764{
765 struct perf_counter *counter;
3cbed429 766 u64 flags;
235c7fc7 767
d859e29f
PM
768 spin_lock(&ctx->lock);
769 ctx->is_active = 0;
235c7fc7 770 if (likely(!ctx->nr_counters))
d859e29f 771 goto out;
4af4998b 772 update_context_time(ctx);
235c7fc7 773
3cbed429 774 flags = hw_perf_save_disable();
235c7fc7
IM
775 if (ctx->nr_active) {
776 list_for_each_entry(counter, &ctx->counter_list, list_entry)
777 group_sched_out(counter, cpuctx, ctx);
778 }
3cbed429 779 hw_perf_restore(flags);
d859e29f 780 out:
235c7fc7
IM
781 spin_unlock(&ctx->lock);
782}
783
0793a61d
TG
784/*
785 * Called from scheduler to remove the counters of the current task,
786 * with interrupts disabled.
787 *
788 * We stop each counter and update the counter value in counter->count.
789 *
7671581f 790 * This does not protect us against NMI, but disable()
0793a61d
TG
791 * sets the disabled bit in the control field of counter _before_
792 * accessing the counter control register. If an NMI hits, then it will
793 * not restart the counter.
794 */
795void perf_counter_task_sched_out(struct task_struct *task, int cpu)
796{
797 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
798 struct perf_counter_context *ctx = &task->perf_counter_ctx;
4a0deca6 799 struct pt_regs *regs;
0793a61d
TG
800
801 if (likely(!cpuctx->task_ctx))
802 return;
803
bce379bf
PZ
804 update_context_time(ctx);
805
4a0deca6 806 regs = task_pt_regs(task);
78f13e95 807 perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
235c7fc7
IM
808 __perf_counter_sched_out(ctx, cpuctx);
809
0793a61d
TG
810 cpuctx->task_ctx = NULL;
811}
812
235c7fc7 813static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
04289bb9 814{
235c7fc7 815 __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
04289bb9
IM
816}
817
7995888f 818static int
04289bb9
IM
819group_sched_in(struct perf_counter *group_counter,
820 struct perf_cpu_context *cpuctx,
821 struct perf_counter_context *ctx,
822 int cpu)
823{
95cdd2e7 824 struct perf_counter *counter, *partial_group;
3cbed429
PM
825 int ret;
826
827 if (group_counter->state == PERF_COUNTER_STATE_OFF)
828 return 0;
829
830 ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
831 if (ret)
832 return ret < 0 ? ret : 0;
04289bb9 833
c07c99b6 834 group_counter->prev_state = group_counter->state;
95cdd2e7
IM
835 if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
836 return -EAGAIN;
04289bb9
IM
837
838 /*
839 * Schedule in siblings as one group (if any):
840 */
7995888f 841 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
c07c99b6 842 counter->prev_state = counter->state;
95cdd2e7
IM
843 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
844 partial_group = counter;
845 goto group_error;
846 }
95cdd2e7
IM
847 }
848
3cbed429 849 return 0;
95cdd2e7
IM
850
851group_error:
852 /*
853 * Groups can be scheduled in as one unit only, so undo any
854 * partial group before returning:
855 */
856 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
857 if (counter == partial_group)
858 break;
859 counter_sched_out(counter, cpuctx, ctx);
7995888f 860 }
95cdd2e7 861 counter_sched_out(group_counter, cpuctx, ctx);
7995888f 862
95cdd2e7 863 return -EAGAIN;
04289bb9
IM
864}
865
235c7fc7
IM
866static void
867__perf_counter_sched_in(struct perf_counter_context *ctx,
868 struct perf_cpu_context *cpuctx, int cpu)
0793a61d 869{
0793a61d 870 struct perf_counter *counter;
3cbed429 871 u64 flags;
dd0e6ba2 872 int can_add_hw = 1;
0793a61d 873
d859e29f
PM
874 spin_lock(&ctx->lock);
875 ctx->is_active = 1;
0793a61d 876 if (likely(!ctx->nr_counters))
d859e29f 877 goto out;
0793a61d 878
4af4998b 879 ctx->timestamp = perf_clock();
53cfbf59 880
3cbed429 881 flags = hw_perf_save_disable();
3b6f9e5c
PM
882
883 /*
884 * First go through the list and put on any pinned groups
885 * in order to give them the best chance of going on.
886 */
887 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
888 if (counter->state <= PERF_COUNTER_STATE_OFF ||
889 !counter->hw_event.pinned)
890 continue;
891 if (counter->cpu != -1 && counter->cpu != cpu)
892 continue;
893
894 if (group_can_go_on(counter, cpuctx, 1))
895 group_sched_in(counter, cpuctx, ctx, cpu);
896
897 /*
898 * If this pinned group hasn't been scheduled,
899 * put it in error state.
900 */
53cfbf59
PM
901 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
902 update_group_times(counter);
3b6f9e5c 903 counter->state = PERF_COUNTER_STATE_ERROR;
53cfbf59 904 }
3b6f9e5c
PM
905 }
906
04289bb9 907 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
3b6f9e5c
PM
908 /*
909 * Ignore counters in OFF or ERROR state, and
910 * ignore pinned counters since we did them already.
911 */
912 if (counter->state <= PERF_COUNTER_STATE_OFF ||
913 counter->hw_event.pinned)
914 continue;
915
04289bb9
IM
916 /*
917 * Listen to the 'cpu' scheduling filter constraint
918 * of counters:
919 */
0793a61d
TG
920 if (counter->cpu != -1 && counter->cpu != cpu)
921 continue;
922
3b6f9e5c 923 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
dd0e6ba2
PM
924 if (group_sched_in(counter, cpuctx, ctx, cpu))
925 can_add_hw = 0;
3b6f9e5c 926 }
0793a61d 927 }
3cbed429 928 hw_perf_restore(flags);
d859e29f 929 out:
0793a61d 930 spin_unlock(&ctx->lock);
235c7fc7
IM
931}
932
933/*
934 * Called from scheduler to add the counters of the current task
935 * with interrupts disabled.
936 *
937 * We restore the counter value and then enable it.
938 *
939 * This does not protect us against NMI, but enable()
940 * sets the enabled bit in the control field of counter _before_
941 * accessing the counter control register. If an NMI hits, then it will
942 * keep the counter running.
943 */
944void perf_counter_task_sched_in(struct task_struct *task, int cpu)
945{
946 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
947 struct perf_counter_context *ctx = &task->perf_counter_ctx;
04289bb9 948
235c7fc7 949 __perf_counter_sched_in(ctx, cpuctx, cpu);
0793a61d
TG
950 cpuctx->task_ctx = ctx;
951}
952
235c7fc7
IM
953static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
954{
955 struct perf_counter_context *ctx = &cpuctx->ctx;
956
957 __perf_counter_sched_in(ctx, cpuctx, cpu);
958}
959
1d1c7ddb
IM
960int perf_counter_task_disable(void)
961{
962 struct task_struct *curr = current;
963 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
964 struct perf_counter *counter;
aa9c4c0f 965 unsigned long flags;
1d1c7ddb
IM
966 u64 perf_flags;
967 int cpu;
968
969 if (likely(!ctx->nr_counters))
970 return 0;
971
849691a6 972 local_irq_save(flags);
1d1c7ddb
IM
973 cpu = smp_processor_id();
974
975 perf_counter_task_sched_out(curr, cpu);
976
977 spin_lock(&ctx->lock);
978
979 /*
980 * Disable all the counters:
981 */
982 perf_flags = hw_perf_save_disable();
983
3b6f9e5c 984 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
53cfbf59
PM
985 if (counter->state != PERF_COUNTER_STATE_ERROR) {
986 update_group_times(counter);
3b6f9e5c 987 counter->state = PERF_COUNTER_STATE_OFF;
53cfbf59 988 }
3b6f9e5c 989 }
9b51f66d 990
1d1c7ddb
IM
991 hw_perf_restore(perf_flags);
992
849691a6 993 spin_unlock_irqrestore(&ctx->lock, flags);
1d1c7ddb
IM
994
995 return 0;
996}
997
998int perf_counter_task_enable(void)
999{
1000 struct task_struct *curr = current;
1001 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
1002 struct perf_counter *counter;
aa9c4c0f 1003 unsigned long flags;
1d1c7ddb
IM
1004 u64 perf_flags;
1005 int cpu;
1006
1007 if (likely(!ctx->nr_counters))
1008 return 0;
1009
849691a6 1010 local_irq_save(flags);
1d1c7ddb
IM
1011 cpu = smp_processor_id();
1012
235c7fc7
IM
1013 perf_counter_task_sched_out(curr, cpu);
1014
1d1c7ddb
IM
1015 spin_lock(&ctx->lock);
1016
1017 /*
1018 * Enable all the counters:
1019 */
1020 perf_flags = hw_perf_save_disable();
1021
1022 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
3b6f9e5c 1023 if (counter->state > PERF_COUNTER_STATE_OFF)
1d1c7ddb 1024 continue;
6a930700 1025 counter->state = PERF_COUNTER_STATE_INACTIVE;
4af4998b
PZ
1026 counter->tstamp_enabled =
1027 ctx->time - counter->total_time_enabled;
aa9c4c0f 1028 counter->hw_event.disabled = 0;
1d1c7ddb
IM
1029 }
1030 hw_perf_restore(perf_flags);
1031
1032 spin_unlock(&ctx->lock);
1033
1034 perf_counter_task_sched_in(curr, cpu);
1035
849691a6 1036 local_irq_restore(flags);
1d1c7ddb
IM
1037
1038 return 0;
1039}
1040
235c7fc7
IM
1041/*
1042 * Round-robin a context's counters:
1043 */
1044static void rotate_ctx(struct perf_counter_context *ctx)
0793a61d 1045{
0793a61d 1046 struct perf_counter *counter;
5c92d124 1047 u64 perf_flags;
0793a61d 1048
235c7fc7 1049 if (!ctx->nr_counters)
0793a61d
TG
1050 return;
1051
0793a61d 1052 spin_lock(&ctx->lock);
0793a61d 1053 /*
04289bb9 1054 * Rotate the first entry last (works just fine for group counters too):
0793a61d 1055 */
01b2838c 1056 perf_flags = hw_perf_save_disable();
04289bb9 1057 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
75564232 1058 list_move_tail(&counter->list_entry, &ctx->counter_list);
0793a61d
TG
1059 break;
1060 }
01b2838c 1061 hw_perf_restore(perf_flags);
0793a61d
TG
1062
1063 spin_unlock(&ctx->lock);
235c7fc7
IM
1064}
1065
1066void perf_counter_task_tick(struct task_struct *curr, int cpu)
1067{
1068 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1069 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
1070 const int rotate_percpu = 0;
1071
1072 if (rotate_percpu)
1073 perf_counter_cpu_sched_out(cpuctx);
1074 perf_counter_task_sched_out(curr, cpu);
0793a61d 1075
235c7fc7
IM
1076 if (rotate_percpu)
1077 rotate_ctx(&cpuctx->ctx);
1078 rotate_ctx(ctx);
1079
1080 if (rotate_percpu)
1081 perf_counter_cpu_sched_in(cpuctx, cpu);
0793a61d
TG
1082 perf_counter_task_sched_in(curr, cpu);
1083}
1084
0793a61d
TG
1085/*
1086 * Cross CPU call to read the hardware counter
1087 */
7671581f 1088static void __read(void *info)
0793a61d 1089{
621a01ea 1090 struct perf_counter *counter = info;
53cfbf59 1091 struct perf_counter_context *ctx = counter->ctx;
aa9c4c0f 1092 unsigned long flags;
621a01ea 1093
849691a6 1094 local_irq_save(flags);
53cfbf59 1095 if (ctx->is_active)
4af4998b 1096 update_context_time(ctx);
7671581f 1097 counter->hw_ops->read(counter);
53cfbf59 1098 update_counter_times(counter);
849691a6 1099 local_irq_restore(flags);
0793a61d
TG
1100}
1101
04289bb9 1102static u64 perf_counter_read(struct perf_counter *counter)
0793a61d
TG
1103{
1104 /*
1105 * If counter is enabled and currently active on a CPU, update the
1106 * value in the counter structure:
1107 */
6a930700 1108 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
0793a61d 1109 smp_call_function_single(counter->oncpu,
7671581f 1110 __read, counter, 1);
53cfbf59
PM
1111 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1112 update_counter_times(counter);
0793a61d
TG
1113 }
1114
ee06094f 1115 return atomic64_read(&counter->count);
0793a61d
TG
1116}
1117
0793a61d
TG
1118static void put_context(struct perf_counter_context *ctx)
1119{
1120 if (ctx->task)
1121 put_task_struct(ctx->task);
1122}
1123
1124static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1125{
1126 struct perf_cpu_context *cpuctx;
1127 struct perf_counter_context *ctx;
1128 struct task_struct *task;
1129
1130 /*
1131 * If cpu is not a wildcard then this is a percpu counter:
1132 */
1133 if (cpu != -1) {
1134 /* Must be root to operate on a CPU counter: */
1135 if (!capable(CAP_SYS_ADMIN))
1136 return ERR_PTR(-EACCES);
1137
1138 if (cpu < 0 || cpu > num_possible_cpus())
1139 return ERR_PTR(-EINVAL);
1140
1141 /*
1142 * We could be clever and allow to attach a counter to an
1143 * offline CPU and activate it when the CPU comes up, but
1144 * that's for later.
1145 */
1146 if (!cpu_isset(cpu, cpu_online_map))
1147 return ERR_PTR(-ENODEV);
1148
1149 cpuctx = &per_cpu(perf_cpu_context, cpu);
1150 ctx = &cpuctx->ctx;
1151
0793a61d
TG
1152 return ctx;
1153 }
1154
1155 rcu_read_lock();
1156 if (!pid)
1157 task = current;
1158 else
1159 task = find_task_by_vpid(pid);
1160 if (task)
1161 get_task_struct(task);
1162 rcu_read_unlock();
1163
1164 if (!task)
1165 return ERR_PTR(-ESRCH);
1166
1167 ctx = &task->perf_counter_ctx;
1168 ctx->task = task;
1169
1170 /* Reuse ptrace permission checks for now. */
1171 if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
1172 put_context(ctx);
1173 return ERR_PTR(-EACCES);
1174 }
1175
1176 return ctx;
1177}
1178
592903cd
PZ
1179static void free_counter_rcu(struct rcu_head *head)
1180{
1181 struct perf_counter *counter;
1182
1183 counter = container_of(head, struct perf_counter, rcu_head);
1184 kfree(counter);
1185}
1186
925d519a
PZ
1187static void perf_pending_sync(struct perf_counter *counter);
1188
f1600952
PZ
1189static void free_counter(struct perf_counter *counter)
1190{
925d519a
PZ
1191 perf_pending_sync(counter);
1192
9ee318a7
PZ
1193 if (counter->hw_event.mmap)
1194 atomic_dec(&nr_mmap_tracking);
1195 if (counter->hw_event.munmap)
1196 atomic_dec(&nr_munmap_tracking);
1197 if (counter->hw_event.comm)
1198 atomic_dec(&nr_comm_tracking);
1199
e077df4f
PZ
1200 if (counter->destroy)
1201 counter->destroy(counter);
1202
f1600952
PZ
1203 call_rcu(&counter->rcu_head, free_counter_rcu);
1204}
1205
0793a61d
TG
1206/*
1207 * Called when the last reference to the file is gone.
1208 */
1209static int perf_release(struct inode *inode, struct file *file)
1210{
1211 struct perf_counter *counter = file->private_data;
1212 struct perf_counter_context *ctx = counter->ctx;
1213
1214 file->private_data = NULL;
1215
d859e29f 1216 mutex_lock(&ctx->mutex);
0793a61d
TG
1217 mutex_lock(&counter->mutex);
1218
04289bb9 1219 perf_counter_remove_from_context(counter);
0793a61d
TG
1220
1221 mutex_unlock(&counter->mutex);
d859e29f 1222 mutex_unlock(&ctx->mutex);
0793a61d 1223
f1600952 1224 free_counter(counter);
5af75917 1225 put_context(ctx);
0793a61d
TG
1226
1227 return 0;
1228}
1229
1230/*
1231 * Read the performance counter - simple non blocking version for now
1232 */
1233static ssize_t
1234perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1235{
53cfbf59
PM
1236 u64 values[3];
1237 int n;
0793a61d 1238
3b6f9e5c
PM
1239 /*
1240 * Return end-of-file for a read on a counter that is in
1241 * error state (i.e. because it was pinned but it couldn't be
1242 * scheduled on to the CPU at some point).
1243 */
1244 if (counter->state == PERF_COUNTER_STATE_ERROR)
1245 return 0;
1246
0793a61d 1247 mutex_lock(&counter->mutex);
53cfbf59
PM
1248 values[0] = perf_counter_read(counter);
1249 n = 1;
1250 if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1251 values[n++] = counter->total_time_enabled +
1252 atomic64_read(&counter->child_total_time_enabled);
1253 if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1254 values[n++] = counter->total_time_running +
1255 atomic64_read(&counter->child_total_time_running);
0793a61d
TG
1256 mutex_unlock(&counter->mutex);
1257
53cfbf59
PM
1258 if (count < n * sizeof(u64))
1259 return -EINVAL;
1260 count = n * sizeof(u64);
1261
1262 if (copy_to_user(buf, values, count))
1263 return -EFAULT;
1264
1265 return count;
0793a61d
TG
1266}
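/*
 * Editor's sketch (not part of the original file): how user space might
 * consume this read format, assuming the fd came from the
 * perf_counter_open() syscall with both PERF_FORMAT_TOTAL_TIME_ENABLED
 * and PERF_FORMAT_TOTAL_TIME_RUNNING set in hw_event.read_format:
 *
 *	u64 values[3];
 *
 *	if (read(fd, values, sizeof(values)) == sizeof(values)) {
 *		u64 count   = values[0];
 *		u64 enabled = values[1];
 *		u64 running = values[2];
 *
 *		if (running)
 *			count = count * enabled / running;
 *	}
 *
 * With neither format bit set, only values[0] is written and an 8-byte
 * read suffices.
 */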
1267
0793a61d
TG
1268static ssize_t
1269perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1270{
1271 struct perf_counter *counter = file->private_data;
1272
7b732a75 1273 return perf_read_hw(counter, buf, count);
0793a61d
TG
1274}
1275
1276static unsigned int perf_poll(struct file *file, poll_table *wait)
1277{
1278 struct perf_counter *counter = file->private_data;
c7138f37
PZ
1279 struct perf_mmap_data *data;
1280 unsigned int events;
1281
1282 rcu_read_lock();
1283 data = rcu_dereference(counter->data);
1284 if (data)
1285 events = atomic_xchg(&data->wakeup, 0);
1286 else
1287 events = POLL_HUP;
1288 rcu_read_unlock();
0793a61d
TG
1289
1290 poll_wait(file, &counter->waitq, wait);
1291
0793a61d
TG
1292 return events;
1293}
1294
d859e29f
PM
1295static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1296{
1297 struct perf_counter *counter = file->private_data;
1298 int err = 0;
1299
1300 switch (cmd) {
1301 case PERF_COUNTER_IOC_ENABLE:
1302 perf_counter_enable_family(counter);
1303 break;
1304 case PERF_COUNTER_IOC_DISABLE:
1305 perf_counter_disable_family(counter);
1306 break;
79f14641
PZ
1307 case PERF_COUNTER_IOC_REFRESH:
1308 perf_counter_refresh(counter, arg);
1309 break;
d859e29f
PM
1310 default:
1311 err = -ENOTTY;
1312 }
1313 return err;
1314}
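/*
 * Editor's sketch (not part of the original file): user-space usage of
 * these ioctls on a counter fd, e.g. to restrict counting to a region
 * of interest. PERF_COUNTER_IOC_REFRESH adds its argument to the
 * counter's event_limit and re-enables it (see perf_counter_refresh()
 * above):
 *
 *	ioctl(fd, PERF_COUNTER_IOC_DISABLE);
 *	setup_workload();		(hypothetical helper)
 *	ioctl(fd, PERF_COUNTER_IOC_ENABLE);
 *	run_measured_region();		(hypothetical helper)
 *	ioctl(fd, PERF_COUNTER_IOC_DISABLE);
 */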
1315
38ff667b
PZ
1316/*
1317 * Callers need to ensure there can be no nesting of this function, otherwise
1318 * the seqlock logic goes bad. We cannot serialize this because the arch
1319 * code calls this from NMI context.
1320 */
1321void perf_counter_update_userpage(struct perf_counter *counter)
37d81828 1322{
38ff667b
PZ
1323 struct perf_mmap_data *data;
1324 struct perf_counter_mmap_page *userpg;
1325
1326 rcu_read_lock();
1327 data = rcu_dereference(counter->data);
1328 if (!data)
1329 goto unlock;
1330
1331 userpg = data->user_page;
37d81828 1332
7b732a75
PZ
1333 /*
1334 * Disable preemption so as to not let the corresponding user-space
1335 * spin too long if we get preempted.
1336 */
1337 preempt_disable();
37d81828 1338 ++userpg->lock;
92f22a38 1339 barrier();
37d81828
PM
1340 userpg->index = counter->hw.idx;
1341 userpg->offset = atomic64_read(&counter->count);
1342 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
1343 userpg->offset -= atomic64_read(&counter->hw.prev_count);
7b732a75 1344
92f22a38 1345 barrier();
37d81828 1346 ++userpg->lock;
7b732a75 1347 preempt_enable();
38ff667b 1348unlock:
7b732a75 1349 rcu_read_unlock();
37d81828
PM
1350}
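/*
 * Editor's sketch (not part of the original file): the matching
 * user-space side of the ->lock protocol above. 'pg' is assumed to be
 * the struct perf_counter_mmap_page at the start of the mmap()ed area;
 * the fields are read seqlock-style and the read retried while an
 * update is observed:
 *
 *	u32 seq, idx;
 *	s64 offset;
 *
 *	do {
 *		seq = pg->lock;
 *		barrier();
 *		idx    = pg->index;
 *		offset = pg->offset;
 *		barrier();
 *	} while (pg->lock != seq);
 *
 * If the counter is currently active, 'offset' plus the value read from
 * the hardware counter selected by 'idx' gives an up-to-date count;
 * otherwise 'offset' alone is the count.
 */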
1351
1352static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1353{
1354 struct perf_counter *counter = vma->vm_file->private_data;
7b732a75
PZ
1355 struct perf_mmap_data *data;
1356 int ret = VM_FAULT_SIGBUS;
1357
1358 rcu_read_lock();
1359 data = rcu_dereference(counter->data);
1360 if (!data)
1361 goto unlock;
1362
1363 if (vmf->pgoff == 0) {
1364 vmf->page = virt_to_page(data->user_page);
1365 } else {
1366 int nr = vmf->pgoff - 1;
37d81828 1367
7b732a75
PZ
1368 if ((unsigned)nr > data->nr_pages)
1369 goto unlock;
37d81828 1370
7b732a75
PZ
1371 vmf->page = virt_to_page(data->data_pages[nr]);
1372 }
37d81828 1373 get_page(vmf->page);
7b732a75
PZ
1374 ret = 0;
1375unlock:
1376 rcu_read_unlock();
1377
1378 return ret;
1379}
1380
1381static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
1382{
1383 struct perf_mmap_data *data;
1384 unsigned long size;
1385 int i;
1386
1387 WARN_ON(atomic_read(&counter->mmap_count));
1388
1389 size = sizeof(struct perf_mmap_data);
1390 size += nr_pages * sizeof(void *);
1391
1392 data = kzalloc(size, GFP_KERNEL);
1393 if (!data)
1394 goto fail;
1395
1396 data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
1397 if (!data->user_page)
1398 goto fail_user_page;
1399
1400 for (i = 0; i < nr_pages; i++) {
1401 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
1402 if (!data->data_pages[i])
1403 goto fail_data_pages;
1404 }
1405
1406 data->nr_pages = nr_pages;
1407
1408 rcu_assign_pointer(counter->data, data);
1409
37d81828 1410 return 0;
7b732a75
PZ
1411
1412fail_data_pages:
1413 for (i--; i >= 0; i--)
1414 free_page((unsigned long)data->data_pages[i]);
1415
1416 free_page((unsigned long)data->user_page);
1417
1418fail_user_page:
1419 kfree(data);
1420
1421fail:
1422 return -ENOMEM;
1423}
1424
1425static void __perf_mmap_data_free(struct rcu_head *rcu_head)
1426{
1427 struct perf_mmap_data *data = container_of(rcu_head,
1428 struct perf_mmap_data, rcu_head);
1429 int i;
1430
1431 free_page((unsigned long)data->user_page);
1432 for (i = 0; i < data->nr_pages; i++)
1433 free_page((unsigned long)data->data_pages[i]);
1434 kfree(data);
1435}
1436
1437static void perf_mmap_data_free(struct perf_counter *counter)
1438{
1439 struct perf_mmap_data *data = counter->data;
1440
1441 WARN_ON(atomic_read(&counter->mmap_count));
1442
1443 rcu_assign_pointer(counter->data, NULL);
1444 call_rcu(&data->rcu_head, __perf_mmap_data_free);
1445}
1446
1447static void perf_mmap_open(struct vm_area_struct *vma)
1448{
1449 struct perf_counter *counter = vma->vm_file->private_data;
1450
1451 atomic_inc(&counter->mmap_count);
1452}
1453
1454static void perf_mmap_close(struct vm_area_struct *vma)
1455{
1456 struct perf_counter *counter = vma->vm_file->private_data;
1457
1458 if (atomic_dec_and_mutex_lock(&counter->mmap_count,
1459 &counter->mmap_mutex)) {
ebb3c4c4 1460 vma->vm_mm->locked_vm -= counter->data->nr_pages + 1;
7b732a75
PZ
1461 perf_mmap_data_free(counter);
1462 mutex_unlock(&counter->mmap_mutex);
1463 }
37d81828
PM
1464}
1465
1466static struct vm_operations_struct perf_mmap_vmops = {
ebb3c4c4 1467 .open = perf_mmap_open,
7b732a75 1468 .close = perf_mmap_close,
37d81828
PM
1469 .fault = perf_mmap_fault,
1470};
1471
1472static int perf_mmap(struct file *file, struct vm_area_struct *vma)
1473{
1474 struct perf_counter *counter = file->private_data;
7b732a75
PZ
1475 unsigned long vma_size;
1476 unsigned long nr_pages;
1477 unsigned long locked, lock_limit;
1478 int ret = 0;
37d81828
PM
1479
1480 if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
1481 return -EINVAL;
7b732a75
PZ
1482
1483 vma_size = vma->vm_end - vma->vm_start;
1484 nr_pages = (vma_size / PAGE_SIZE) - 1;
1485
7730d865
PZ
1486 /*
1487 * If we have data pages ensure they're a power-of-two number, so we
1488 * can do bitmasks instead of modulo.
1489 */
1490 if (nr_pages != 0 && !is_power_of_2(nr_pages))
37d81828
PM
1491 return -EINVAL;
1492
7b732a75 1493 if (vma_size != PAGE_SIZE * (1 + nr_pages))
37d81828
PM
1494 return -EINVAL;
1495
7b732a75
PZ
1496 if (vma->vm_pgoff != 0)
1497 return -EINVAL;
37d81828 1498
ebb3c4c4
PZ
1499 mutex_lock(&counter->mmap_mutex);
1500 if (atomic_inc_not_zero(&counter->mmap_count)) {
1501 if (nr_pages != counter->data->nr_pages)
1502 ret = -EINVAL;
1503 goto unlock;
1504 }
1505
1506 locked = vma->vm_mm->locked_vm;
1507 locked += nr_pages + 1;
7b732a75
PZ
1508
1509 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
1510 lock_limit >>= PAGE_SHIFT;
1511
ebb3c4c4
PZ
1512 if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
1513 ret = -EPERM;
1514 goto unlock;
1515 }
7b732a75
PZ
1516
1517 WARN_ON(counter->data);
1518 ret = perf_mmap_data_alloc(counter, nr_pages);
ebb3c4c4
PZ
1519 if (ret)
1520 goto unlock;
1521
1522 atomic_set(&counter->mmap_count, 1);
1523 vma->vm_mm->locked_vm += nr_pages + 1;
1524unlock:
7b732a75 1525 mutex_unlock(&counter->mmap_mutex);
37d81828
PM
1526
1527 vma->vm_flags &= ~VM_MAYWRITE;
1528 vma->vm_flags |= VM_RESERVED;
1529 vma->vm_ops = &perf_mmap_vmops;
7b732a75
PZ
1530
1531 return ret;
37d81828
PM
1532}
1533
3c446b3d
PZ
1534static int perf_fasync(int fd, struct file *filp, int on)
1535{
1536 struct perf_counter *counter = filp->private_data;
1537 struct inode *inode = filp->f_path.dentry->d_inode;
1538 int retval;
1539
1540 mutex_lock(&inode->i_mutex);
1541 retval = fasync_helper(fd, filp, on, &counter->fasync);
1542 mutex_unlock(&inode->i_mutex);
1543
1544 if (retval < 0)
1545 return retval;
1546
1547 return 0;
1548}
1549
0793a61d
TG
1550static const struct file_operations perf_fops = {
1551 .release = perf_release,
1552 .read = perf_read,
1553 .poll = perf_poll,
d859e29f
PM
1554 .unlocked_ioctl = perf_ioctl,
1555 .compat_ioctl = perf_ioctl,
37d81828 1556 .mmap = perf_mmap,
3c446b3d 1557 .fasync = perf_fasync,
0793a61d
TG
1558};
1559
925d519a
PZ
1560/*
1561 * Perf counter wakeup
1562 *
1563 * If there's data, ensure we set the poll() state and publish everything
1564 * to user-space before waking everybody up.
1565 */
1566
1567void perf_counter_wakeup(struct perf_counter *counter)
1568{
1569 struct perf_mmap_data *data;
1570
1571 rcu_read_lock();
1572 data = rcu_dereference(counter->data);
1573 if (data) {
3c446b3d 1574 atomic_set(&data->wakeup, POLL_IN);
38ff667b
PZ
1575 /*
1576 * Ensure all data writes are issued before updating the
1577 * user-space data head information. The matching rmb()
1578 * will be in userspace after reading this value.
1579 */
1580 smp_wmb();
1581 data->user_page->data_head = atomic_read(&data->head);
925d519a
PZ
1582 }
1583 rcu_read_unlock();
1584
1585 wake_up_all(&counter->waitq);
4c9e2542
PZ
1586
1587 if (counter->pending_kill) {
1588 kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
1589 counter->pending_kill = 0;
1590 }
925d519a
PZ
1591}
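/*
 * Editor's sketch (not part of the original file): the consumer side of
 * the data_head publication above. Once poll() reports the fd ready,
 * user space reads data_head and issues the read barrier that pairs
 * with the smp_wmb() above before touching the ring buffer; 'tail' is
 * the consumer's own bookkeeping and consume_record() is a hypothetical
 * helper:
 *
 *	u32 head = pg->data_head;
 *	rmb();
 *	while (tail != head)
 *		tail += consume_record(data, tail);
 */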
1592
1593/*
1594 * Pending wakeups
1595 *
1596 * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
1597 *
1598 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
1599 * single linked list and use cmpxchg() to add entries lockless.
1600 */
1601
79f14641
PZ
1602static void perf_pending_counter(struct perf_pending_entry *entry)
1603{
1604 struct perf_counter *counter = container_of(entry,
1605 struct perf_counter, pending);
1606
1607 if (counter->pending_disable) {
1608 counter->pending_disable = 0;
1609 perf_counter_disable(counter);
1610 }
1611
1612 if (counter->pending_wakeup) {
1613 counter->pending_wakeup = 0;
1614 perf_counter_wakeup(counter);
1615 }
1616}
1617
671dec5d 1618#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
925d519a 1619
671dec5d 1620static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
925d519a
PZ
1621 PENDING_TAIL,
1622};
1623
671dec5d
PZ
1624static void perf_pending_queue(struct perf_pending_entry *entry,
1625 void (*func)(struct perf_pending_entry *))
925d519a 1626{
671dec5d 1627 struct perf_pending_entry **head;
925d519a 1628
671dec5d 1629 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
925d519a
PZ
1630 return;
1631
671dec5d
PZ
1632 entry->func = func;
1633
1634 head = &get_cpu_var(perf_pending_head);
925d519a
PZ
1635
1636 do {
671dec5d
PZ
1637 entry->next = *head;
1638 } while (cmpxchg(head, entry->next, entry) != entry->next);
925d519a
PZ
1639
1640 set_perf_counter_pending();
1641
671dec5d 1642 put_cpu_var(perf_pending_head);
925d519a
PZ
1643}
1644
1645static int __perf_pending_run(void)
1646{
671dec5d 1647 struct perf_pending_entry *list;
925d519a
PZ
1648 int nr = 0;
1649
671dec5d 1650 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
925d519a 1651 while (list != PENDING_TAIL) {
671dec5d
PZ
1652 void (*func)(struct perf_pending_entry *);
1653 struct perf_pending_entry *entry = list;
925d519a
PZ
1654
1655 list = list->next;
1656
671dec5d
PZ
1657 func = entry->func;
1658 entry->next = NULL;
925d519a
PZ
1659 /*
1660 * Ensure we observe the unqueue before we issue the wakeup,
1661 * so that we won't be waiting forever.
1662 * -- see perf_not_pending().
1663 */
1664 smp_wmb();
1665
671dec5d 1666 func(entry);
925d519a
PZ
1667 nr++;
1668 }
1669
1670 return nr;
1671}
1672
1673static inline int perf_not_pending(struct perf_counter *counter)
1674{
1675 /*
1676 * If we flush on whatever cpu we run, there is a chance we don't
1677 * need to wait.
1678 */
1679 get_cpu();
1680 __perf_pending_run();
1681 put_cpu();
1682
1683 /*
1684 * Ensure we see the proper queue state before going to sleep
1685 * so that we do not miss the wakeup. -- see perf_pending_handle()
1686 */
1687 smp_rmb();
671dec5d 1688 return counter->pending.next == NULL;
925d519a
PZ
1689}
1690
1691static void perf_pending_sync(struct perf_counter *counter)
1692{
1693 wait_event(counter->waitq, perf_not_pending(counter));
1694}
1695
1696void perf_counter_do_pending(void)
1697{
1698 __perf_pending_run();
1699}
1700
394ee076
PZ
1701/*
1702 * Callchain support -- arch specific
1703 */
1704
9c03d88e 1705__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
394ee076
PZ
1706{
1707 return NULL;
1708}
1709
0322cd6e
PZ
1710/*
1711 * Output
1712 */
1713
b9cacc7b
PZ
1714struct perf_output_handle {
1715 struct perf_counter *counter;
1716 struct perf_mmap_data *data;
1717 unsigned int offset;
63e35b25 1718 unsigned int head;
b9cacc7b 1719 int wakeup;
78d613eb 1720 int nmi;
4c9e2542 1721 int overflow;
b9cacc7b
PZ
1722};
1723
78d613eb
PZ
1724static inline void __perf_output_wakeup(struct perf_output_handle *handle)
1725{
671dec5d 1726 if (handle->nmi) {
79f14641 1727 handle->counter->pending_wakeup = 1;
671dec5d 1728 perf_pending_queue(&handle->counter->pending,
79f14641 1729 perf_pending_counter);
671dec5d 1730 } else
78d613eb
PZ
1731 perf_counter_wakeup(handle->counter);
1732}
1733
b9cacc7b 1734static int perf_output_begin(struct perf_output_handle *handle,
78d613eb 1735 struct perf_counter *counter, unsigned int size,
4c9e2542 1736 int nmi, int overflow)
0322cd6e 1737{
7b732a75 1738 struct perf_mmap_data *data;
b9cacc7b 1739 unsigned int offset, head;
0322cd6e 1740
7b732a75 1741 rcu_read_lock();
7b732a75
PZ
1742 data = rcu_dereference(counter->data);
1743 if (!data)
1744 goto out;
1745
4c9e2542
PZ
1746 handle->counter = counter;
1747 handle->nmi = nmi;
1748 handle->overflow = overflow;
78d613eb 1749
7b732a75 1750 if (!data->nr_pages)
78d613eb 1751 goto fail;
7b732a75 1752
7b732a75
PZ
1753 do {
1754 offset = head = atomic_read(&data->head);
c7138f37 1755 head += size;
7b732a75
PZ
1756 } while (atomic_cmpxchg(&data->head, offset, head) != offset);
1757
b9cacc7b
PZ
1758 handle->data = data;
1759 handle->offset = offset;
63e35b25 1760 handle->head = head;
b9cacc7b 1761 handle->wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT);
0322cd6e 1762
b9cacc7b 1763 return 0;
7b732a75 1764
78d613eb
PZ
1765fail:
1766 __perf_output_wakeup(handle);
b9cacc7b
PZ
1767out:
1768 rcu_read_unlock();
7b732a75 1769
b9cacc7b
PZ
1770 return -ENOSPC;
1771}
7b732a75 1772
b9cacc7b
PZ
1773static void perf_output_copy(struct perf_output_handle *handle,
1774 void *buf, unsigned int len)
1775{
1776 unsigned int pages_mask;
1777 unsigned int offset;
1778 unsigned int size;
1779 void **pages;
1780
1781 offset = handle->offset;
1782 pages_mask = handle->data->nr_pages - 1;
1783 pages = handle->data->data_pages;
1784
1785 do {
1786 unsigned int page_offset;
1787 int nr;
1788
1789 nr = (offset >> PAGE_SHIFT) & pages_mask;
1790 page_offset = offset & (PAGE_SIZE - 1);
1791 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
1792
1793 memcpy(pages[nr] + page_offset, buf, size);
1794
1795 len -= size;
1796 buf += size;
1797 offset += size;
1798 } while (len);
1799
1800 handle->offset = offset;
63e35b25
PZ
1801
1802 WARN_ON_ONCE(handle->offset > handle->head);
b9cacc7b
PZ
1803}
1804
5c148194
PZ
1805#define perf_output_put(handle, x) \
1806 perf_output_copy((handle), &(x), sizeof(x))
1807
78d613eb 1808static void perf_output_end(struct perf_output_handle *handle)
b9cacc7b 1809{
c457810a
PZ
1810 int wakeup_events = handle->counter->hw_event.wakeup_events;
1811
4c9e2542 1812 if (handle->overflow && wakeup_events) {
c457810a
PZ
1813 int events = atomic_inc_return(&handle->data->events);
1814 if (events >= wakeup_events) {
1815 atomic_sub(wakeup_events, &handle->data->events);
1816 __perf_output_wakeup(handle);
1817 }
1818 } else if (handle->wakeup)
78d613eb 1819 __perf_output_wakeup(handle);
7b732a75 1820 rcu_read_unlock();
b9cacc7b
PZ
1821}
1822
f6c7d5fe 1823static void perf_counter_output(struct perf_counter *counter,
78f13e95 1824 int nmi, struct pt_regs *regs, u64 addr)
7b732a75 1825{
5ed00415 1826 int ret;
8a057d84 1827 u64 record_type = counter->hw_event.record_type;
5ed00415
PZ
1828 struct perf_output_handle handle;
1829 struct perf_event_header header;
1830 u64 ip;
5c148194 1831 struct {
ea5d20cf 1832 u32 pid, tid;
5ed00415 1833 } tid_entry;
8a057d84
PZ
1834 struct {
1835 u64 event;
1836 u64 counter;
1837 } group_entry;
394ee076
PZ
1838 struct perf_callchain_entry *callchain = NULL;
1839 int callchain_size = 0;
339f7c90 1840 u64 time;
7b732a75 1841
6b6e5486 1842 header.type = 0;
5ed00415 1843 header.size = sizeof(header);
7b732a75 1844
6b6e5486
PZ
1845 header.misc = PERF_EVENT_MISC_OVERFLOW;
1846 header.misc |= user_mode(regs) ?
6fab0192
PZ
1847 PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL;
1848
8a057d84
PZ
1849 if (record_type & PERF_RECORD_IP) {
1850 ip = instruction_pointer(regs);
6b6e5486 1851 header.type |= PERF_RECORD_IP;
8a057d84
PZ
1852 header.size += sizeof(ip);
1853 }
ea5d20cf 1854
8a057d84 1855 if (record_type & PERF_RECORD_TID) {
ea5d20cf 1856 /* namespace issues */
5ed00415
PZ
1857 tid_entry.pid = current->group_leader->pid;
1858 tid_entry.tid = current->pid;
1859
6b6e5486 1860 header.type |= PERF_RECORD_TID;
5ed00415
PZ
1861 header.size += sizeof(tid_entry);
1862 }
1863
4d855457
PZ
1864 if (record_type & PERF_RECORD_TIME) {
1865 /*
1866 * Maybe do better on x86 and provide cpu_clock_nmi()
1867 */
1868 time = sched_clock();
1869
1870 header.type |= PERF_RECORD_TIME;
1871 header.size += sizeof(u64);
1872 }
1873
78f13e95
PZ
1874 if (record_type & PERF_RECORD_ADDR) {
1875 header.type |= PERF_RECORD_ADDR;
1876 header.size += sizeof(u64);
1877 }
1878
8a057d84 1879 if (record_type & PERF_RECORD_GROUP) {
6b6e5486 1880 header.type |= PERF_RECORD_GROUP;
8a057d84
PZ
1881 header.size += sizeof(u64) +
1882 counter->nr_siblings * sizeof(group_entry);
1883 }
1884
1885 if (record_type & PERF_RECORD_CALLCHAIN) {
394ee076
PZ
1886 callchain = perf_callchain(regs);
1887
1888 if (callchain) {
9c03d88e 1889 callchain_size = (1 + callchain->nr) * sizeof(u64);
394ee076 1890
6b6e5486 1891 header.type |= PERF_RECORD_CALLCHAIN;
394ee076
PZ
1892 header.size += callchain_size;
1893 }
1894 }
1895
4c9e2542 1896 ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
5ed00415
PZ
1897 if (ret)
1898 return;
ea5d20cf 1899
5ed00415 1900 perf_output_put(&handle, header);
5c148194 1901
8a057d84
PZ
1902 if (record_type & PERF_RECORD_IP)
1903 perf_output_put(&handle, ip);
5c148194 1904
8a057d84
PZ
1905 if (record_type & PERF_RECORD_TID)
1906 perf_output_put(&handle, tid_entry);
5c148194 1907
4d855457
PZ
1908 if (record_type & PERF_RECORD_TIME)
1909 perf_output_put(&handle, time);
1910
78f13e95
PZ
1911 if (record_type & PERF_RECORD_ADDR)
1912 perf_output_put(&handle, addr);
1913
8a057d84
PZ
1914 if (record_type & PERF_RECORD_GROUP) {
1915 struct perf_counter *leader, *sub;
1916 u64 nr = counter->nr_siblings;
5c148194 1917
8a057d84 1918 perf_output_put(&handle, nr);
0322cd6e 1919
8a057d84
PZ
1920 leader = counter->group_leader;
1921 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
1922 if (sub != counter)
1923 sub->hw_ops->read(sub);
7b732a75 1924
8a057d84
PZ
1925 group_entry.event = sub->hw_event.config;
1926 group_entry.counter = atomic64_read(&sub->count);
7b732a75 1927
8a057d84
PZ
1928 perf_output_put(&handle, group_entry);
1929 }
0322cd6e 1930 }
5c148194 1931
8a057d84
PZ
1932 if (callchain)
1933 perf_output_copy(&handle, callchain, callchain_size);
0322cd6e 1934
8a057d84 1935 perf_output_end(&handle);
0322cd6e
PZ
1936}
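/*
 * Illustrative sketch (not kernel code): because the fields are copied into
 * the mmap()ed buffer in exactly the order emitted above, a consumer decoding
 * one overflow record sees, depending on which record_type bits are set in
 * header.type:
 *
 *	struct perf_event_header header;
 *	u64	ip;					PERF_RECORD_IP
 *	u32	pid, tid;				PERF_RECORD_TID
 *	u64	time;					PERF_RECORD_TIME
 *	u64	addr;					PERF_RECORD_ADDR
 *	u64	nr; struct { u64 event, counter; } [nr];	PERF_RECORD_GROUP
 *	u64	nr; u64 ips[nr];			PERF_RECORD_CALLCHAIN
 */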
1937
8d1b2d93
PZ
1938/*
1939 * comm tracking
1940 */
1941
1942struct perf_comm_event {
1943 struct task_struct *task;
1944 char *comm;
1945 int comm_size;
1946
1947 struct {
1948 struct perf_event_header header;
1949
1950 u32 pid;
1951 u32 tid;
1952 } event;
1953};
1954
1955static void perf_counter_comm_output(struct perf_counter *counter,
1956 struct perf_comm_event *comm_event)
1957{
1958 struct perf_output_handle handle;
1959 int size = comm_event->event.header.size;
1960 int ret = perf_output_begin(&handle, counter, size, 0, 0);
1961
1962 if (ret)
1963 return;
1964
1965 perf_output_put(&handle, comm_event->event);
1966 perf_output_copy(&handle, comm_event->comm,
1967 comm_event->comm_size);
1968 perf_output_end(&handle);
1969}
1970
1971static int perf_counter_comm_match(struct perf_counter *counter,
1972 struct perf_comm_event *comm_event)
1973{
1974 if (counter->hw_event.comm &&
1975 comm_event->event.header.type == PERF_EVENT_COMM)
1976 return 1;
1977
1978 return 0;
1979}
1980
1981static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
1982 struct perf_comm_event *comm_event)
1983{
1984 struct perf_counter *counter;
1985
1986 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
1987 return;
1988
1989 rcu_read_lock();
1990 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
1991 if (perf_counter_comm_match(counter, comm_event))
1992 perf_counter_comm_output(counter, comm_event);
1993 }
1994 rcu_read_unlock();
1995}
1996
1997static void perf_counter_comm_event(struct perf_comm_event *comm_event)
1998{
1999 struct perf_cpu_context *cpuctx;
2000 unsigned int size;
2001 char *comm = comm_event->task->comm;
2002
888fcee0 2003 size = ALIGN(strlen(comm)+1, sizeof(u64));
8d1b2d93
PZ
2004
2005 comm_event->comm = comm;
2006 comm_event->comm_size = size;
2007
2008 comm_event->event.header.size = sizeof(comm_event->event) + size;
2009
2010 cpuctx = &get_cpu_var(perf_cpu_context);
2011 perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
2012 put_cpu_var(perf_cpu_context);
2013
2014 perf_counter_comm_ctx(&current->perf_counter_ctx, comm_event);
2015}
2016
2017void perf_counter_comm(struct task_struct *task)
2018{
9ee318a7
PZ
2019 struct perf_comm_event comm_event;
2020
2021 if (!atomic_read(&nr_comm_tracking))
2022 return;
2023
2024 comm_event = (struct perf_comm_event){
8d1b2d93
PZ
2025 .task = task,
2026 .event = {
2027 .header = { .type = PERF_EVENT_COMM, },
2028 .pid = task->group_leader->pid,
2029 .tid = task->pid,
2030 },
2031 };
2032
2033 perf_counter_comm_event(&comm_event);
2034}
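/*
 * A minimal sketch of the intended call site for perf_counter_comm() (the
 * real caller lives in the exec/prctl comm-setting path outside this file;
 * the function below is only illustrative):
 *
 *	void example_set_comm(struct task_struct *tsk, const char *name)
 *	{
 *		strlcpy(tsk->comm, name, sizeof(tsk->comm));
 *		perf_counter_comm(tsk);
 *	}
 *
 * The hook is cheap when unused: nr_comm_tracking is only non-zero if at
 * least one counter was created with hw_event.comm set, and
 * perf_counter_comm() bails out immediately otherwise.
 */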
2035
0a4a9391
PZ
2036/*
2037 * mmap tracking
2038 */
2039
2040struct perf_mmap_event {
2041 struct file *file;
2042 char *file_name;
2043 int file_size;
2044
2045 struct {
2046 struct perf_event_header header;
2047
2048 u32 pid;
2049 u32 tid;
2050 u64 start;
2051 u64 len;
2052 u64 pgoff;
2053 } event;
2054};
2055
2056static void perf_counter_mmap_output(struct perf_counter *counter,
2057 struct perf_mmap_event *mmap_event)
2058{
2059 struct perf_output_handle handle;
2060 int size = mmap_event->event.header.size;
4c9e2542 2061 int ret = perf_output_begin(&handle, counter, size, 0, 0);
0a4a9391
PZ
2062
2063 if (ret)
2064 return;
2065
2066 perf_output_put(&handle, mmap_event->event);
2067 perf_output_copy(&handle, mmap_event->file_name,
2068 mmap_event->file_size);
78d613eb 2069 perf_output_end(&handle);
0a4a9391
PZ
2070}
2071
2072static int perf_counter_mmap_match(struct perf_counter *counter,
2073 struct perf_mmap_event *mmap_event)
2074{
2075 if (counter->hw_event.mmap &&
2076 mmap_event->event.header.type == PERF_EVENT_MMAP)
2077 return 1;
2078
2079 if (counter->hw_event.munmap &&
2080 mmap_event->event.header.type == PERF_EVENT_MUNMAP)
2081 return 1;
2082
2083 return 0;
2084}
2085
2086static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
2087 struct perf_mmap_event *mmap_event)
2088{
2089 struct perf_counter *counter;
2090
2091 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2092 return;
2093
2094 rcu_read_lock();
2095 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2096 if (perf_counter_mmap_match(counter, mmap_event))
2097 perf_counter_mmap_output(counter, mmap_event);
2098 }
2099 rcu_read_unlock();
2100}
2101
2102static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
2103{
2104 struct perf_cpu_context *cpuctx;
2105 struct file *file = mmap_event->file;
2106 unsigned int size;
2107 char tmp[16];
2108 char *buf = NULL;
2109 char *name;
2110
2111 if (file) {
2112 buf = kzalloc(PATH_MAX, GFP_KERNEL);
2113 if (!buf) {
2114 name = strncpy(tmp, "//enomem", sizeof(tmp));
2115 goto got_name;
2116 }
2117 name = dentry_path(file->f_dentry, buf, PATH_MAX);
2118 if (IS_ERR(name)) {
2119 name = strncpy(tmp, "//toolong", sizeof(tmp));
2120 goto got_name;
2121 }
2122 } else {
2123 name = strncpy(tmp, "//anon", sizeof(tmp));
2124 goto got_name;
2125 }
2126
2127got_name:
888fcee0 2128 size = ALIGN(strlen(name)+1, sizeof(u64));
0a4a9391
PZ
2129
2130 mmap_event->file_name = name;
2131 mmap_event->file_size = size;
2132
2133 mmap_event->event.header.size = sizeof(mmap_event->event) + size;
2134
2135 cpuctx = &get_cpu_var(perf_cpu_context);
2136 perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
2137 put_cpu_var(perf_cpu_context);
2138
2139 perf_counter_mmap_ctx(&current->perf_counter_ctx, mmap_event);
2140
2141 kfree(buf);
2142}
2143
2144void perf_counter_mmap(unsigned long addr, unsigned long len,
2145 unsigned long pgoff, struct file *file)
2146{
9ee318a7
PZ
2147 struct perf_mmap_event mmap_event;
2148
2149 if (!atomic_read(&nr_mmap_tracking))
2150 return;
2151
2152 mmap_event = (struct perf_mmap_event){
0a4a9391
PZ
2153 .file = file,
2154 .event = {
2155 .header = { .type = PERF_EVENT_MMAP, },
2156 .pid = current->group_leader->pid,
2157 .tid = current->pid,
2158 .start = addr,
2159 .len = len,
2160 .pgoff = pgoff,
2161 },
2162 };
2163
2164 perf_counter_mmap_event(&mmap_event);
2165}
2166
2167void perf_counter_munmap(unsigned long addr, unsigned long len,
2168 unsigned long pgoff, struct file *file)
2169{
9ee318a7
PZ
2170 struct perf_mmap_event mmap_event;
2171
2172 if (!atomic_read(&nr_munmap_tracking))
2173 return;
2174
2175 mmap_event = (struct perf_mmap_event){
0a4a9391
PZ
2176 .file = file,
2177 .event = {
2178 .header = { .type = PERF_EVENT_MUNMAP, },
2179 .pid = current->group_leader->pid,
2180 .tid = current->pid,
2181 .start = addr,
2182 .len = len,
2183 .pgoff = pgoff,
2184 },
2185 };
2186
2187 perf_counter_mmap_event(&mmap_event);
2188}
2189
f6c7d5fe
PZ
2190/*
2191 * Generic counter overflow handling.
2192 */
2193
2194int perf_counter_overflow(struct perf_counter *counter,
78f13e95 2195 int nmi, struct pt_regs *regs, u64 addr)
f6c7d5fe 2196{
79f14641
PZ
2197 int events = atomic_read(&counter->event_limit);
2198 int ret = 0;
2199
4c9e2542 2200 counter->pending_kill = POLL_IN;
79f14641
PZ
2201 if (events && atomic_dec_and_test(&counter->event_limit)) {
2202 ret = 1;
4c9e2542 2203 counter->pending_kill = POLL_HUP;
79f14641
PZ
2204 if (nmi) {
2205 counter->pending_disable = 1;
2206 perf_pending_queue(&counter->pending,
2207 perf_pending_counter);
2208 } else
2209 perf_counter_disable(counter);
2210 }
2211
78f13e95 2212 perf_counter_output(counter, nmi, regs, addr);
79f14641 2213 return ret;
f6c7d5fe
PZ
2214}
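/*
 * Worked example of the event_limit logic above: with counter->event_limit
 * initially 2, the first overflow still returns 0 (the limit drops 2 -> 1);
 * the second one hits zero, sets pending_kill to POLL_HUP, disables the
 * counter (directly, or via the pending queue when called from NMI context)
 * and returns 1 so the caller can soft-disable the counter.  The overflow
 * record itself is written out in both cases.
 */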
2215
15dbf27c
PZ
2216/*
2217 * Generic software counter infrastructure
2218 */
2219
2220static void perf_swcounter_update(struct perf_counter *counter)
2221{
2222 struct hw_perf_counter *hwc = &counter->hw;
2223 u64 prev, now;
2224 s64 delta;
2225
2226again:
2227 prev = atomic64_read(&hwc->prev_count);
2228 now = atomic64_read(&hwc->count);
2229 if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
2230 goto again;
2231
2232 delta = now - prev;
2233
2234 atomic64_add(delta, &counter->count);
2235 atomic64_sub(delta, &hwc->period_left);
2236}
2237
2238static void perf_swcounter_set_period(struct perf_counter *counter)
2239{
2240 struct hw_perf_counter *hwc = &counter->hw;
2241 s64 left = atomic64_read(&hwc->period_left);
2242 s64 period = hwc->irq_period;
2243
2244 if (unlikely(left <= -period)) {
2245 left = period;
2246 atomic64_set(&hwc->period_left, left);
2247 }
2248
2249 if (unlikely(left <= 0)) {
2250 left += period;
2251 atomic64_add(period, &hwc->period_left);
2252 }
2253
2254 atomic64_set(&hwc->prev_count, -left);
2255 atomic64_set(&hwc->count, -left);
2256}
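/*
 * Worked example of the period arithmetic above, assuming irq_period == 100:
 * if the previous period overshot by 30 events, perf_swcounter_update() has
 * left period_left at -30, so here left becomes -30 + 100 = 70 and both
 * prev_count and count are set to -70.  perf_swcounter_add() then triggers
 * the next overflow once 70 more events push the count back up to zero
 * (atomic64_add_negative() returning false).
 */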
2257
d6d020e9
PZ
2258static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
2259{
f6c7d5fe 2260 enum hrtimer_restart ret = HRTIMER_RESTART;
d6d020e9
PZ
2261 struct perf_counter *counter;
2262 struct pt_regs *regs;
2263
2264 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
2265 counter->hw_ops->read(counter);
2266
2267 regs = get_irq_regs();
2268 /*
2269 * In case we exclude kernel IPs or are somehow not in interrupt
2270 * context, provide the next best thing, the user IP.
2271 */
2272 if ((counter->hw_event.exclude_kernel || !regs) &&
2273 !counter->hw_event.exclude_user)
2274 regs = task_pt_regs(current);
2275
f6c7d5fe 2276 if (regs) {
78f13e95 2277 if (perf_counter_overflow(counter, 0, regs, 0))
f6c7d5fe
PZ
2278 ret = HRTIMER_NORESTART;
2279 }
d6d020e9
PZ
2280
2281 hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period));
2282
f6c7d5fe 2283 return ret;
d6d020e9
PZ
2284}
2285
2286static void perf_swcounter_overflow(struct perf_counter *counter,
78f13e95 2287 int nmi, struct pt_regs *regs, u64 addr)
d6d020e9 2288{
b8e83514
PZ
2289 perf_swcounter_update(counter);
2290 perf_swcounter_set_period(counter);
78f13e95 2291 if (perf_counter_overflow(counter, nmi, regs, addr))
f6c7d5fe
PZ
2292 /* soft-disable the counter */
2293 ;
2294
d6d020e9
PZ
2295}
2296
15dbf27c 2297static int perf_swcounter_match(struct perf_counter *counter,
b8e83514
PZ
2298 enum perf_event_types type,
2299 u32 event, struct pt_regs *regs)
15dbf27c
PZ
2300{
2301 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
2302 return 0;
2303
f4a2deb4 2304 if (perf_event_raw(&counter->hw_event))
b8e83514
PZ
2305 return 0;
2306
f4a2deb4 2307 if (perf_event_type(&counter->hw_event) != type)
15dbf27c
PZ
2308 return 0;
2309
f4a2deb4 2310 if (perf_event_id(&counter->hw_event) != event)
15dbf27c
PZ
2311 return 0;
2312
2313 if (counter->hw_event.exclude_user && user_mode(regs))
2314 return 0;
2315
2316 if (counter->hw_event.exclude_kernel && !user_mode(regs))
2317 return 0;
2318
2319 return 1;
2320}
2321
d6d020e9 2322static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
78f13e95 2323 int nmi, struct pt_regs *regs, u64 addr)
d6d020e9
PZ
2324{
2325 int neg = atomic64_add_negative(nr, &counter->hw.count);
2326 if (counter->hw.irq_period && !neg)
78f13e95 2327 perf_swcounter_overflow(counter, nmi, regs, addr);
d6d020e9
PZ
2328}
2329
15dbf27c 2330static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
b8e83514 2331 enum perf_event_types type, u32 event,
78f13e95
PZ
2332 u64 nr, int nmi, struct pt_regs *regs,
2333 u64 addr)
15dbf27c
PZ
2334{
2335 struct perf_counter *counter;
15dbf27c 2336
01ef09d9 2337 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
15dbf27c
PZ
2338 return;
2339
592903cd
PZ
2340 rcu_read_lock();
2341 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
b8e83514 2342 if (perf_swcounter_match(counter, type, event, regs))
78f13e95 2343 perf_swcounter_add(counter, nr, nmi, regs, addr);
15dbf27c 2344 }
592903cd 2345 rcu_read_unlock();
15dbf27c
PZ
2346}
2347
96f6d444
PZ
2348static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
2349{
2350 if (in_nmi())
2351 return &cpuctx->recursion[3];
2352
2353 if (in_irq())
2354 return &cpuctx->recursion[2];
2355
2356 if (in_softirq())
2357 return &cpuctx->recursion[1];
2358
2359 return &cpuctx->recursion[0];
2360}
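/*
 * The four recursion slots map one-to-one onto execution contexts:
 * [0] task, [1] softirq, [2] hardirq, [3] NMI.  __perf_swcounter_event()
 * below uses them so that a software counter event raised while another one
 * is already being processed in the same context is dropped rather than
 * recursing; an event from a higher context (e.g. an NMI interrupting
 * softirq processing) still gets through because it uses a different slot.
 */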
2361
b8e83514 2362static void __perf_swcounter_event(enum perf_event_types type, u32 event,
78f13e95
PZ
2363 u64 nr, int nmi, struct pt_regs *regs,
2364 u64 addr)
15dbf27c
PZ
2365{
2366 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
96f6d444
PZ
2367 int *recursion = perf_swcounter_recursion_context(cpuctx);
2368
2369 if (*recursion)
2370 goto out;
2371
2372 (*recursion)++;
2373 barrier();
15dbf27c 2374
78f13e95
PZ
2375 perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
2376 nr, nmi, regs, addr);
b8e83514
PZ
2377 if (cpuctx->task_ctx) {
2378 perf_swcounter_ctx_event(cpuctx->task_ctx, type, event,
78f13e95 2379 nr, nmi, regs, addr);
b8e83514 2380 }
15dbf27c 2381
96f6d444
PZ
2382 barrier();
2383 (*recursion)--;
2384
2385out:
15dbf27c
PZ
2386 put_cpu_var(perf_cpu_context);
2387}
2388
78f13e95
PZ
2389void
2390perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
b8e83514 2391{
78f13e95 2392 __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr);
b8e83514
PZ
2393}
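/*
 * A minimal sketch of a call site for the hook above (the real hooks live in
 * e.g. the fault and scheduler paths outside this file; the variable names
 * are only illustrative):
 *
 *	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address);
 *
 * i.e. one PERF_COUNT_PAGE_FAULTS event, not in NMI context, with the
 * faulting address passed through as the addr field of the record.
 */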
2394
15dbf27c
PZ
2395static void perf_swcounter_read(struct perf_counter *counter)
2396{
2397 perf_swcounter_update(counter);
2398}
2399
2400static int perf_swcounter_enable(struct perf_counter *counter)
2401{
2402 perf_swcounter_set_period(counter);
2403 return 0;
2404}
2405
2406static void perf_swcounter_disable(struct perf_counter *counter)
2407{
2408 perf_swcounter_update(counter);
2409}
2410
ac17dc8e
PZ
2411static const struct hw_perf_counter_ops perf_ops_generic = {
2412 .enable = perf_swcounter_enable,
2413 .disable = perf_swcounter_disable,
2414 .read = perf_swcounter_read,
2415};
2416
15dbf27c
PZ
2417/*
2418 * Software counter: cpu wall time clock
2419 */
2420
9abf8a08
PM
2421static void cpu_clock_perf_counter_update(struct perf_counter *counter)
2422{
2423 int cpu = raw_smp_processor_id();
2424 s64 prev;
2425 u64 now;
2426
2427 now = cpu_clock(cpu);
2428 prev = atomic64_read(&counter->hw.prev_count);
2429 atomic64_set(&counter->hw.prev_count, now);
2430 atomic64_add(now - prev, &counter->count);
2431}
2432
d6d020e9
PZ
2433static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
2434{
2435 struct hw_perf_counter *hwc = &counter->hw;
2436 int cpu = raw_smp_processor_id();
2437
2438 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
039fc91e
PZ
2439 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2440 hwc->hrtimer.function = perf_swcounter_hrtimer;
d6d020e9 2441 if (hwc->irq_period) {
d6d020e9
PZ
2442 __hrtimer_start_range_ns(&hwc->hrtimer,
2443 ns_to_ktime(hwc->irq_period), 0,
2444 HRTIMER_MODE_REL, 0);
2445 }
2446
2447 return 0;
2448}
2449
5c92d124
IM
2450static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
2451{
d6d020e9 2452 hrtimer_cancel(&counter->hw.hrtimer);
9abf8a08 2453 cpu_clock_perf_counter_update(counter);
5c92d124
IM
2454}
2455
2456static void cpu_clock_perf_counter_read(struct perf_counter *counter)
2457{
9abf8a08 2458 cpu_clock_perf_counter_update(counter);
5c92d124
IM
2459}
2460
2461static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
7671581f
IM
2462 .enable = cpu_clock_perf_counter_enable,
2463 .disable = cpu_clock_perf_counter_disable,
2464 .read = cpu_clock_perf_counter_read,
5c92d124
IM
2465};
2466
15dbf27c
PZ
2467/*
2468 * Software counter: task time clock
2469 */
2470
e30e08f6 2471static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
aa9c4c0f 2472{
e30e08f6 2473 u64 prev;
8cb391e8
IM
2474 s64 delta;
2475
a39d6f25 2476 prev = atomic64_xchg(&counter->hw.prev_count, now);
8cb391e8 2477 delta = now - prev;
8cb391e8 2478 atomic64_add(delta, &counter->count);
bae43c99
IM
2479}
2480
95cdd2e7 2481static int task_clock_perf_counter_enable(struct perf_counter *counter)
8cb391e8 2482{
d6d020e9 2483 struct hw_perf_counter *hwc = &counter->hw;
a39d6f25
PZ
2484 u64 now;
2485
a39d6f25 2486 now = counter->ctx->time;
d6d020e9 2487
a39d6f25 2488 atomic64_set(&hwc->prev_count, now);
039fc91e
PZ
2489 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2490 hwc->hrtimer.function = perf_swcounter_hrtimer;
d6d020e9 2491 if (hwc->irq_period) {
d6d020e9
PZ
2492 __hrtimer_start_range_ns(&hwc->hrtimer,
2493 ns_to_ktime(hwc->irq_period), 0,
2494 HRTIMER_MODE_REL, 0);
2495 }
95cdd2e7
IM
2496
2497 return 0;
8cb391e8
IM
2498}
2499
2500static void task_clock_perf_counter_disable(struct perf_counter *counter)
bae43c99 2501{
d6d020e9 2502 hrtimer_cancel(&counter->hw.hrtimer);
e30e08f6
PZ
2503 task_clock_perf_counter_update(counter, counter->ctx->time);
2504
d6d020e9 2505}
aa9c4c0f 2506
d6d020e9
PZ
2507static void task_clock_perf_counter_read(struct perf_counter *counter)
2508{
e30e08f6
PZ
2509 u64 time;
2510
2511 if (!in_nmi()) {
2512 update_context_time(counter->ctx);
2513 time = counter->ctx->time;
2514 } else {
2515 u64 now = perf_clock();
2516 u64 delta = now - counter->ctx->timestamp;
2517 time = counter->ctx->time + delta;
2518 }
2519
2520 task_clock_perf_counter_update(counter, time);
bae43c99
IM
2521}
2522
2523static const struct hw_perf_counter_ops perf_ops_task_clock = {
7671581f
IM
2524 .enable = task_clock_perf_counter_enable,
2525 .disable = task_clock_perf_counter_disable,
2526 .read = task_clock_perf_counter_read,
bae43c99
IM
2527};
2528
15dbf27c
PZ
2529/*
2530 * Software counter: cpu migrations
2531 */
2532
23a185ca 2533static inline u64 get_cpu_migrations(struct perf_counter *counter)
6c594c21 2534{
23a185ca
PM
2535 struct task_struct *curr = counter->ctx->task;
2536
2537 if (curr)
2538 return curr->se.nr_migrations;
2539 return cpu_nr_migrations(smp_processor_id());
6c594c21
IM
2540}
2541
2542static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
2543{
2544 u64 prev, now;
2545 s64 delta;
2546
2547 prev = atomic64_read(&counter->hw.prev_count);
23a185ca 2548 now = get_cpu_migrations(counter);
6c594c21
IM
2549
2550 atomic64_set(&counter->hw.prev_count, now);
2551
2552 delta = now - prev;
6c594c21
IM
2553
2554 atomic64_add(delta, &counter->count);
2555}
2556
2557static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
2558{
2559 cpu_migrations_perf_counter_update(counter);
2560}
2561
95cdd2e7 2562static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
6c594c21 2563{
c07c99b6
PM
2564 if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
2565 atomic64_set(&counter->hw.prev_count,
2566 get_cpu_migrations(counter));
95cdd2e7 2567 return 0;
6c594c21
IM
2568}
2569
2570static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
2571{
2572 cpu_migrations_perf_counter_update(counter);
2573}
2574
2575static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
7671581f
IM
2576 .enable = cpu_migrations_perf_counter_enable,
2577 .disable = cpu_migrations_perf_counter_disable,
2578 .read = cpu_migrations_perf_counter_read,
6c594c21
IM
2579};
2580
e077df4f
PZ
2581#ifdef CONFIG_EVENT_PROFILE
2582void perf_tpcounter_event(int event_id)
2583{
b8e83514
PZ
2584 struct pt_regs *regs = get_irq_regs();
2585
2586 if (!regs)
2587 regs = task_pt_regs(current);
2588
78f13e95 2589 __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0);
e077df4f
PZ
2590}
2591
2592extern int ftrace_profile_enable(int);
2593extern void ftrace_profile_disable(int);
2594
2595static void tp_perf_counter_destroy(struct perf_counter *counter)
2596{
f4a2deb4 2597 ftrace_profile_disable(perf_event_id(&counter->hw_event));
e077df4f
PZ
2598}
2599
2600static const struct hw_perf_counter_ops *
2601tp_perf_counter_init(struct perf_counter *counter)
2602{
f4a2deb4 2603 int event_id = perf_event_id(&counter->hw_event);
e077df4f
PZ
2604 int ret;
2605
2606 ret = ftrace_profile_enable(event_id);
2607 if (ret)
2608 return NULL;
2609
2610 counter->destroy = tp_perf_counter_destroy;
b8e83514 2611 counter->hw.irq_period = counter->hw_event.irq_period;
e077df4f
PZ
2612
2613 return &perf_ops_generic;
2614}
2615#else
2616static const struct hw_perf_counter_ops *
2617tp_perf_counter_init(struct perf_counter *counter)
2618{
2619 return NULL;
2620}
2621#endif
2622
5c92d124
IM
2623static const struct hw_perf_counter_ops *
2624sw_perf_counter_init(struct perf_counter *counter)
2625{
15dbf27c 2626 struct perf_counter_hw_event *hw_event = &counter->hw_event;
5c92d124 2627 const struct hw_perf_counter_ops *hw_ops = NULL;
15dbf27c 2628 struct hw_perf_counter *hwc = &counter->hw;
5c92d124 2629
0475f9ea
PM
2630 /*
2631 * Software counters (currently) can't in general distinguish
2632 * between user, kernel and hypervisor events.
2633 * However, context switches and cpu migrations are considered
2634 * to be kernel events, and page faults are never hypervisor
2635 * events.
2636 */
f4a2deb4 2637 switch (perf_event_id(&counter->hw_event)) {
5c92d124 2638 case PERF_COUNT_CPU_CLOCK:
d6d020e9
PZ
2639 hw_ops = &perf_ops_cpu_clock;
2640
2641 if (hw_event->irq_period && hw_event->irq_period < 10000)
2642 hw_event->irq_period = 10000;
5c92d124 2643 break;
bae43c99 2644 case PERF_COUNT_TASK_CLOCK:
23a185ca
PM
2645 /*
2646 * If the user instantiates this as a per-cpu counter,
2647 * use the cpu_clock counter instead.
2648 */
2649 if (counter->ctx->task)
2650 hw_ops = &perf_ops_task_clock;
2651 else
2652 hw_ops = &perf_ops_cpu_clock;
d6d020e9
PZ
2653
2654 if (hw_event->irq_period && hw_event->irq_period < 10000)
2655 hw_event->irq_period = 10000;
bae43c99 2656 break;
e06c61a8 2657 case PERF_COUNT_PAGE_FAULTS:
ac17dc8e
PZ
2658 case PERF_COUNT_PAGE_FAULTS_MIN:
2659 case PERF_COUNT_PAGE_FAULTS_MAJ:
5d6a27d8 2660 case PERF_COUNT_CONTEXT_SWITCHES:
4a0deca6 2661 hw_ops = &perf_ops_generic;
5d6a27d8 2662 break;
6c594c21 2663 case PERF_COUNT_CPU_MIGRATIONS:
0475f9ea
PM
2664 if (!counter->hw_event.exclude_kernel)
2665 hw_ops = &perf_ops_cpu_migrations;
6c594c21 2666 break;
5c92d124 2667 }
15dbf27c
PZ
2668
2669 if (hw_ops)
2670 hwc->irq_period = hw_event->irq_period;
2671
5c92d124
IM
2672 return hw_ops;
2673}
2674
0793a61d
TG
2675/*
2676 * Allocate and initialize a counter structure
2677 */
2678static struct perf_counter *
04289bb9
IM
2679perf_counter_alloc(struct perf_counter_hw_event *hw_event,
2680 int cpu,
23a185ca 2681 struct perf_counter_context *ctx,
9b51f66d
IM
2682 struct perf_counter *group_leader,
2683 gfp_t gfpflags)
0793a61d 2684{
5c92d124 2685 const struct hw_perf_counter_ops *hw_ops;
621a01ea 2686 struct perf_counter *counter;
d5d2bc0d 2687 long err;
0793a61d 2688
9b51f66d 2689 counter = kzalloc(sizeof(*counter), gfpflags);
0793a61d 2690 if (!counter)
d5d2bc0d 2691 return ERR_PTR(-ENOMEM);
0793a61d 2692
04289bb9
IM
2693 /*
2694 * Single counters are their own group leaders, with an
2695 * empty sibling list:
2696 */
2697 if (!group_leader)
2698 group_leader = counter;
2699
0793a61d 2700 mutex_init(&counter->mutex);
04289bb9 2701 INIT_LIST_HEAD(&counter->list_entry);
592903cd 2702 INIT_LIST_HEAD(&counter->event_entry);
04289bb9 2703 INIT_LIST_HEAD(&counter->sibling_list);
0793a61d
TG
2704 init_waitqueue_head(&counter->waitq);
2705
7b732a75
PZ
2706 mutex_init(&counter->mmap_mutex);
2707
d859e29f
PM
2708 INIT_LIST_HEAD(&counter->child_list);
2709
9f66a381
IM
2710 counter->cpu = cpu;
2711 counter->hw_event = *hw_event;
04289bb9 2712 counter->group_leader = group_leader;
621a01ea 2713 counter->hw_ops = NULL;
23a185ca 2714 counter->ctx = ctx;
621a01ea 2715
235c7fc7 2716 counter->state = PERF_COUNTER_STATE_INACTIVE;
a86ed508
IM
2717 if (hw_event->disabled)
2718 counter->state = PERF_COUNTER_STATE_OFF;
2719
5c92d124 2720 hw_ops = NULL;
b8e83514 2721
f4a2deb4 2722 if (perf_event_raw(hw_event)) {
b8e83514 2723 hw_ops = hw_perf_counter_init(counter);
f4a2deb4
PZ
2724 goto done;
2725 }
2726
2727 switch (perf_event_type(hw_event)) {
b8e83514 2728 case PERF_TYPE_HARDWARE:
5c92d124 2729 hw_ops = hw_perf_counter_init(counter);
b8e83514
PZ
2730 break;
2731
2732 case PERF_TYPE_SOFTWARE:
2733 hw_ops = sw_perf_counter_init(counter);
2734 break;
2735
2736 case PERF_TYPE_TRACEPOINT:
2737 hw_ops = tp_perf_counter_init(counter);
2738 break;
2739 }
d5d2bc0d
PM
2740done:
2741 err = 0;
2742 if (!hw_ops)
2743 err = -EINVAL;
2744 else if (IS_ERR(hw_ops))
2745 err = PTR_ERR(hw_ops);
5c92d124 2746
d5d2bc0d 2747 if (err) {
621a01ea 2748 kfree(counter);
d5d2bc0d 2749 return ERR_PTR(err);
621a01ea 2750 }
d5d2bc0d 2751
621a01ea 2752 counter->hw_ops = hw_ops;
0793a61d 2753
9ee318a7
PZ
2754 if (counter->hw_event.mmap)
2755 atomic_inc(&nr_mmap_tracking);
2756 if (counter->hw_event.munmap)
2757 atomic_inc(&nr_munmap_tracking);
2758 if (counter->hw_event.comm)
2759 atomic_inc(&nr_comm_tracking);
2760
0793a61d
TG
2761 return counter;
2762}
2763
2764/**
2743a5b0 2765 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
9f66a381
IM
2766 *
2767 * @hw_event_uptr: event type attributes for monitoring/sampling
0793a61d 2768 * @pid: target pid
9f66a381
IM
2769 * @cpu: target cpu
 2770 * @group_fd: group leader counter fd (or -1 to create a new group leader)
 * @flags: reserved for future use, must currently be 0
0793a61d 2771 */
2743a5b0 2772SYSCALL_DEFINE5(perf_counter_open,
f3dfd265 2773 const struct perf_counter_hw_event __user *, hw_event_uptr,
2743a5b0 2774 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
0793a61d 2775{
04289bb9 2776 struct perf_counter *counter, *group_leader;
9f66a381 2777 struct perf_counter_hw_event hw_event;
04289bb9 2778 struct perf_counter_context *ctx;
9b51f66d 2779 struct file *counter_file = NULL;
04289bb9
IM
2780 struct file *group_file = NULL;
2781 int fput_needed = 0;
9b51f66d 2782 int fput_needed2 = 0;
0793a61d
TG
2783 int ret;
2784
2743a5b0
PM
2785 /* for future expandability... */
2786 if (flags)
2787 return -EINVAL;
2788
9f66a381 2789 if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
eab656ae
TG
2790 return -EFAULT;
2791
04289bb9 2792 /*
ccff286d
IM
2793 * Get the target context (task or percpu):
2794 */
2795 ctx = find_get_context(pid, cpu);
2796 if (IS_ERR(ctx))
2797 return PTR_ERR(ctx);
2798
2799 /*
2800 * Look up the group leader (we will attach this counter to it):
04289bb9
IM
2801 */
2802 group_leader = NULL;
2803 if (group_fd != -1) {
2804 ret = -EINVAL;
2805 group_file = fget_light(group_fd, &fput_needed);
2806 if (!group_file)
ccff286d 2807 goto err_put_context;
04289bb9 2808 if (group_file->f_op != &perf_fops)
ccff286d 2809 goto err_put_context;
04289bb9
IM
2810
2811 group_leader = group_file->private_data;
2812 /*
ccff286d
IM
2813 * Do not allow a recursive hierarchy (this new sibling
2814 * becoming part of another group-sibling):
2815 */
2816 if (group_leader->group_leader != group_leader)
2817 goto err_put_context;
2818 /*
2819 * Do not allow to attach to a group in a different
2820 * task or CPU context:
04289bb9 2821 */
ccff286d
IM
2822 if (group_leader->ctx != ctx)
2823 goto err_put_context;
3b6f9e5c
PM
2824 /*
2825 * Only a group leader can be exclusive or pinned
2826 */
2827 if (hw_event.exclusive || hw_event.pinned)
2828 goto err_put_context;
04289bb9
IM
2829 }
2830
23a185ca
PM
2831 counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
2832 GFP_KERNEL);
d5d2bc0d
PM
2833 ret = PTR_ERR(counter);
2834 if (IS_ERR(counter))
0793a61d
TG
2835 goto err_put_context;
2836
0793a61d
TG
2837 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
2838 if (ret < 0)
9b51f66d
IM
2839 goto err_free_put_context;
2840
2841 counter_file = fget_light(ret, &fput_needed2);
2842 if (!counter_file)
2843 goto err_free_put_context;
2844
2845 counter->filp = counter_file;
d859e29f 2846 mutex_lock(&ctx->mutex);
9b51f66d 2847 perf_install_in_context(ctx, counter, cpu);
d859e29f 2848 mutex_unlock(&ctx->mutex);
9b51f66d
IM
2849
2850 fput_light(counter_file, fput_needed2);
0793a61d 2851
04289bb9
IM
2852out_fput:
2853 fput_light(group_file, fput_needed);
2854
0793a61d
TG
2855 return ret;
2856
9b51f66d 2857err_free_put_context:
0793a61d
TG
2858 kfree(counter);
2859
2860err_put_context:
2861 put_context(ctx);
2862
04289bb9 2863 goto out_fput;
0793a61d
TG
2864}
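/*
 * A minimal user-space sketch of the syscall above (illustrative only; it
 * assumes __NR_perf_counter_open has been wired up for the architecture and
 * omits error handling):
 *
 *	struct perf_counter_hw_event hw_event;
 *	int fd;
 *
 *	memset(&hw_event, 0, sizeof(hw_event));
 *	hw_event.config = ...;		event selection, decoded in the kernel
 *					via perf_event_{raw,type,id}()
 *
 *	fd = syscall(__NR_perf_counter_open, &hw_event,
 *		     pid,		target pid
 *		     cpu,		target cpu
 *		     -1,		group_fd: -1 creates a new group leader
 *		     0);		flags: must currently be 0
 *
 * The returned fd can then be read, mmap()ed and poll()ed like any other
 * counter fd, and passed as group_fd to subsequent opens to build a group.
 */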
2865
9b51f66d
IM
2866/*
2867 * Initialize the perf_counter context in a task_struct:
2868 */
2869static void
2870__perf_counter_init_context(struct perf_counter_context *ctx,
2871 struct task_struct *task)
2872{
2873 memset(ctx, 0, sizeof(*ctx));
2874 spin_lock_init(&ctx->lock);
d859e29f 2875 mutex_init(&ctx->mutex);
9b51f66d 2876 INIT_LIST_HEAD(&ctx->counter_list);
592903cd 2877 INIT_LIST_HEAD(&ctx->event_list);
9b51f66d
IM
2878 ctx->task = task;
2879}
2880
2881/*
2882 * inherit a counter from parent task to child task:
2883 */
d859e29f 2884static struct perf_counter *
9b51f66d
IM
2885inherit_counter(struct perf_counter *parent_counter,
2886 struct task_struct *parent,
2887 struct perf_counter_context *parent_ctx,
2888 struct task_struct *child,
d859e29f 2889 struct perf_counter *group_leader,
9b51f66d
IM
2890 struct perf_counter_context *child_ctx)
2891{
2892 struct perf_counter *child_counter;
2893
d859e29f
PM
2894 /*
2895 * Instead of creating recursive hierarchies of counters,
2896 * we link inherited counters back to the original parent,
2897 * which is guaranteed to have a filp, and use that filp as the
2898 * reference count:
2899 */
2900 if (parent_counter->parent)
2901 parent_counter = parent_counter->parent;
2902
9b51f66d 2903 child_counter = perf_counter_alloc(&parent_counter->hw_event,
23a185ca
PM
2904 parent_counter->cpu, child_ctx,
2905 group_leader, GFP_KERNEL);
d5d2bc0d
PM
2906 if (IS_ERR(child_counter))
2907 return child_counter;
9b51f66d
IM
2908
2909 /*
2910 * Link it up in the child's context:
2911 */
9b51f66d 2912 child_counter->task = child;
53cfbf59 2913 add_counter_to_ctx(child_counter, child_ctx);
9b51f66d
IM
2914
2915 child_counter->parent = parent_counter;
9b51f66d
IM
2916 /*
2917 * inherit into child's child as well:
2918 */
2919 child_counter->hw_event.inherit = 1;
2920
2921 /*
2922 * Get a reference to the parent filp - we will fput it
2923 * when the child counter exits. This is safe to do because
2924 * we are in the parent and we know that the filp still
2925 * exists and has a nonzero count:
2926 */
2927 atomic_long_inc(&parent_counter->filp->f_count);
2928
d859e29f
PM
2929 /*
2930 * Link this into the parent counter's child list
2931 */
2932 mutex_lock(&parent_counter->mutex);
2933 list_add_tail(&child_counter->child_list, &parent_counter->child_list);
2934
2935 /*
2936 * Make the child state follow the state of the parent counter,
2937 * not its hw_event.disabled bit. We hold the parent's mutex,
2938 * so we won't race with perf_counter_{en,dis}able_family.
2939 */
2940 if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
2941 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
2942 else
2943 child_counter->state = PERF_COUNTER_STATE_OFF;
2944
2945 mutex_unlock(&parent_counter->mutex);
2946
2947 return child_counter;
2948}
2949
2950static int inherit_group(struct perf_counter *parent_counter,
2951 struct task_struct *parent,
2952 struct perf_counter_context *parent_ctx,
2953 struct task_struct *child,
2954 struct perf_counter_context *child_ctx)
2955{
2956 struct perf_counter *leader;
2957 struct perf_counter *sub;
d5d2bc0d 2958 struct perf_counter *child_ctr;
d859e29f
PM
2959
2960 leader = inherit_counter(parent_counter, parent, parent_ctx,
2961 child, NULL, child_ctx);
d5d2bc0d
PM
2962 if (IS_ERR(leader))
2963 return PTR_ERR(leader);
d859e29f 2964 list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
d5d2bc0d
PM
2965 child_ctr = inherit_counter(sub, parent, parent_ctx,
2966 child, leader, child_ctx);
2967 if (IS_ERR(child_ctr))
2968 return PTR_ERR(child_ctr);
d859e29f 2969 }
9b51f66d
IM
2970 return 0;
2971}
2972
d859e29f
PM
2973static void sync_child_counter(struct perf_counter *child_counter,
2974 struct perf_counter *parent_counter)
2975{
2976 u64 parent_val, child_val;
2977
2978 parent_val = atomic64_read(&parent_counter->count);
2979 child_val = atomic64_read(&child_counter->count);
2980
2981 /*
2982 * Add back the child's count to the parent's count:
2983 */
2984 atomic64_add(child_val, &parent_counter->count);
53cfbf59
PM
2985 atomic64_add(child_counter->total_time_enabled,
2986 &parent_counter->child_total_time_enabled);
2987 atomic64_add(child_counter->total_time_running,
2988 &parent_counter->child_total_time_running);
d859e29f
PM
2989
2990 /*
2991 * Remove this counter from the parent's list
2992 */
2993 mutex_lock(&parent_counter->mutex);
2994 list_del_init(&child_counter->child_list);
2995 mutex_unlock(&parent_counter->mutex);
2996
2997 /*
2998 * Release the parent counter, if this was the last
2999 * reference to it.
3000 */
3001 fput(parent_counter->filp);
3002}
3003
9b51f66d
IM
3004static void
3005__perf_counter_exit_task(struct task_struct *child,
3006 struct perf_counter *child_counter,
3007 struct perf_counter_context *child_ctx)
3008{
3009 struct perf_counter *parent_counter;
d859e29f 3010 struct perf_counter *sub, *tmp;
9b51f66d
IM
3011
3012 /*
235c7fc7
IM
3013 * If we do not self-reap then we have to wait for the
3014 * child task to unschedule (it will happen for sure),
3015 * so that its counter is at its final count. (This
3016 * condition triggers rarely - child tasks usually get
3017 * off their CPU before the parent has a chance to
3018 * get this far into the reaping action)
9b51f66d 3019 */
235c7fc7
IM
3020 if (child != current) {
3021 wait_task_inactive(child, 0);
3022 list_del_init(&child_counter->list_entry);
53cfbf59 3023 update_counter_times(child_counter);
235c7fc7 3024 } else {
0cc0c027 3025 struct perf_cpu_context *cpuctx;
235c7fc7
IM
3026 unsigned long flags;
3027 u64 perf_flags;
3028
3029 /*
3030 * Disable and unlink this counter.
3031 *
3032 * Be careful about zapping the list - IRQ/NMI context
3033 * could still be processing it:
3034 */
849691a6 3035 local_irq_save(flags);
235c7fc7 3036 perf_flags = hw_perf_save_disable();
0cc0c027
IM
3037
3038 cpuctx = &__get_cpu_var(perf_cpu_context);
3039
d859e29f 3040 group_sched_out(child_counter, cpuctx, child_ctx);
53cfbf59 3041 update_counter_times(child_counter);
0cc0c027 3042
235c7fc7 3043 list_del_init(&child_counter->list_entry);
0cc0c027 3044
235c7fc7 3045 child_ctx->nr_counters--;
9b51f66d 3046
235c7fc7 3047 hw_perf_restore(perf_flags);
849691a6 3048 local_irq_restore(flags);
235c7fc7 3049 }
9b51f66d
IM
3050
3051 parent_counter = child_counter->parent;
3052 /*
3053 * It can happen that parent exits first, and has counters
3054 * that are still around due to the child reference. These
3055 * counters need to be zapped - but otherwise linger.
3056 */
d859e29f
PM
3057 if (parent_counter) {
3058 sync_child_counter(child_counter, parent_counter);
3059 list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
3060 list_entry) {
4bcf349a 3061 if (sub->parent) {
d859e29f 3062 sync_child_counter(sub, sub->parent);
f1600952 3063 free_counter(sub);
4bcf349a 3064 }
d859e29f 3065 }
f1600952 3066 free_counter(child_counter);
4bcf349a 3067 }
9b51f66d
IM
3068}
3069
3070/*
d859e29f 3071 * When a child task exits, feed back counter values to parent counters.
9b51f66d 3072 *
d859e29f 3073 * Note: we may be running in child context, but the PID is not hashed
9b51f66d
IM
3074 * anymore so new counters will not be added.
3075 */
3076void perf_counter_exit_task(struct task_struct *child)
3077{
3078 struct perf_counter *child_counter, *tmp;
3079 struct perf_counter_context *child_ctx;
3080
3081 child_ctx = &child->perf_counter_ctx;
3082
3083 if (likely(!child_ctx->nr_counters))
3084 return;
3085
3086 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
3087 list_entry)
3088 __perf_counter_exit_task(child, child_counter, child_ctx);
3089}
3090
3091/*
3092 * Initialize the perf_counter context in task_struct
3093 */
3094void perf_counter_init_task(struct task_struct *child)
3095{
3096 struct perf_counter_context *child_ctx, *parent_ctx;
d859e29f 3097 struct perf_counter *counter;
9b51f66d 3098 struct task_struct *parent = current;
9b51f66d
IM
3099
3100 child_ctx = &child->perf_counter_ctx;
3101 parent_ctx = &parent->perf_counter_ctx;
3102
3103 __perf_counter_init_context(child_ctx, child);
3104
3105 /*
3106 * This is executed from the parent task context, so inherit
3107 * counters that have been marked for cloning:
3108 */
3109
3110 if (likely(!parent_ctx->nr_counters))
3111 return;
3112
3113 /*
3114 * Lock the parent list. No need to lock the child - not PID
3115 * hashed yet and not running, so nobody can access it.
3116 */
d859e29f 3117 mutex_lock(&parent_ctx->mutex);
9b51f66d
IM
3118
3119 /*
3120 * We don't have to disable NMIs - we are only looking at
3121 * the list, not manipulating it:
3122 */
3123 list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
d859e29f 3124 if (!counter->hw_event.inherit)
9b51f66d
IM
3125 continue;
3126
d859e29f 3127 if (inherit_group(counter, parent,
9b51f66d
IM
3128 parent_ctx, child, child_ctx))
3129 break;
3130 }
3131
d859e29f 3132 mutex_unlock(&parent_ctx->mutex);
9b51f66d
IM
3133}
3134
04289bb9 3135static void __cpuinit perf_counter_init_cpu(int cpu)
0793a61d 3136{
04289bb9 3137 struct perf_cpu_context *cpuctx;
0793a61d 3138
04289bb9
IM
3139 cpuctx = &per_cpu(perf_cpu_context, cpu);
3140 __perf_counter_init_context(&cpuctx->ctx, NULL);
0793a61d
TG
3141
3142 mutex_lock(&perf_resource_mutex);
04289bb9 3143 cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
0793a61d 3144 mutex_unlock(&perf_resource_mutex);
04289bb9 3145
01d0287f 3146 hw_perf_counter_setup(cpu);
0793a61d
TG
3147}
3148
3149#ifdef CONFIG_HOTPLUG_CPU
04289bb9 3150static void __perf_counter_exit_cpu(void *info)
0793a61d
TG
3151{
3152 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
3153 struct perf_counter_context *ctx = &cpuctx->ctx;
3154 struct perf_counter *counter, *tmp;
3155
04289bb9
IM
3156 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
3157 __perf_counter_remove_from_context(counter);
0793a61d 3158}
04289bb9 3159static void perf_counter_exit_cpu(int cpu)
0793a61d 3160{
d859e29f
PM
3161 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
3162 struct perf_counter_context *ctx = &cpuctx->ctx;
3163
3164 mutex_lock(&ctx->mutex);
04289bb9 3165 smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
d859e29f 3166 mutex_unlock(&ctx->mutex);
0793a61d
TG
3167}
3168#else
04289bb9 3169static inline void perf_counter_exit_cpu(int cpu) { }
0793a61d
TG
3170#endif
3171
3172static int __cpuinit
3173perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
3174{
3175 unsigned int cpu = (long)hcpu;
3176
3177 switch (action) {
3178
3179 case CPU_UP_PREPARE:
3180 case CPU_UP_PREPARE_FROZEN:
04289bb9 3181 perf_counter_init_cpu(cpu);
0793a61d
TG
3182 break;
3183
3184 case CPU_DOWN_PREPARE:
3185 case CPU_DOWN_PREPARE_FROZEN:
04289bb9 3186 perf_counter_exit_cpu(cpu);
0793a61d
TG
3187 break;
3188
3189 default:
3190 break;
3191 }
3192
3193 return NOTIFY_OK;
3194}
3195
3196static struct notifier_block __cpuinitdata perf_cpu_nb = {
3197 .notifier_call = perf_cpu_notify,
3198};
3199
3200static int __init perf_counter_init(void)
3201{
3202 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
3203 (void *)(long)smp_processor_id());
3204 register_cpu_notifier(&perf_cpu_nb);
3205
3206 return 0;
3207}
3208early_initcall(perf_counter_init);
3209
3210static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
3211{
3212 return sprintf(buf, "%d\n", perf_reserved_percpu);
3213}
3214
3215static ssize_t
3216perf_set_reserve_percpu(struct sysdev_class *class,
3217 const char *buf,
3218 size_t count)
3219{
3220 struct perf_cpu_context *cpuctx;
3221 unsigned long val;
3222 int err, cpu, mpt;
3223
3224 err = strict_strtoul(buf, 10, &val);
3225 if (err)
3226 return err;
3227 if (val > perf_max_counters)
3228 return -EINVAL;
3229
3230 mutex_lock(&perf_resource_mutex);
3231 perf_reserved_percpu = val;
3232 for_each_online_cpu(cpu) {
3233 cpuctx = &per_cpu(perf_cpu_context, cpu);
3234 spin_lock_irq(&cpuctx->ctx.lock);
3235 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
3236 perf_max_counters - perf_reserved_percpu);
3237 cpuctx->max_pertask = mpt;
3238 spin_unlock_irq(&cpuctx->ctx.lock);
3239 }
3240 mutex_unlock(&perf_resource_mutex);
3241
3242 return count;
3243}
3244
3245static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
3246{
3247 return sprintf(buf, "%d\n", perf_overcommit);
3248}
3249
3250static ssize_t
3251perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
3252{
3253 unsigned long val;
3254 int err;
3255
3256 err = strict_strtoul(buf, 10, &val);
3257 if (err)
3258 return err;
3259 if (val > 1)
3260 return -EINVAL;
3261
3262 mutex_lock(&perf_resource_mutex);
3263 perf_overcommit = val;
3264 mutex_unlock(&perf_resource_mutex);
3265
3266 return count;
3267}
3268
3269static SYSDEV_CLASS_ATTR(
3270 reserve_percpu,
3271 0644,
3272 perf_show_reserve_percpu,
3273 perf_set_reserve_percpu
3274 );
3275
3276static SYSDEV_CLASS_ATTR(
3277 overcommit,
3278 0644,
3279 perf_show_overcommit,
3280 perf_set_overcommit
3281 );
3282
3283static struct attribute *perfclass_attrs[] = {
3284 &attr_reserve_percpu.attr,
3285 &attr_overcommit.attr,
3286 NULL
3287};
3288
3289static struct attribute_group perfclass_attr_group = {
3290 .attrs = perfclass_attrs,
3291 .name = "perf_counters",
3292};
3293
3294static int __init perf_counter_sysfs_init(void)
3295{
3296 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
3297 &perfclass_attr_group);
3298}
3299device_initcall(perf_counter_sysfs_init);
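/*
 * The attribute group above lands under the cpu sysdev class, so the knobs
 * should be reachable as (assuming the usual sysfs layout):
 *
 *	/sys/devices/system/cpu/perf_counters/reserve_percpu
 *	/sys/devices/system/cpu/perf_counters/overcommit
 *
 * e.g.:
 *
 *	echo 2 > /sys/devices/system/cpu/perf_counters/reserve_percpu
 *
 * which lowers max_pertask on every online CPU so that two hardware counters
 * per CPU stay reserved for per-cpu counters; overcommit accepts only 0 or 1.
 */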