perfcounters: hw ops rename
[linux-2.6-block.git] / kernel / perf_counter.c
1/*
2 * Performance counter core code
3 *
4 * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
6 *
7 * For licensing details see kernel-base/COPYING
8 */
9
10#include <linux/fs.h>
11#include <linux/cpu.h>
12#include <linux/smp.h>
04289bb9 13#include <linux/file.h>
14#include <linux/poll.h>
15#include <linux/sysfs.h>
16#include <linux/ptrace.h>
17#include <linux/percpu.h>
18#include <linux/uaccess.h>
19#include <linux/syscalls.h>
20#include <linux/anon_inodes.h>
21#include <linux/perf_counter.h>
22
23/*
24 * Each CPU has a list of per CPU counters:
25 */
26DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
27
088e2852 28int perf_max_counters __read_mostly = 1;
29static int perf_reserved_percpu __read_mostly;
30static int perf_overcommit __read_mostly = 1;
31
32/*
33 * Mutex for (sysadmin-configurable) counter reservations:
34 */
35static DEFINE_MUTEX(perf_resource_mutex);
36
37/*
38 * Architecture provided APIs - weak aliases:
39 */
5c92d124 40extern __weak const struct hw_perf_counter_ops *
621a01ea 41hw_perf_counter_init(struct perf_counter *counter)
0793a61d 42{
621a01ea 43 return ERR_PTR(-EINVAL);
44}
45
01b2838c 46u64 __weak hw_perf_save_disable(void) { return 0; }
ee06094f 47void __weak hw_perf_restore(u64 ctrl) { }
5c92d124 48void __weak hw_perf_counter_setup(void) { }
0793a61d 49
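/*
 * A hypothetical architecture override of the weak hooks above would look
 * roughly like this (sketch only; the arch_* names are placeholders, not
 * taken from any real arch code):
 *
 *	static const struct hw_perf_counter_ops arch_perf_counter_ops = {
 *		.enable		= arch_perf_counter_enable,
 *		.disable	= arch_perf_counter_disable,
 *		.read		= arch_perf_counter_read,
 *	};
 *
 *	const struct hw_perf_counter_ops *
 *	hw_perf_counter_init(struct perf_counter *counter)
 *	{
 *		if (arch_validate_hw_event(counter) < 0)
 *			return NULL;
 *		return &arch_perf_counter_ops;
 *	}
 */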
50static void
51list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
52{
53 struct perf_counter *group_leader = counter->group_leader;
54
55 /*
56 * Depending on whether it is a standalone or sibling counter,
57 * add it straight to the context's counter list, or to the group
58 * leader's sibling list:
59 */
60 if (counter->group_leader == counter)
61 list_add_tail(&counter->list_entry, &ctx->counter_list);
62 else
63 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
64}
65
66static void
67list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
68{
69 struct perf_counter *sibling, *tmp;
70
71 list_del_init(&counter->list_entry);
72
73 /*
74 * If this was a group counter with sibling counters then
75 * upgrade the siblings to singleton counters by adding them
76 * to the context list directly:
77 */
78 list_for_each_entry_safe(sibling, tmp,
79 &counter->sibling_list, list_entry) {
80
81 list_del_init(&sibling->list_entry);
82 list_add_tail(&sibling->list_entry, &ctx->counter_list);
83 sibling->group_leader = sibling;
84 }
85}
86
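/*
 * The resulting layout: ctx->counter_list holds only group leaders
 * (a standalone counter simply leads a group of one), and every leader
 * chains its group members off its own sibling_list, roughly:
 *
 *	ctx->counter_list:  leader A ---- leader B ---- single counter C
 *	                       |             |
 *	                  sibling_list   sibling_list
 *	                    A1, A2          B1
 */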
87/*
88 * Cross CPU call to remove a performance counter
89 *
90 * We disable the counter on the hardware level first. After that we
91 * remove it from the context list.
92 */
04289bb9 93static void __perf_counter_remove_from_context(void *info)
94{
95 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
96 struct perf_counter *counter = info;
97 struct perf_counter_context *ctx = counter->ctx;
9b51f66d 98 unsigned long flags;
5c92d124 99 u64 perf_flags;
100
101 /*
102 * If this is a task context, we need to check whether it is
103 * the current task context of this cpu. If not it has been
104 * scheduled out before the smp call arrived.
105 */
106 if (ctx->task && cpuctx->task_ctx != ctx)
107 return;
108
9b51f66d 109 spin_lock_irqsave(&ctx->lock, flags);
0793a61d 110
6a930700 111 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
7671581f 112 counter->hw_ops->disable(counter);
6a930700 113 counter->state = PERF_COUNTER_STATE_INACTIVE;
114 ctx->nr_active--;
115 cpuctx->active_oncpu--;
116 counter->task = NULL;
117 }
118 ctx->nr_counters--;
119
120 /*
121 * Protect the list operation against NMI by disabling the
122 * counters on a global level. NOP for non NMI based counters.
123 */
01b2838c 124 perf_flags = hw_perf_save_disable();
04289bb9 125 list_del_counter(counter, ctx);
01b2838c 126 hw_perf_restore(perf_flags);
127
128 if (!ctx->task) {
129 /*
130 * Allow more per task counters with respect to the
131 * reservation:
132 */
133 cpuctx->max_pertask =
134 min(perf_max_counters - ctx->nr_counters,
135 perf_max_counters - perf_reserved_percpu);
136 }
137
9b51f66d 138 spin_unlock_irqrestore(&ctx->lock, flags);
139}
140
141
142/*
143 * Remove the counter from a task's (or a CPU's) list of counters.
144 *
145 * Must be called with counter->mutex held.
146 *
147 * CPU counters are removed with a smp call. For task counters we only
148 * call when the task is on a CPU.
149 */
04289bb9 150static void perf_counter_remove_from_context(struct perf_counter *counter)
151{
152 struct perf_counter_context *ctx = counter->ctx;
153 struct task_struct *task = ctx->task;
154
155 if (!task) {
156 /*
157 * Per cpu counters are removed via an smp call and
158 * the removal is always successful.
159 */
160 smp_call_function_single(counter->cpu,
04289bb9 161 __perf_counter_remove_from_context,
162 counter, 1);
163 return;
164 }
165
166retry:
04289bb9 167 task_oncpu_function_call(task, __perf_counter_remove_from_context,
168 counter);
169
170 spin_lock_irq(&ctx->lock);
171 /*
172 * If the context is active we need to retry the smp call.
173 */
04289bb9 174 if (ctx->nr_active && !list_empty(&counter->list_entry)) {
175 spin_unlock_irq(&ctx->lock);
176 goto retry;
177 }
178
179 /*
180 * The lock prevents this context from being scheduled in, so we
04289bb9 181 * can remove the counter safely, if the call above did not
182 * succeed.
183 */
04289bb9 184 if (!list_empty(&counter->list_entry)) {
0793a61d 185 ctx->nr_counters--;
04289bb9 186 list_del_counter(counter, ctx);
187 counter->task = NULL;
188 }
189 spin_unlock_irq(&ctx->lock);
190}
191
192/*
193 * Cross CPU call to install and enable a performance counter
194 */
195static void __perf_install_in_context(void *info)
196{
197 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
198 struct perf_counter *counter = info;
199 struct perf_counter_context *ctx = counter->ctx;
200 int cpu = smp_processor_id();
9b51f66d 201 unsigned long flags;
5c92d124 202 u64 perf_flags;
203
204 /*
205 * If this is a task context, we need to check whether it is
206 * the current task context of this cpu. If not it has been
207 * scheduled out before the smp call arrived.
208 */
209 if (ctx->task && cpuctx->task_ctx != ctx)
210 return;
211
9b51f66d 212 spin_lock_irqsave(&ctx->lock, flags);
213
214 /*
215 * Protect the list operation against NMI by disabling the
216 * counters on a global level. NOP for non NMI based counters.
217 */
01b2838c 218 perf_flags = hw_perf_save_disable();
04289bb9 219 list_add_counter(counter, ctx);
01b2838c 220 hw_perf_restore(perf_flags);
221
222 ctx->nr_counters++;
223
224 if (cpuctx->active_oncpu < perf_max_counters) {
6a930700 225 counter->state = PERF_COUNTER_STATE_ACTIVE;
226 counter->oncpu = cpu;
227 ctx->nr_active++;
228 cpuctx->active_oncpu++;
7671581f 229 counter->hw_ops->enable(counter);
230 }
231
232 if (!ctx->task && cpuctx->max_pertask)
233 cpuctx->max_pertask--;
234
9b51f66d 235 spin_unlock_irqrestore(&ctx->lock, flags);
236}
237
238/*
239 * Attach a performance counter to a context
240 *
241 * First we add the counter to the list with the hardware enable bit
242 * in counter->hw_config cleared.
243 *
244 * If the counter is attached to a task which is on a CPU we use a smp
245 * call to enable it in the task context. The task might have been
246 * scheduled away, but we check this in the smp call again.
247 */
248static void
249perf_install_in_context(struct perf_counter_context *ctx,
250 struct perf_counter *counter,
251 int cpu)
252{
253 struct task_struct *task = ctx->task;
254
255 counter->ctx = ctx;
256 if (!task) {
257 /*
258 * Per cpu counters are installed via an smp call and
259 * the install is always successful.
260 */
261 smp_call_function_single(cpu, __perf_install_in_context,
262 counter, 1);
263 return;
264 }
265
266 counter->task = task;
267retry:
268 task_oncpu_function_call(task, __perf_install_in_context,
269 counter);
270
271 spin_lock_irq(&ctx->lock);
272 /*
273 * If the context is active we need to retry the smp call.
274 */
04289bb9 275 if (ctx->nr_active && list_empty(&counter->list_entry)) {
276 spin_unlock_irq(&ctx->lock);
277 goto retry;
278 }
279
280 /*
281 * The lock prevents this context from being scheduled in, so we
282 * can add the counter safely, if the call above did not
283 * succeed.
284 */
285 if (list_empty(&counter->list_entry)) {
286 list_add_counter(counter, ctx);
287 ctx->nr_counters++;
288 }
289 spin_unlock_irq(&ctx->lock);
290}
291
292static void
293counter_sched_out(struct perf_counter *counter,
294 struct perf_cpu_context *cpuctx,
295 struct perf_counter_context *ctx)
296{
6a930700 297 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
298 return;
299
7671581f 300 counter->hw_ops->disable(counter);
301 counter->state = PERF_COUNTER_STATE_INACTIVE;
302 counter->oncpu = -1;
303
304 cpuctx->active_oncpu--;
305 ctx->nr_active--;
306}
307
308static void
309group_sched_out(struct perf_counter *group_counter,
310 struct perf_cpu_context *cpuctx,
311 struct perf_counter_context *ctx)
312{
313 struct perf_counter *counter;
314
315 counter_sched_out(group_counter, cpuctx, ctx);
316
317 /*
318 * Schedule out siblings (if any):
319 */
320 list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
321 counter_sched_out(counter, cpuctx, ctx);
322}
323
324/*
325 * Called from scheduler to remove the counters of the current task,
326 * with interrupts disabled.
327 *
328 * We stop each counter and update the counter value in counter->count.
329 *
7671581f 330 * This does not protect us against NMI, but disable()
331 * sets the disabled bit in the control field of counter _before_
332 * accessing the counter control register. If a NMI hits, then it will
333 * not restart the counter.
334 */
335void perf_counter_task_sched_out(struct task_struct *task, int cpu)
336{
337 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
338 struct perf_counter_context *ctx = &task->perf_counter_ctx;
339 struct perf_counter *counter;
340
341 if (likely(!cpuctx->task_ctx))
342 return;
343
344 spin_lock(&ctx->lock);
345 if (ctx->nr_active) {
346 list_for_each_entry(counter, &ctx->counter_list, list_entry)
347 group_sched_out(counter, cpuctx, ctx);
348 }
349 spin_unlock(&ctx->lock);
350 cpuctx->task_ctx = NULL;
351}
352
353static void
354counter_sched_in(struct perf_counter *counter,
355 struct perf_cpu_context *cpuctx,
356 struct perf_counter_context *ctx,
357 int cpu)
358{
6a930700 359 if (counter->state == PERF_COUNTER_STATE_OFF)
360 return;
361
7671581f 362 counter->hw_ops->enable(counter);
6a930700 363 counter->state = PERF_COUNTER_STATE_ACTIVE;
364 counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
365
366 cpuctx->active_oncpu++;
367 ctx->nr_active++;
368}
369
7995888f 370static int
371group_sched_in(struct perf_counter *group_counter,
372 struct perf_cpu_context *cpuctx,
373 struct perf_counter_context *ctx,
374 int cpu)
375{
376 struct perf_counter *counter;
7995888f 377 int was_group = 0;
378
379 counter_sched_in(group_counter, cpuctx, ctx, cpu);
380
381 /*
382 * Schedule in siblings as one group (if any):
383 */
7995888f 384 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
04289bb9 385 counter_sched_in(counter, cpuctx, ctx, cpu);
386 was_group = 1;
387 }
388
389 return was_group;
390}
391
392/*
393 * Called from scheduler to add the counters of the current task
394 * with interrupts disabled.
395 *
396 * We restore the counter value and then enable it.
397 *
7671581f 398 * This does not protect us against NMI, but enable()
399 * sets the enabled bit in the control field of counter _before_
400 * accessing the counter control register. If a NMI hits, then it will
401 * keep the counter running.
402 */
403void perf_counter_task_sched_in(struct task_struct *task, int cpu)
404{
405 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
406 struct perf_counter_context *ctx = &task->perf_counter_ctx;
407 struct perf_counter *counter;
408
409 if (likely(!ctx->nr_counters))
410 return;
411
412 spin_lock(&ctx->lock);
04289bb9 413 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
414 if (ctx->nr_active == cpuctx->max_pertask)
415 break;
416
417 /*
418 * Listen to the 'cpu' scheduling filter constraint
419 * of counters:
420 */
421 if (counter->cpu != -1 && counter->cpu != cpu)
422 continue;
423
424 /*
425 * If we scheduled in a group atomically and
426 * exclusively, break out:
427 */
428 if (group_sched_in(counter, cpuctx, ctx, cpu))
429 break;
430 }
431 spin_unlock(&ctx->lock);
04289bb9 432
433 cpuctx->task_ctx = ctx;
434}
435
436int perf_counter_task_disable(void)
437{
438 struct task_struct *curr = current;
439 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
440 struct perf_counter *counter;
441 u64 perf_flags;
442 int cpu;
443
444 if (likely(!ctx->nr_counters))
445 return 0;
446
447 local_irq_disable();
448 cpu = smp_processor_id();
449
450 perf_counter_task_sched_out(curr, cpu);
451
452 spin_lock(&ctx->lock);
453
454 /*
455 * Disable all the counters:
456 */
457 perf_flags = hw_perf_save_disable();
458
9b51f66d 459 list_for_each_entry(counter, &ctx->counter_list, list_entry)
6a930700 460 counter->state = PERF_COUNTER_STATE_OFF;
9b51f66d 461
462 hw_perf_restore(perf_flags);
463
464 spin_unlock(&ctx->lock);
465
466 local_irq_enable();
467
468 return 0;
469}
470
471int perf_counter_task_enable(void)
472{
473 struct task_struct *curr = current;
474 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
475 struct perf_counter *counter;
476 u64 perf_flags;
477 int cpu;
478
479 if (likely(!ctx->nr_counters))
480 return 0;
481
482 local_irq_disable();
483 cpu = smp_processor_id();
484
485 spin_lock(&ctx->lock);
486
487 /*
488 * Disable the counters on the hardware level while we walk the list:
489 */
490 perf_flags = hw_perf_save_disable();
491
492 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
6a930700 493 if (counter->state != PERF_COUNTER_STATE_OFF)
1d1c7ddb 494 continue;
6a930700 495 counter->state = PERF_COUNTER_STATE_INACTIVE;
496 }
497 hw_perf_restore(perf_flags);
498
499 spin_unlock(&ctx->lock);
500
501 perf_counter_task_sched_in(curr, cpu);
502
503 local_irq_enable();
504
505 return 0;
506}
507
508void perf_counter_task_tick(struct task_struct *curr, int cpu)
509{
510 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
511 struct perf_counter *counter;
5c92d124 512 u64 perf_flags;
513
514 if (likely(!ctx->nr_counters))
515 return;
516
517 perf_counter_task_sched_out(curr, cpu);
518
519 spin_lock(&ctx->lock);
520
521 /*
04289bb9 522 * Rotate the first entry last (works just fine for group counters too):
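 * For example, the list A, B, C becomes B, C, A - so when more counters
 * exist than can be scheduled at once, each of them gets its turn on
 * successive ticks.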
0793a61d 523 */
01b2838c 524 perf_flags = hw_perf_save_disable();
525 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
526 list_del(&counter->list_entry);
527 list_add_tail(&counter->list_entry, &ctx->counter_list);
528 break;
529 }
01b2838c 530 hw_perf_restore(perf_flags);
531
532 spin_unlock(&ctx->lock);
533
534 perf_counter_task_sched_in(curr, cpu);
535}
536
537/*
538 * Cross CPU call to read the hardware counter
539 */
7671581f 540static void __read(void *info)
0793a61d 541{
542 struct perf_counter *counter = info;
543
7671581f 544 counter->hw_ops->read(counter);
545}
546
04289bb9 547static u64 perf_counter_read(struct perf_counter *counter)
548{
549 /*
550 * If counter is enabled and currently active on a CPU, update the
551 * value in the counter structure:
552 */
6a930700 553 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
0793a61d 554 smp_call_function_single(counter->oncpu,
7671581f 555 __read, counter, 1);
556 }
557
ee06094f 558 return atomic64_read(&counter->count);
559}
560
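/*
 * A note on the two data buffers (as used by the code below): each counter
 * carries two perf_data areas, irqdata and usrdata. IRQ/NMI context fills
 * irqdata while read() drains usrdata; perf_switch_irq_data() swaps the two
 * pointers on the CPU that currently owns the counter, so the reader can
 * pick up what the IRQ side has accumulated without racing against it.
 */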
561/*
562 * Cross CPU call to switch performance data pointers
563 */
564static void __perf_switch_irq_data(void *info)
565{
566 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
567 struct perf_counter *counter = info;
568 struct perf_counter_context *ctx = counter->ctx;
569 struct perf_data *oldirqdata = counter->irqdata;
570
571 /*
572 * If this is a task context, we need to check whether it is
573 * the current task context of this cpu. If not it has been
574 * scheduled out before the smp call arrived.
575 */
576 if (ctx->task) {
577 if (cpuctx->task_ctx != ctx)
578 return;
579 spin_lock(&ctx->lock);
580 }
581
582 /* Change the pointer NMI safe */
583 atomic_long_set((atomic_long_t *)&counter->irqdata,
584 (unsigned long) counter->usrdata);
585 counter->usrdata = oldirqdata;
586
587 if (ctx->task)
588 spin_unlock(&ctx->lock);
589}
590
591static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
592{
593 struct perf_counter_context *ctx = counter->ctx;
594 struct perf_data *oldirqdata = counter->irqdata;
595 struct task_struct *task = ctx->task;
596
597 if (!task) {
598 smp_call_function_single(counter->cpu,
599 __perf_switch_irq_data,
600 counter, 1);
601 return counter->usrdata;
602 }
603
604retry:
605 spin_lock_irq(&ctx->lock);
6a930700 606 if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
607 counter->irqdata = counter->usrdata;
608 counter->usrdata = oldirqdata;
609 spin_unlock_irq(&ctx->lock);
610 return oldirqdata;
611 }
612 spin_unlock_irq(&ctx->lock);
613 task_oncpu_function_call(task, __perf_switch_irq_data, counter);
614 /* Might have failed, because task was scheduled out */
615 if (counter->irqdata == oldirqdata)
616 goto retry;
617
618 return counter->usrdata;
619}
620
621static void put_context(struct perf_counter_context *ctx)
622{
623 if (ctx->task)
624 put_task_struct(ctx->task);
625}
626
627static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
628{
629 struct perf_cpu_context *cpuctx;
630 struct perf_counter_context *ctx;
631 struct task_struct *task;
632
633 /*
634 * If cpu is not a wildcard then this is a percpu counter:
635 */
636 if (cpu != -1) {
637 /* Must be root to operate on a CPU counter: */
638 if (!capable(CAP_SYS_ADMIN))
639 return ERR_PTR(-EACCES);
640
641 if (cpu < 0 || cpu > num_possible_cpus())
642 return ERR_PTR(-EINVAL);
643
644 /*
645 * We could be clever and allow to attach a counter to an
646 * offline CPU and activate it when the CPU comes up, but
647 * that's for later.
648 */
649 if (!cpu_isset(cpu, cpu_online_map))
650 return ERR_PTR(-ENODEV);
651
652 cpuctx = &per_cpu(perf_cpu_context, cpu);
653 ctx = &cpuctx->ctx;
654
655 return ctx;
656 }
657
658 rcu_read_lock();
659 if (!pid)
660 task = current;
661 else
662 task = find_task_by_vpid(pid);
663 if (task)
664 get_task_struct(task);
665 rcu_read_unlock();
666
667 if (!task)
668 return ERR_PTR(-ESRCH);
669
670 ctx = &task->perf_counter_ctx;
671 ctx->task = task;
672
673 /* Reuse ptrace permission checks for now. */
674 if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
675 put_context(ctx);
676 return ERR_PTR(-EACCES);
677 }
678
679 return ctx;
680}
681
682/*
683 * Called when the last reference to the file is gone.
684 */
685static int perf_release(struct inode *inode, struct file *file)
686{
687 struct perf_counter *counter = file->private_data;
688 struct perf_counter_context *ctx = counter->ctx;
689
690 file->private_data = NULL;
691
692 mutex_lock(&counter->mutex);
693
04289bb9 694 perf_counter_remove_from_context(counter);
695 put_context(ctx);
696
697 mutex_unlock(&counter->mutex);
698
699 kfree(counter);
700
701 return 0;
702}
703
704/*
705 * Read the performance counter - simple non-blocking version for now
706 */
707static ssize_t
708perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
709{
710 u64 cntval;
711
712 if (count != sizeof(cntval))
713 return -EINVAL;
714
715 mutex_lock(&counter->mutex);
04289bb9 716 cntval = perf_counter_read(counter);
717 mutex_unlock(&counter->mutex);
718
719 return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
720}
721
722static ssize_t
723perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
724{
725 if (!usrdata->len)
726 return 0;
727
728 count = min(count, (size_t)usrdata->len);
729 if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
730 return -EFAULT;
731
732 /* Adjust the counters */
733 usrdata->len -= count;
734 if (!usrdata->len)
735 usrdata->rd_idx = 0;
736 else
737 usrdata->rd_idx += count;
738
739 return count;
740}
741
742static ssize_t
743perf_read_irq_data(struct perf_counter *counter,
744 char __user *buf,
745 size_t count,
746 int nonblocking)
747{
748 struct perf_data *irqdata, *usrdata;
749 DECLARE_WAITQUEUE(wait, current);
750 ssize_t res;
751
752 irqdata = counter->irqdata;
753 usrdata = counter->usrdata;
754
755 if (usrdata->len + irqdata->len >= count)
756 goto read_pending;
757
758 if (nonblocking)
759 return -EAGAIN;
760
761 spin_lock_irq(&counter->waitq.lock);
762 __add_wait_queue(&counter->waitq, &wait);
763 for (;;) {
764 set_current_state(TASK_INTERRUPTIBLE);
765 if (usrdata->len + irqdata->len >= count)
766 break;
767
768 if (signal_pending(current))
769 break;
770
771 spin_unlock_irq(&counter->waitq.lock);
772 schedule();
773 spin_lock_irq(&counter->waitq.lock);
774 }
775 __remove_wait_queue(&counter->waitq, &wait);
776 __set_current_state(TASK_RUNNING);
777 spin_unlock_irq(&counter->waitq.lock);
778
779 if (usrdata->len + irqdata->len < count)
780 return -ERESTARTSYS;
781read_pending:
782 mutex_lock(&counter->mutex);
783
784 /* Drain pending data first: */
785 res = perf_copy_usrdata(usrdata, buf, count);
786 if (res < 0 || res == count)
787 goto out;
788
789 /* Switch irq buffer: */
790 usrdata = perf_switch_irq_data(counter);
791 if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) {
792 if (!res)
793 res = -EFAULT;
794 } else {
795 res = count;
796 }
797out:
798 mutex_unlock(&counter->mutex);
799
800 return res;
801}
802
803static ssize_t
804perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
805{
806 struct perf_counter *counter = file->private_data;
807
9f66a381 808 switch (counter->hw_event.record_type) {
809 case PERF_RECORD_SIMPLE:
810 return perf_read_hw(counter, buf, count);
811
812 case PERF_RECORD_IRQ:
813 case PERF_RECORD_GROUP:
814 return perf_read_irq_data(counter, buf, count,
815 file->f_flags & O_NONBLOCK);
816 }
817 return -EINVAL;
818}
819
820static unsigned int perf_poll(struct file *file, poll_table *wait)
821{
822 struct perf_counter *counter = file->private_data;
823 unsigned int events = 0;
824 unsigned long flags;
825
826 poll_wait(file, &counter->waitq, wait);
827
828 spin_lock_irqsave(&counter->waitq.lock, flags);
829 if (counter->usrdata->len || counter->irqdata->len)
830 events |= POLLIN;
831 spin_unlock_irqrestore(&counter->waitq.lock, flags);
832
833 return events;
834}
835
836static const struct file_operations perf_fops = {
837 .release = perf_release,
838 .read = perf_read,
839 .poll = perf_poll,
840};
841
842static void cpu_clock_perf_counter_enable(struct perf_counter *counter)
843{
844}
845
846static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
847{
848}
849
850static void cpu_clock_perf_counter_read(struct perf_counter *counter)
851{
852 int cpu = raw_smp_processor_id();
853
ee06094f 854 atomic64_set(&counter->count, cpu_clock(cpu));
855}
856
857static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
858 .enable = cpu_clock_perf_counter_enable,
859 .disable = cpu_clock_perf_counter_disable,
860 .read = cpu_clock_perf_counter_read,
861};
862
8cb391e8 863static void task_clock_perf_counter_update(struct perf_counter *counter)
bae43c99 864{
865 u64 prev, now;
866 s64 delta;
867
868 prev = atomic64_read(&counter->hw.prev_count);
869 now = current->se.sum_exec_runtime;
870
871 atomic64_set(&counter->hw.prev_count, now);
872
873 delta = now - prev;
874
875 atomic64_add(delta, &counter->count);
876}
877
8cb391e8 878static void task_clock_perf_counter_read(struct perf_counter *counter)
bae43c99 879{
8cb391e8 880 task_clock_perf_counter_update(counter);
881}
882
883static void task_clock_perf_counter_enable(struct perf_counter *counter)
884{
885 atomic64_set(&counter->hw.prev_count, current->se.sum_exec_runtime);
886}
887
888static void task_clock_perf_counter_disable(struct perf_counter *counter)
bae43c99 889{
8cb391e8 890 task_clock_perf_counter_update(counter);
891}
892
893static const struct hw_perf_counter_ops perf_ops_task_clock = {
894 .enable = task_clock_perf_counter_enable,
895 .disable = task_clock_perf_counter_disable,
896 .read = task_clock_perf_counter_read,
897};
898
899static u64 get_page_faults(void)
900{
901 struct task_struct *curr = current;
902
903 return curr->maj_flt + curr->min_flt;
904}
905
906static void page_faults_perf_counter_update(struct perf_counter *counter)
907{
908 u64 prev, now;
909 s64 delta;
910
911 prev = atomic64_read(&counter->hw.prev_count);
912 now = get_page_faults();
913
914 atomic64_set(&counter->hw.prev_count, now);
915
916 delta = now - prev;
917
918 atomic64_add(delta, &counter->count);
919}
920
921static void page_faults_perf_counter_read(struct perf_counter *counter)
922{
923 page_faults_perf_counter_update(counter);
924}
925
926static void page_faults_perf_counter_enable(struct perf_counter *counter)
927{
928 /*
929 * page-faults is a per-task value already,
930 * so we don't have to clear it on switch-in.
931 */
932}
933
934static void page_faults_perf_counter_disable(struct perf_counter *counter)
935{
936 page_faults_perf_counter_update(counter);
937}
938
939static const struct hw_perf_counter_ops perf_ops_page_faults = {
940 .enable = page_faults_perf_counter_enable,
941 .disable = page_faults_perf_counter_disable,
942 .read = page_faults_perf_counter_read,
943};
944
945static u64 get_context_switches(void)
946{
947 struct task_struct *curr = current;
948
949 return curr->nvcsw + curr->nivcsw;
950}
951
952static void context_switches_perf_counter_update(struct perf_counter *counter)
953{
954 u64 prev, now;
955 s64 delta;
956
957 prev = atomic64_read(&counter->hw.prev_count);
958 now = get_context_switches();
959
960 atomic64_set(&counter->hw.prev_count, now);
961
962 delta = now - prev;
963
964 atomic64_add(delta, &counter->count);
965}
966
967static void context_switches_perf_counter_read(struct perf_counter *counter)
968{
969 context_switches_perf_counter_update(counter);
970}
971
972static void context_switches_perf_counter_enable(struct perf_counter *counter)
973{
974 /*
975 * ->nvcsw + curr->nivcsw is a per-task value already,
976 * so we dont have to clear it on switch-in.
977 */
978}
979
980static void context_switches_perf_counter_disable(struct perf_counter *counter)
981{
982 context_switches_perf_counter_update(counter);
983}
984
985static const struct hw_perf_counter_ops perf_ops_context_switches = {
986 .enable = context_switches_perf_counter_enable,
987 .disable = context_switches_perf_counter_disable,
988 .read = context_switches_perf_counter_read,
989};
990
991static inline u64 get_cpu_migrations(void)
992{
993 return current->se.nr_migrations;
994}
995
996static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
997{
998 u64 prev, now;
999 s64 delta;
1000
1001 prev = atomic64_read(&counter->hw.prev_count);
1002 now = get_cpu_migrations();
1003
1004 atomic64_set(&counter->hw.prev_count, now);
1005
1006 delta = now - prev;
1007
1008 atomic64_add(delta, &counter->count);
1009}
1010
1011static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
1012{
1013 cpu_migrations_perf_counter_update(counter);
1014}
1015
1016static void cpu_migrations_perf_counter_enable(struct perf_counter *counter)
1017{
1018 /*
1019 * se.nr_migrations is a per-task value already,
1020 * so we don't have to clear it on switch-in.
1021 */
1022}
1023
1024static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
1025{
1026 cpu_migrations_perf_counter_update(counter);
1027}
1028
1029static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
1030 .enable = cpu_migrations_perf_counter_enable,
1031 .disable = cpu_migrations_perf_counter_disable,
1032 .read = cpu_migrations_perf_counter_read,
1033};
1034
1035static const struct hw_perf_counter_ops *
1036sw_perf_counter_init(struct perf_counter *counter)
1037{
1038 const struct hw_perf_counter_ops *hw_ops = NULL;
1039
1040 switch (counter->hw_event.type) {
1041 case PERF_COUNT_CPU_CLOCK:
1042 hw_ops = &perf_ops_cpu_clock;
1043 break;
1044 case PERF_COUNT_TASK_CLOCK:
1045 hw_ops = &perf_ops_task_clock;
1046 break;
1047 case PERF_COUNT_PAGE_FAULTS:
1048 hw_ops = &perf_ops_page_faults;
1049 break;
1050 case PERF_COUNT_CONTEXT_SWITCHES:
1051 hw_ops = &perf_ops_context_switches;
1052 break;
1053 case PERF_COUNT_CPU_MIGRATIONS:
1054 hw_ops = &perf_ops_cpu_migrations;
1055 break;
1056 default:
1057 break;
1058 }
1059 return hw_ops;
1060}
1061
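/*
 * A new software counter type would follow the same pattern as the ones
 * above (sketch only - the my_event names are hypothetical): provide an
 * enable/disable/read triplet and wire it into the switch above:
 *
 *	static const struct hw_perf_counter_ops perf_ops_my_event = {
 *		.enable		= my_event_perf_counter_enable,
 *		.disable	= my_event_perf_counter_disable,
 *		.read		= my_event_perf_counter_read,
 *	};
 *
 *	case PERF_COUNT_MY_EVENT:
 *		hw_ops = &perf_ops_my_event;
 *		break;
 */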
1062/*
1063 * Allocate and initialize a counter structure
1064 */
1065static struct perf_counter *
1066perf_counter_alloc(struct perf_counter_hw_event *hw_event,
1067 int cpu,
1068 struct perf_counter *group_leader,
1069 gfp_t gfpflags)
0793a61d 1070{
5c92d124 1071 const struct hw_perf_counter_ops *hw_ops;
621a01ea 1072 struct perf_counter *counter;
0793a61d 1073
9b51f66d 1074 counter = kzalloc(sizeof(*counter), gfpflags);
1075 if (!counter)
1076 return NULL;
1077
1078 /*
1079 * Single counters are their own group leaders, with an
1080 * empty sibling list:
1081 */
1082 if (!group_leader)
1083 group_leader = counter;
1084
0793a61d 1085 mutex_init(&counter->mutex);
1086 INIT_LIST_HEAD(&counter->list_entry);
1087 INIT_LIST_HEAD(&counter->sibling_list);
1088 init_waitqueue_head(&counter->waitq);
1089
1090 counter->irqdata = &counter->data[0];
1091 counter->usrdata = &counter->data[1];
1092 counter->cpu = cpu;
1093 counter->hw_event = *hw_event;
1094 counter->wakeup_pending = 0;
04289bb9 1095 counter->group_leader = group_leader;
1096 counter->hw_ops = NULL;
1097
1098 if (hw_event->disabled)
1099 counter->state = PERF_COUNTER_STATE_OFF;
1100
1101 hw_ops = NULL;
1102 if (!hw_event->raw && hw_event->type < 0)
1103 hw_ops = sw_perf_counter_init(counter);
9b51f66d 1104 if (!hw_ops)
5c92d124 1105 hw_ops = hw_perf_counter_init(counter);
5c92d124 1106
1107 if (!hw_ops) {
1108 kfree(counter);
1109 return NULL;
1110 }
1111 counter->hw_ops = hw_ops;
1112
1113 return counter;
1114}
1115
1116/**
1117 * sys_perf_counter_open - open a performance counter, associate it with a task/cpu
1118 *
1119 * @hw_event_uptr: event type attributes for monitoring/sampling
0793a61d 1120 * @pid: target pid
1121 * @cpu: target cpu
1122 * @group_fd: group leader counter fd
0793a61d 1123 */
1124asmlinkage int
1125sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
1126 pid_t pid, int cpu, int group_fd)
0793a61d 1127{
04289bb9 1128 struct perf_counter *counter, *group_leader;
9f66a381 1129 struct perf_counter_hw_event hw_event;
04289bb9 1130 struct perf_counter_context *ctx;
9b51f66d 1131 struct file *counter_file = NULL;
1132 struct file *group_file = NULL;
1133 int fput_needed = 0;
9b51f66d 1134 int fput_needed2 = 0;
1135 int ret;
1136
9f66a381 1137 if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
1138 return -EFAULT;
1139
04289bb9 1140 /*
1141 * Get the target context (task or percpu):
1142 */
1143 ctx = find_get_context(pid, cpu);
1144 if (IS_ERR(ctx))
1145 return PTR_ERR(ctx);
1146
1147 /*
1148 * Look up the group leader (we will attach this counter to it):
1149 */
1150 group_leader = NULL;
1151 if (group_fd != -1) {
1152 ret = -EINVAL;
1153 group_file = fget_light(group_fd, &fput_needed);
1154 if (!group_file)
ccff286d 1155 goto err_put_context;
04289bb9 1156 if (group_file->f_op != &perf_fops)
ccff286d 1157 goto err_put_context;
1158
1159 group_leader = group_file->private_data;
1160 /*
1161 * Do not allow a recursive hierarchy (this new sibling
1162 * becoming part of another group-sibling):
1163 */
1164 if (group_leader->group_leader != group_leader)
1165 goto err_put_context;
1166 /*
1167 * Do not allow attaching to a group in a different
1168 * task or CPU context:
04289bb9 1169 */
1170 if (group_leader->ctx != ctx)
1171 goto err_put_context;
1172 }
1173
5c92d124 1174 ret = -EINVAL;
9b51f66d 1175 counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL);
1176 if (!counter)
1177 goto err_put_context;
1178
1179 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
1180 if (ret < 0)
1181 goto err_free_put_context;
1182
1183 counter_file = fget_light(ret, &fput_needed2);
1184 if (!counter_file)
1185 goto err_free_put_context;
1186
1187 counter->filp = counter_file;
1188 perf_install_in_context(ctx, counter, cpu);
1189
1190 fput_light(counter_file, fput_needed2);
0793a61d 1191
1192out_fput:
1193 fput_light(group_file, fput_needed);
1194
1195 return ret;
1196
9b51f66d 1197err_free_put_context:
1198 kfree(counter);
1199
1200err_put_context:
1201 put_context(ctx);
1202
04289bb9 1203 goto out_fput;
1204}
1205
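/*
 * Rough user-space usage sketch for the syscall above (the syscall number
 * macro is a placeholder for whatever the architecture assigns):
 *
 *	struct perf_counter_hw_event hw_event = {
 *		.type		= PERF_COUNT_TASK_CLOCK,
 *		.record_type	= PERF_RECORD_SIMPLE,
 *	};
 *	int fd = syscall(__NR_perf_counter_open, &hw_event,
 *			 0,	// pid 0: count the current task
 *			 -1,	// cpu -1: on any CPU it runs on
 *			 -1);	// group_fd -1: no group leader
 *	unsigned long long count;
 *	read(fd, &count, sizeof(count));
 *
 * With record_type at PERF_RECORD_SIMPLE, read() returns the raw 64-bit
 * counter value via perf_read_hw() above.
 */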
1206/*
1207 * Initialize the perf_counter context in a task_struct:
1208 */
1209static void
1210__perf_counter_init_context(struct perf_counter_context *ctx,
1211 struct task_struct *task)
1212{
1213 memset(ctx, 0, sizeof(*ctx));
1214 spin_lock_init(&ctx->lock);
1215 INIT_LIST_HEAD(&ctx->counter_list);
1216 ctx->task = task;
1217}
1218
1219/*
1220 * inherit a counter from parent task to child task:
1221 */
1222static int
1223inherit_counter(struct perf_counter *parent_counter,
1224 struct task_struct *parent,
1225 struct perf_counter_context *parent_ctx,
1226 struct task_struct *child,
1227 struct perf_counter_context *child_ctx)
1228{
1229 struct perf_counter *child_counter;
1230
1231 child_counter = perf_counter_alloc(&parent_counter->hw_event,
1232 parent_counter->cpu, NULL,
1233 GFP_ATOMIC);
1234 if (!child_counter)
1235 return -ENOMEM;
1236
1237 /*
1238 * Link it up in the child's context:
1239 */
1240 child_counter->ctx = child_ctx;
1241 child_counter->task = child;
1242 list_add_counter(child_counter, child_ctx);
1243 child_ctx->nr_counters++;
1244
1245 child_counter->parent = parent_counter;
1246 parent_counter->nr_inherited++;
1247 /*
1248 * inherit into child's child as well:
1249 */
1250 child_counter->hw_event.inherit = 1;
1251
1252 /*
1253 * Get a reference to the parent filp - we will fput it
1254 * when the child counter exits. This is safe to do because
1255 * we are in the parent and we know that the filp still
1256 * exists and has a nonzero count:
1257 */
1258 atomic_long_inc(&parent_counter->filp->f_count);
1259
1260 return 0;
1261}
1262
1263static void
1264__perf_counter_exit_task(struct task_struct *child,
1265 struct perf_counter *child_counter,
1266 struct perf_counter_context *child_ctx)
1267{
1268 struct perf_counter *parent_counter;
1269 u64 parent_val, child_val;
1270 u64 perf_flags;
1271
1272 /*
1273 * Disable and unlink this counter.
1274 *
1275 * Be careful about zapping the list - IRQ/NMI context
1276 * could still be processing it:
1277 */
1278 local_irq_disable();
1279 perf_flags = hw_perf_save_disable();
1280
1281 if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) {
1282 struct perf_cpu_context *cpuctx;
1283
1284 cpuctx = &__get_cpu_var(perf_cpu_context);
1285
7671581f 1286 child_counter->hw_ops->disable(child_counter);
1287 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
1288 child_counter->oncpu = -1;
1289
1290 cpuctx->active_oncpu--;
1291 child_ctx->nr_active--;
1292 }
1293
1294 list_del_init(&child_counter->list_entry);
1295
1296 hw_perf_restore(perf_flags);
1297 local_irq_enable();
1298
1299 parent_counter = child_counter->parent;
1300 /*
1301 * It can happen that parent exits first, and has counters
1302 * that are still around due to the child reference. These
1303 * counters need to be zapped - but otherwise linger.
1304 */
1305 if (!parent_counter)
1306 return;
1307
1308 parent_val = atomic64_read(&parent_counter->count);
1309 child_val = atomic64_read(&child_counter->count);
1310
1311 /*
1312 * Add back the child's count to the parent's count:
1313 */
1314 atomic64_add(child_val, &parent_counter->count);
1315
1316 fput(parent_counter->filp);
1317
1318 kfree(child_counter);
1319}
1320
1321/*
1322 * When a child task exits, feed back counter values to parent counters.
1323 *
1324 * Note: we are running in child context, but the PID is not hashed
1325 * anymore so new counters will not be added.
1326 */
1327void perf_counter_exit_task(struct task_struct *child)
1328{
1329 struct perf_counter *child_counter, *tmp;
1330 struct perf_counter_context *child_ctx;
1331
1332 child_ctx = &child->perf_counter_ctx;
1333
1334 if (likely(!child_ctx->nr_counters))
1335 return;
1336
1337 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
1338 list_entry)
1339 __perf_counter_exit_task(child, child_counter, child_ctx);
1340}
1341
1342/*
1343 * Initialize the perf_counter context in task_struct
1344 */
1345void perf_counter_init_task(struct task_struct *child)
1346{
1347 struct perf_counter_context *child_ctx, *parent_ctx;
1348 struct perf_counter *counter, *parent_counter;
1349 struct task_struct *parent = current;
1350 unsigned long flags;
1351
1352 child_ctx = &child->perf_counter_ctx;
1353 parent_ctx = &parent->perf_counter_ctx;
1354
1355 __perf_counter_init_context(child_ctx, child);
1356
1357 /*
1358 * This is executed from the parent task context, so inherit
1359 * counters that have been marked for cloning:
1360 */
1361
1362 if (likely(!parent_ctx->nr_counters))
1363 return;
1364
1365 /*
1366 * Lock the parent list. No need to lock the child - not PID
1367 * hashed yet and not running, so nobody can access it.
1368 */
1369 spin_lock_irqsave(&parent_ctx->lock, flags);
1370
1371 /*
1372 * We don't have to disable NMIs - we are only looking at
1373 * the list, not manipulating it:
1374 */
1375 list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
1376 if (!counter->hw_event.inherit || counter->group_leader != counter)
1377 continue;
1378
1379 /*
1380 * Instead of creating recursive hierarchies of counters,
1381 * we link inherited counters back to the original parent,
1382 * which has a filp for sure, which we use as the reference
1383 * count:
1384 */
1385 parent_counter = counter;
1386 if (counter->parent)
1387 parent_counter = counter->parent;
1388
1389 if (inherit_counter(parent_counter, parent,
1390 parent_ctx, child, child_ctx))
1391 break;
1392 }
1393
1394 spin_unlock_irqrestore(&parent_ctx->lock, flags);
1395}
1396
04289bb9 1397static void __cpuinit perf_counter_init_cpu(int cpu)
0793a61d 1398{
04289bb9 1399 struct perf_cpu_context *cpuctx;
0793a61d 1400
1401 cpuctx = &per_cpu(perf_cpu_context, cpu);
1402 __perf_counter_init_context(&cpuctx->ctx, NULL);
1403
1404 mutex_lock(&perf_resource_mutex);
04289bb9 1405 cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
0793a61d 1406 mutex_unlock(&perf_resource_mutex);
04289bb9 1407
1408 hw_perf_counter_setup();
1409}
1410
1411#ifdef CONFIG_HOTPLUG_CPU
04289bb9 1412static void __perf_counter_exit_cpu(void *info)
1413{
1414 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1415 struct perf_counter_context *ctx = &cpuctx->ctx;
1416 struct perf_counter *counter, *tmp;
1417
1418 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
1419 __perf_counter_remove_from_context(counter);
1420
1421}
04289bb9 1422static void perf_counter_exit_cpu(int cpu)
0793a61d 1423{
04289bb9 1424 smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
1425}
1426#else
04289bb9 1427static inline void perf_counter_exit_cpu(int cpu) { }
1428#endif
1429
1430static int __cpuinit
1431perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
1432{
1433 unsigned int cpu = (long)hcpu;
1434
1435 switch (action) {
1436
1437 case CPU_UP_PREPARE:
1438 case CPU_UP_PREPARE_FROZEN:
04289bb9 1439 perf_counter_init_cpu(cpu);
1440 break;
1441
1442 case CPU_DOWN_PREPARE:
1443 case CPU_DOWN_PREPARE_FROZEN:
04289bb9 1444 perf_counter_exit_cpu(cpu);
1445 break;
1446
1447 default:
1448 break;
1449 }
1450
1451 return NOTIFY_OK;
1452}
1453
1454static struct notifier_block __cpuinitdata perf_cpu_nb = {
1455 .notifier_call = perf_cpu_notify,
1456};
1457
1458static int __init perf_counter_init(void)
1459{
1460 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
1461 (void *)(long)smp_processor_id());
1462 register_cpu_notifier(&perf_cpu_nb);
1463
1464 return 0;
1465}
1466early_initcall(perf_counter_init);
1467
1468static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
1469{
1470 return sprintf(buf, "%d\n", perf_reserved_percpu);
1471}
1472
1473static ssize_t
1474perf_set_reserve_percpu(struct sysdev_class *class,
1475 const char *buf,
1476 size_t count)
1477{
1478 struct perf_cpu_context *cpuctx;
1479 unsigned long val;
1480 int err, cpu, mpt;
1481
1482 err = strict_strtoul(buf, 10, &val);
1483 if (err)
1484 return err;
1485 if (val > perf_max_counters)
1486 return -EINVAL;
1487
1488 mutex_lock(&perf_resource_mutex);
1489 perf_reserved_percpu = val;
1490 for_each_online_cpu(cpu) {
1491 cpuctx = &per_cpu(perf_cpu_context, cpu);
1492 spin_lock_irq(&cpuctx->ctx.lock);
1493 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
1494 perf_max_counters - perf_reserved_percpu);
1495 cpuctx->max_pertask = mpt;
1496 spin_unlock_irq(&cpuctx->ctx.lock);
1497 }
1498 mutex_unlock(&perf_resource_mutex);
1499
1500 return count;
1501}
1502
1503static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
1504{
1505 return sprintf(buf, "%d\n", perf_overcommit);
1506}
1507
1508static ssize_t
1509perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
1510{
1511 unsigned long val;
1512 int err;
1513
1514 err = strict_strtoul(buf, 10, &val);
1515 if (err)
1516 return err;
1517 if (val > 1)
1518 return -EINVAL;
1519
1520 mutex_lock(&perf_resource_mutex);
1521 perf_overcommit = val;
1522 mutex_unlock(&perf_resource_mutex);
1523
1524 return count;
1525}
1526
1527static SYSDEV_CLASS_ATTR(
1528 reserve_percpu,
1529 0644,
1530 perf_show_reserve_percpu,
1531 perf_set_reserve_percpu
1532 );
1533
1534static SYSDEV_CLASS_ATTR(
1535 overcommit,
1536 0644,
1537 perf_show_overcommit,
1538 perf_set_overcommit
1539 );
1540
1541static struct attribute *perfclass_attrs[] = {
1542 &attr_reserve_percpu.attr,
1543 &attr_overcommit.attr,
1544 NULL
1545};
1546
1547static struct attribute_group perfclass_attr_group = {
1548 .attrs = perfclass_attrs,
1549 .name = "perf_counters",
1550};
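/*
 * Registered against cpu_sysdev_class below, so these knobs are expected
 * to appear as (path inferred from the class and group names):
 *
 *	/sys/devices/system/cpu/perf_counters/reserve_percpu
 *	/sys/devices/system/cpu/perf_counters/overcommit
 *
 * reserve_percpu holds back that many counter slots per CPU from per-task
 * counters (see perf_set_reserve_percpu() above); overcommit is a 0/1
 * policy flag.
 */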
1551
1552static int __init perf_counter_sysfs_init(void)
1553{
1554 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
1555 &perfclass_attr_group);
1556}
1557device_initcall(perf_counter_sysfs_init);