Commit | Line | Data |
---|---|---|
0793a61d TG |
1 | /* |
2 | * Performance counter core code | |
3 | * | |
4 | * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de> | |
5 | * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar | |
6 | * | |
7 | * For licencing details see kernel-base/COPYING | |
8 | */ | |
9 | ||
10 | #include <linux/fs.h> | |
11 | #include <linux/cpu.h> | |
12 | #include <linux/smp.h> | |
13 | #include <linux/poll.h> | |
14 | #include <linux/sysfs.h> | |
15 | #include <linux/ptrace.h> | |
16 | #include <linux/percpu.h> | |
17 | #include <linux/uaccess.h> | |
18 | #include <linux/syscalls.h> | |
19 | #include <linux/anon_inodes.h> | |
20 | #include <linux/perf_counter.h> | |
21 | ||
22 | /* | |
23 | * Each CPU has a list of per CPU counters: | |
24 | */ | |
25 | DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); | |
26 | ||
27 | int perf_max_counters __read_mostly; | |
28 | static int perf_reserved_percpu __read_mostly; | |
29 | static int perf_overcommit __read_mostly = 1; | |
30 | ||
31 | /* | |
32 | * Mutex for (sysadmin-configurable) counter reservations: | |
33 | */ | |
34 | static DEFINE_MUTEX(perf_resource_mutex); | |
35 | ||
36 | /* | |
37 | * Architecture provided APIs - weak aliases: | |
38 | */ | |
39 | ||
40 | int __weak hw_perf_counter_init(struct perf_counter *counter, u32 hw_event_type) | |
41 | { | |
42 | return -EINVAL; | |
43 | } | |
44 | ||
45 | void __weak hw_perf_counter_enable(struct perf_counter *counter) { } | |
46 | void __weak hw_perf_counter_disable(struct perf_counter *counter) { } | |
47 | void __weak hw_perf_counter_read(struct perf_counter *counter) { } | |
48 | void __weak hw_perf_disable_all(void) { } | |
49 | void __weak hw_perf_enable_all(void) { } | |
50 | void __weak hw_perf_counter_setup(void) { } | |
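
/*
 * Editor's note: a minimal sketch (not part of this commit) of how an
 * architecture might provide strong definitions that override the weak
 * stubs above.  The helpers arch_map_event(), arch_start_counter() and
 * arch_stop_counter() are assumptions for illustration only; the real
 * x86 implementation lives in arch/x86/kernel/cpu/perf_counter.c.
 */
#if 0
int hw_perf_counter_init(struct perf_counter *counter, u32 hw_event_type)
{
	s64 config = arch_map_event(hw_event_type);

	if (config < 0)
		return -EINVAL;

	/* Remember the hardware encoding; enable/disable program it later: */
	counter->hw_config = config;
	return 0;
}

void hw_perf_counter_enable(struct perf_counter *counter)
{
	arch_start_counter(counter->hw_config);
}

void hw_perf_counter_disable(struct perf_counter *counter)
{
	arch_stop_counter(counter->hw_config);
}
#endif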
51 | ||
52 | #if BITS_PER_LONG == 64 | |
53 | ||
54 | /* | |
55 | * Read the cached counter value in counter->count, safe against cross | |
56 | * CPU / NMI modifications. 64 bit version - no complications. | |
57 | */ | |
58 | static inline u64 perf_read_counter_safe(struct perf_counter *counter) | |
59 | { | |
60 | return (u64) atomic64_read(&counter->count); | |
61 | } | |
62 | ||
63 | #else | |
64 | ||
65 | /* | |
66 | * Read the cached counter value in counter->count, safe against cross | |
67 | * CPU / NMI modifications. 32 bit version. | |
68 | */ | |
69 | static u64 perf_read_counter_safe(struct perf_counter *counter) | |
70 | { | |
71 | u32 cntl, cnth; | |
72 | ||
73 | local_irq_disable(); | |
74 | do { | |
75 | cnth = atomic_read(&counter->count32[1]); | |
76 | cntl = atomic_read(&counter->count32[0]); | |
77 | } while (cnth != atomic_read(&counter->count32[1])); | |
78 | ||
79 | local_irq_enable(); | |
80 | ||
81 | return cntl | ((u64) cnth) << 32; | |
82 | } | |
83 | ||
84 | #endif | |
85 | ||
86 | /* | |
87 | * Cross CPU call to remove a performance counter | |
88 | * | |
89 | * We disable the counter on the hardware level first. After that we | |
90 | * remove it from the context list. | |
91 | */ | |
92 | static void __perf_remove_from_context(void *info) | |
93 | { | |
94 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | |
95 | struct perf_counter *counter = info; | |
96 | struct perf_counter_context *ctx = counter->ctx; | |
97 | ||
98 | /* | |
99 | * If this is a task context, we need to check whether it is | |
100 | * the current task context of this cpu. If not, it has been | |
101 | * scheduled out before the smp call arrived. | |
102 | */ | |
103 | if (ctx->task && cpuctx->task_ctx != ctx) | |
104 | return; | |
105 | ||
106 | spin_lock(&ctx->lock); | |
107 | ||
108 | if (counter->active) { | |
109 | hw_perf_counter_disable(counter); | |
110 | counter->active = 0; | |
111 | ctx->nr_active--; | |
112 | cpuctx->active_oncpu--; | |
113 | counter->task = NULL; | |
114 | } | |
115 | ctx->nr_counters--; | |
116 | ||
117 | /* | |
118 | * Protect the list operation against NMI by disabling the | |
119 | * counters on a global level. NOP for non NMI based counters. | |
120 | */ | |
121 | hw_perf_disable_all(); | |
122 | list_del_init(&counter->list); | |
123 | hw_perf_enable_all(); | |
124 | ||
125 | if (!ctx->task) { | |
126 | /* | |
127 | * Allow more per task counters with respect to the | |
128 | * reservation: | |
129 | */ | |
130 | cpuctx->max_pertask = | |
131 | min(perf_max_counters - ctx->nr_counters, | |
132 | perf_max_counters - perf_reserved_percpu); | |
133 | } | |
134 | ||
135 | spin_unlock(&ctx->lock); | |
136 | } | |
137 | ||
138 | ||
139 | /* | |
140 | * Remove the counter from a task's (or a CPU's) list of counters. | |
141 | * | |
142 | * Must be called with counter->mutex held. | |
143 | * | |
144 | * CPU counters are removed with an smp call. For task counters we only | |
145 | * call when the task is on a CPU. | |
146 | */ | |
147 | static void perf_remove_from_context(struct perf_counter *counter) | |
148 | { | |
149 | struct perf_counter_context *ctx = counter->ctx; | |
150 | struct task_struct *task = ctx->task; | |
151 | ||
152 | if (!task) { | |
153 | /* | |
154 | * Per cpu counters are removed via an smp call and | |
155 | * the removal is always successful. | |
156 | */ | |
157 | smp_call_function_single(counter->cpu, | |
158 | __perf_remove_from_context, | |
159 | counter, 1); | |
160 | return; | |
161 | } | |
162 | ||
163 | retry: | |
164 | task_oncpu_function_call(task, __perf_remove_from_context, | |
165 | counter); | |
166 | ||
167 | spin_lock_irq(&ctx->lock); | |
168 | /* | |
169 | * If the context is active we need to retry the smp call. | |
170 | */ | |
171 | if (ctx->nr_active && !list_empty(&counter->list)) { | |
172 | spin_unlock_irq(&ctx->lock); | |
173 | goto retry; | |
174 | } | |
175 | ||
176 | /* | |
177 | * The lock prevents this context from being scheduled in, so we | |
178 | * can remove the counter safely here in case the call above did | |
179 | * not succeed. | |
180 | */ | |
181 | if (!list_empty(&counter->list)) { | |
182 | ctx->nr_counters--; | |
183 | list_del_init(&counter->list); | |
184 | counter->task = NULL; | |
185 | } | |
186 | spin_unlock_irq(&ctx->lock); | |
187 | } | |
188 | ||
189 | /* | |
190 | * Cross CPU call to install and enable a performance counter | |
191 | */ | |
192 | static void __perf_install_in_context(void *info) | |
193 | { | |
194 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | |
195 | struct perf_counter *counter = info; | |
196 | struct perf_counter_context *ctx = counter->ctx; | |
197 | int cpu = smp_processor_id(); | |
198 | ||
199 | /* | |
200 | * If this is a task context, we need to check whether it is | |
201 | * the current task context of this cpu. If not, it has been | |
202 | * scheduled out before the smp call arrived. | |
203 | */ | |
204 | if (ctx->task && cpuctx->task_ctx != ctx) | |
205 | return; | |
206 | ||
207 | spin_lock(&ctx->lock); | |
208 | ||
209 | /* | |
210 | * Protect the list operation against NMI by disabling the | |
211 | * counters on a global level. NOP for non NMI based counters. | |
212 | */ | |
213 | hw_perf_disable_all(); | |
214 | list_add_tail(&counter->list, &ctx->counters); | |
215 | hw_perf_enable_all(); | |
216 | ||
217 | ctx->nr_counters++; | |
218 | ||
219 | if (cpuctx->active_oncpu < perf_max_counters) { | |
220 | hw_perf_counter_enable(counter); | |
221 | counter->active = 1; | |
222 | counter->oncpu = cpu; | |
223 | ctx->nr_active++; | |
224 | cpuctx->active_oncpu++; | |
225 | } | |
226 | ||
227 | if (!ctx->task && cpuctx->max_pertask) | |
228 | cpuctx->max_pertask--; | |
229 | ||
230 | spin_unlock(&ctx->lock); | |
231 | } | |
232 | ||
233 | /* | |
234 | * Attach a performance counter to a context | |
235 | * | |
236 | * First we add the counter to the list with the hardware enable bit | |
237 | * in counter->hw_config cleared. | |
238 | * | |
239 | * If the counter is attached to a task which is on a CPU we use an smp | |
240 | * call to enable it in the task context. The task might have been | |
241 | * scheduled away, but we check this in the smp call again. | |
242 | */ | |
243 | static void | |
244 | perf_install_in_context(struct perf_counter_context *ctx, | |
245 | struct perf_counter *counter, | |
246 | int cpu) | |
247 | { | |
248 | struct task_struct *task = ctx->task; | |
249 | ||
250 | counter->ctx = ctx; | |
251 | if (!task) { | |
252 | /* | |
253 | * Per cpu counters are installed via an smp call and | |
254 | * the install is always successful. | |
255 | */ | |
256 | smp_call_function_single(cpu, __perf_install_in_context, | |
257 | counter, 1); | |
258 | return; | |
259 | } | |
260 | ||
261 | counter->task = task; | |
262 | retry: | |
263 | task_oncpu_function_call(task, __perf_install_in_context, | |
264 | counter); | |
265 | ||
266 | spin_lock_irq(&ctx->lock); | |
267 | /* | |
268 | * If the context is active and the counter has not been added | |
269 | * we need to retry the smp call. | |
270 | */ | |
271 | if (ctx->nr_active && list_empty(&counter->list)) { | |
272 | spin_unlock_irq(&ctx->lock); | |
273 | goto retry; | |
274 | } | |
275 | ||
276 | /* | |
277 | * The lock prevents this context from being scheduled in, so we | |
278 | * can add the counter safely here in case the call above did | |
279 | * not succeed. | |
280 | */ | |
281 | if (list_empty(&counter->list)) { | |
282 | list_add_tail(&counter->list, &ctx->counters); | |
283 | ctx->nr_counters++; | |
284 | } | |
285 | spin_unlock_irq(&ctx->lock); | |
286 | } | |
287 | ||
288 | /* | |
289 | * Called from scheduler to remove the counters of the current task, | |
290 | * with interrupts disabled. | |
291 | * | |
292 | * We stop each counter and update the counter value in counter->count. | |
293 | * | |
294 | * This does not protect us against NMI, but hw_perf_counter_disable() | |
295 | * sets the disabled bit in the control field of counter _before_ | |
296 | * accessing the counter control register. If an NMI hits, then it will | |
297 | * not restart the counter. | |
298 | */ | |
299 | void perf_counter_task_sched_out(struct task_struct *task, int cpu) | |
300 | { | |
301 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | |
302 | struct perf_counter_context *ctx = &task->perf_counter_ctx; | |
303 | struct perf_counter *counter; | |
304 | ||
305 | if (likely(!cpuctx->task_ctx)) | |
306 | return; | |
307 | ||
308 | spin_lock(&ctx->lock); | |
309 | list_for_each_entry(counter, &ctx->counters, list) { | |
310 | if (!ctx->nr_active) | |
311 | break; | |
312 | if (counter->active) { | |
313 | hw_perf_counter_disable(counter); | |
314 | counter->active = 0; | |
315 | counter->oncpu = -1; | |
316 | ctx->nr_active--; | |
317 | cpuctx->active_oncpu--; | |
318 | } | |
319 | } | |
320 | spin_unlock(&ctx->lock); | |
321 | cpuctx->task_ctx = NULL; | |
322 | } | |
323 | ||
324 | /* | |
325 | * Called from scheduler to add the counters of the current task | |
326 | * with interrupts disabled. | |
327 | * | |
328 | * We restore the counter value and then enable it. | |
329 | * | |
330 | * This does not protect us against NMI, but hw_perf_counter_enable() | |
331 | * sets the enabled bit in the control field of counter _before_ | |
332 | * accessing the counter control register. If an NMI hits, then it will | |
333 | * keep the counter running. | |
334 | */ | |
335 | void perf_counter_task_sched_in(struct task_struct *task, int cpu) | |
336 | { | |
337 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | |
338 | struct perf_counter_context *ctx = &task->perf_counter_ctx; | |
339 | struct perf_counter *counter; | |
340 | ||
341 | if (likely(!ctx->nr_counters)) | |
342 | return; | |
343 | ||
344 | spin_lock(&ctx->lock); | |
345 | list_for_each_entry(counter, &ctx->counters, list) { | |
346 | if (ctx->nr_active == cpuctx->max_pertask) | |
347 | break; | |
348 | if (counter->cpu != -1 && counter->cpu != cpu) | |
349 | continue; | |
350 | ||
351 | hw_perf_counter_enable(counter); | |
352 | counter->active = 1; | |
353 | counter->oncpu = cpu; | |
354 | ctx->nr_active++; | |
355 | cpuctx->active_oncpu++; | |
356 | } | |
357 | spin_unlock(&ctx->lock); | |
358 | cpuctx->task_ctx = ctx; | |
359 | } | |
360 | ||
361 | void perf_counter_task_tick(struct task_struct *curr, int cpu) | |
362 | { | |
363 | struct perf_counter_context *ctx = &curr->perf_counter_ctx; | |
364 | struct perf_counter *counter; | |
365 | ||
366 | if (likely(!ctx->nr_counters)) | |
367 | return; | |
368 | ||
369 | perf_counter_task_sched_out(curr, cpu); | |
370 | ||
371 | spin_lock(&ctx->lock); | |
372 | ||
373 | /* | |
374 | * Rotate the first entry last: | |
375 | */ | |
376 | hw_perf_disable_all(); | |
377 | list_for_each_entry(counter, &ctx->counters, list) { | |
378 | list_del(&counter->list); | |
379 | list_add_tail(&counter->list, &ctx->counters); | |
380 | break; | |
381 | } | |
382 | hw_perf_enable_all(); | |
383 | ||
384 | spin_unlock(&ctx->lock); | |
385 | ||
386 | perf_counter_task_sched_in(curr, cpu); | |
387 | } | |
388 | ||
389 | /* | |
390 | * Initialize the perf_counter context in task_struct | |
391 | */ | |
392 | void perf_counter_init_task(struct task_struct *task) | |
393 | { | |
394 | struct perf_counter_context *ctx = &task->perf_counter_ctx; | |
395 | ||
396 | spin_lock_init(&ctx->lock); | |
397 | INIT_LIST_HEAD(&ctx->counters); | |
398 | ctx->nr_counters = 0; | |
399 | ctx->task = task; | |
400 | } | |
401 | ||
402 | /* | |
403 | * Cross CPU call to read the hardware counter | |
404 | */ | |
405 | static void __hw_perf_counter_read(void *info) | |
406 | { | |
407 | hw_perf_counter_read(info); | |
408 | } | |
409 | ||
410 | static u64 perf_read_counter(struct perf_counter *counter) | |
411 | { | |
412 | /* | |
413 | * If counter is enabled and currently active on a CPU, update the | |
414 | * value in the counter structure: | |
415 | */ | |
416 | if (counter->active) { | |
417 | smp_call_function_single(counter->oncpu, | |
418 | __hw_perf_counter_read, counter, 1); | |
419 | } | |
420 | ||
421 | return perf_read_counter_safe(counter); | |
422 | } | |
423 | ||
424 | /* | |
425 | * Cross CPU call to switch performance data pointers | |
426 | */ | |
427 | static void __perf_switch_irq_data(void *info) | |
428 | { | |
429 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | |
430 | struct perf_counter *counter = info; | |
431 | struct perf_counter_context *ctx = counter->ctx; | |
432 | struct perf_data *oldirqdata = counter->irqdata; | |
433 | ||
434 | /* | |
435 | * If this is a task context, we need to check whether it is | |
436 | * the current task context of this cpu. If not, it has been | |
437 | * scheduled out before the smp call arrived. | |
438 | */ | |
439 | if (ctx->task) { | |
440 | if (cpuctx->task_ctx != ctx) | |
441 | return; | |
442 | spin_lock(&ctx->lock); | |
443 | } | |
444 | ||
445 | /* Change the pointer in an NMI-safe way: */ | |
446 | atomic_long_set((atomic_long_t *)&counter->irqdata, | |
447 | (unsigned long) counter->usrdata); | |
448 | counter->usrdata = oldirqdata; | |
449 | ||
450 | if (ctx->task) | |
451 | spin_unlock(&ctx->lock); | |
452 | } | |
453 | ||
454 | static struct perf_data *perf_switch_irq_data(struct perf_counter *counter) | |
455 | { | |
456 | struct perf_counter_context *ctx = counter->ctx; | |
457 | struct perf_data *oldirqdata = counter->irqdata; | |
458 | struct task_struct *task = ctx->task; | |
459 | ||
460 | if (!task) { | |
461 | smp_call_function_single(counter->cpu, | |
462 | __perf_switch_irq_data, | |
463 | counter, 1); | |
464 | return counter->usrdata; | |
465 | } | |
466 | ||
467 | retry: | |
468 | spin_lock_irq(&ctx->lock); | |
469 | if (!counter->active) { | |
470 | counter->irqdata = counter->usrdata; | |
471 | counter->usrdata = oldirqdata; | |
472 | spin_unlock_irq(&ctx->lock); | |
473 | return oldirqdata; | |
474 | } | |
475 | spin_unlock_irq(&ctx->lock); | |
476 | task_oncpu_function_call(task, __perf_switch_irq_data, counter); | |
477 | /* Might have failed, because task was scheduled out */ | |
478 | if (counter->irqdata == oldirqdata) | |
479 | goto retry; | |
480 | ||
481 | return counter->usrdata; | |
482 | } | |
483 | ||
484 | static void put_context(struct perf_counter_context *ctx) | |
485 | { | |
486 | if (ctx->task) | |
487 | put_task_struct(ctx->task); | |
488 | } | |
489 | ||
490 | static struct perf_counter_context *find_get_context(pid_t pid, int cpu) | |
491 | { | |
492 | struct perf_cpu_context *cpuctx; | |
493 | struct perf_counter_context *ctx; | |
494 | struct task_struct *task; | |
495 | ||
496 | /* | |
497 | * If cpu is not a wildcard then this is a percpu counter: | |
498 | */ | |
499 | if (cpu != -1) { | |
500 | /* Must be root to operate on a CPU counter: */ | |
501 | if (!capable(CAP_SYS_ADMIN)) | |
502 | return ERR_PTR(-EACCES); | |
503 | ||
504 | if (cpu < 0 || cpu >= num_possible_cpus()) | |
505 | return ERR_PTR(-EINVAL); | |
506 | ||
507 | /* | |
508 | * We could be clever and allow to attach a counter to an | |
509 | * offline CPU and activate it when the CPU comes up, but | |
510 | * that's for later. | |
511 | */ | |
512 | if (!cpu_isset(cpu, cpu_online_map)) | |
513 | return ERR_PTR(-ENODEV); | |
514 | ||
515 | cpuctx = &per_cpu(perf_cpu_context, cpu); | |
516 | ctx = &cpuctx->ctx; | |
517 | ||
518 | WARN_ON_ONCE(ctx->task); | |
519 | return ctx; | |
520 | } | |
521 | ||
522 | rcu_read_lock(); | |
523 | if (!pid) | |
524 | task = current; | |
525 | else | |
526 | task = find_task_by_vpid(pid); | |
527 | if (task) | |
528 | get_task_struct(task); | |
529 | rcu_read_unlock(); | |
530 | ||
531 | if (!task) | |
532 | return ERR_PTR(-ESRCH); | |
533 | ||
534 | ctx = &task->perf_counter_ctx; | |
535 | ctx->task = task; | |
536 | ||
537 | /* Reuse ptrace permission checks for now. */ | |
538 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) { | |
539 | put_context(ctx); | |
540 | return ERR_PTR(-EACCES); | |
541 | } | |
542 | ||
543 | return ctx; | |
544 | } | |
545 | ||
546 | /* | |
547 | * Called when the last reference to the file is gone. | |
548 | */ | |
549 | static int perf_release(struct inode *inode, struct file *file) | |
550 | { | |
551 | struct perf_counter *counter = file->private_data; | |
552 | struct perf_counter_context *ctx = counter->ctx; | |
553 | ||
554 | file->private_data = NULL; | |
555 | ||
556 | mutex_lock(&counter->mutex); | |
557 | ||
558 | perf_remove_from_context(counter); | |
559 | put_context(ctx); | |
560 | ||
561 | mutex_unlock(&counter->mutex); | |
562 | ||
563 | kfree(counter); | |
564 | ||
565 | return 0; | |
566 | } | |
567 | ||
568 | /* | |
569 | * Read the performance counter - simple non blocking version for now | |
570 | */ | |
571 | static ssize_t | |
572 | perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) | |
573 | { | |
574 | u64 cntval; | |
575 | ||
576 | if (count != sizeof(cntval)) | |
577 | return -EINVAL; | |
578 | ||
579 | mutex_lock(&counter->mutex); | |
580 | cntval = perf_read_counter(counter); | |
581 | mutex_unlock(&counter->mutex); | |
582 | ||
583 | return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval); | |
584 | } | |
585 | ||
586 | static ssize_t | |
587 | perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count) | |
588 | { | |
589 | if (!usrdata->len) | |
590 | return 0; | |
591 | ||
592 | count = min(count, (size_t)usrdata->len); | |
593 | if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count)) | |
594 | return -EFAULT; | |
595 | ||
596 | /* Adjust the counters */ | |
597 | usrdata->len -= count; | |
598 | if (!usrdata->len) | |
599 | usrdata->rd_idx = 0; | |
600 | else | |
601 | usrdata->rd_idx += count; | |
602 | ||
603 | return count; | |
604 | } | |
605 | ||
606 | static ssize_t | |
607 | perf_read_irq_data(struct perf_counter *counter, | |
608 | char __user *buf, | |
609 | size_t count, | |
610 | int nonblocking) | |
611 | { | |
612 | struct perf_data *irqdata, *usrdata; | |
613 | DECLARE_WAITQUEUE(wait, current); | |
614 | ssize_t res; | |
615 | ||
616 | irqdata = counter->irqdata; | |
617 | usrdata = counter->usrdata; | |
618 | ||
619 | if (usrdata->len + irqdata->len >= count) | |
620 | goto read_pending; | |
621 | ||
622 | if (nonblocking) | |
623 | return -EAGAIN; | |
624 | ||
625 | spin_lock_irq(&counter->waitq.lock); | |
626 | __add_wait_queue(&counter->waitq, &wait); | |
627 | for (;;) { | |
628 | set_current_state(TASK_INTERRUPTIBLE); | |
629 | if (usrdata->len + irqdata->len >= count) | |
630 | break; | |
631 | ||
632 | if (signal_pending(current)) | |
633 | break; | |
634 | ||
635 | spin_unlock_irq(&counter->waitq.lock); | |
636 | schedule(); | |
637 | spin_lock_irq(&counter->waitq.lock); | |
638 | } | |
639 | __remove_wait_queue(&counter->waitq, &wait); | |
640 | __set_current_state(TASK_RUNNING); | |
641 | spin_unlock_irq(&counter->waitq.lock); | |
642 | ||
643 | if (usrdata->len + irqdata->len < count) | |
644 | return -ERESTARTSYS; | |
645 | read_pending: | |
646 | mutex_lock(&counter->mutex); | |
647 | ||
648 | /* Drain pending data first: */ | |
649 | res = perf_copy_usrdata(usrdata, buf, count); | |
650 | if (res < 0 || res == count) | |
651 | goto out; | |
652 | ||
653 | /* Switch irq buffer: */ | |
654 | usrdata = perf_switch_irq_data(counter); | |
655 | if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) { | |
656 | if (!res) | |
657 | res = -EFAULT; | |
658 | } else { | |
659 | res = count; | |
660 | } | |
661 | out: | |
662 | mutex_unlock(&counter->mutex); | |
663 | ||
664 | return res; | |
665 | } | |
666 | ||
667 | static ssize_t | |
668 | perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |
669 | { | |
670 | struct perf_counter *counter = file->private_data; | |
671 | ||
672 | switch (counter->record_type) { | |
673 | case PERF_RECORD_SIMPLE: | |
674 | return perf_read_hw(counter, buf, count); | |
675 | ||
676 | case PERF_RECORD_IRQ: | |
677 | case PERF_RECORD_GROUP: | |
678 | return perf_read_irq_data(counter, buf, count, | |
679 | file->f_flags & O_NONBLOCK); | |
680 | } | |
681 | return -EINVAL; | |
682 | } | |
683 | ||
684 | static unsigned int perf_poll(struct file *file, poll_table *wait) | |
685 | { | |
686 | struct perf_counter *counter = file->private_data; | |
687 | unsigned int events = 0; | |
688 | unsigned long flags; | |
689 | ||
690 | poll_wait(file, &counter->waitq, wait); | |
691 | ||
692 | spin_lock_irqsave(&counter->waitq.lock, flags); | |
693 | if (counter->usrdata->len || counter->irqdata->len) | |
694 | events |= POLLIN; | |
695 | spin_unlock_irqrestore(&counter->waitq.lock, flags); | |
696 | ||
697 | return events; | |
698 | } | |
699 | ||
700 | static const struct file_operations perf_fops = { | |
701 | .release = perf_release, | |
702 | .read = perf_read, | |
703 | .poll = perf_poll, | |
704 | }; | |
705 | ||
706 | /* | |
707 | * Allocate and initialize a counter structure | |
708 | */ | |
709 | static struct perf_counter * | |
710 | perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type) | |
711 | { | |
712 | struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL); | |
713 | ||
714 | if (!counter) | |
715 | return NULL; | |
716 | ||
717 | mutex_init(&counter->mutex); | |
718 | INIT_LIST_HEAD(&counter->list); | |
719 | init_waitqueue_head(&counter->waitq); | |
720 | ||
721 | counter->irqdata = &counter->data[0]; | |
722 | counter->usrdata = &counter->data[1]; | |
723 | counter->cpu = cpu; | |
724 | counter->record_type = record_type; | |
725 | counter->__irq_period = hw_event_period; | |
726 | counter->wakeup_pending = 0; | |
727 | ||
728 | return counter; | |
729 | } | |
730 | ||
731 | /** | |
732 | * sys_perf_counter_open - open a performance counter and associate it with a task or CPU | |
733 | * @hw_event_type: event type for monitoring/sampling... | |
734 | * @pid: target pid | |
735 | */ | |
736 | asmlinkage int | |
737 | sys_perf_counter_open(u32 hw_event_type, | |
738 | u32 hw_event_period, | |
739 | u32 record_type, | |
740 | pid_t pid, | |
741 | int cpu) | |
742 | { | |
743 | struct perf_counter_context *ctx; | |
744 | struct perf_counter *counter; | |
745 | int ret; | |
746 | ||
747 | ctx = find_get_context(pid, cpu); | |
748 | if (IS_ERR(ctx)) | |
749 | return PTR_ERR(ctx); | |
750 | ||
751 | ret = -ENOMEM; | |
752 | counter = perf_counter_alloc(hw_event_period, cpu, record_type); | |
753 | if (!counter) | |
754 | goto err_put_context; | |
755 | ||
756 | ret = hw_perf_counter_init(counter, hw_event_type); | |
757 | if (ret) | |
758 | goto err_free_put_context; | |
759 | ||
760 | perf_install_in_context(ctx, counter, cpu); | |
761 | ||
762 | ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); | |
763 | if (ret < 0) | |
764 | goto err_remove_free_put_context; | |
765 | ||
766 | return ret; | |
767 | ||
768 | err_remove_free_put_context: | |
769 | mutex_lock(&counter->mutex); | |
770 | perf_remove_from_context(counter); | |
771 | mutex_unlock(&counter->mutex); | |
772 | ||
773 | err_free_put_context: | |
774 | kfree(counter); | |
775 | ||
776 | err_put_context: | |
777 | put_context(ctx); | |
778 | ||
779 | return ret; | |
780 | } | |
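
/*
 * Editor's note: an illustrative userspace sketch, not part of this file.
 * It shows how the syscall above might be driven for a plain counting
 * counter, which returns one u64 per read().  The syscall number
 * __NR_perf_counter_open and the numeric values used for hw_event_type (0)
 * and record_type (0, assumed to be PERF_RECORD_SIMPLE) are assumptions
 * for illustration; the real values come from the architecture's syscall
 * table and <linux/perf_counter.h>.
 */
#if 0
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	uint64_t count;
	int fd;

	/* hw_event_type 0, no irq period, simple records, current task
	 * (pid 0), any CPU (-1): */
	fd = syscall(__NR_perf_counter_open, 0, 0, 0, 0, -1);
	if (fd < 0)
		return 1;

	/* ... run the code to be measured ... */

	/* Simple counters hand back a single u64 value per read(): */
	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("events: %llu\n", (unsigned long long)count);

	close(fd);
	return 0;
}
#endif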
781 | ||
782 | static void __cpuinit perf_init_cpu(int cpu) | |
783 | { | |
784 | struct perf_cpu_context *ctx; | |
785 | ||
786 | ctx = &per_cpu(perf_cpu_context, cpu); | |
787 | spin_lock_init(&ctx->ctx.lock); | |
788 | INIT_LIST_HEAD(&ctx->ctx.counters); | |
789 | ||
790 | mutex_lock(&perf_resource_mutex); | |
791 | ctx->max_pertask = perf_max_counters - perf_reserved_percpu; | |
792 | mutex_unlock(&perf_resource_mutex); | |
793 | hw_perf_counter_setup(); | |
794 | } | |
795 | ||
796 | #ifdef CONFIG_HOTPLUG_CPU | |
797 | static void __perf_exit_cpu(void *info) | |
798 | { | |
799 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | |
800 | struct perf_counter_context *ctx = &cpuctx->ctx; | |
801 | struct perf_counter *counter, *tmp; | |
802 | ||
803 | list_for_each_entry_safe(counter, tmp, &ctx->counters, list) | |
804 | __perf_remove_from_context(counter); | |
805 | ||
806 | } | |
807 | static void perf_exit_cpu(int cpu) | |
808 | { | |
809 | smp_call_function_single(cpu, __perf_exit_cpu, NULL, 1); | |
810 | } | |
811 | #else | |
812 | static inline void perf_exit_cpu(int cpu) { } | |
813 | #endif | |
814 | ||
815 | static int __cpuinit | |
816 | perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | |
817 | { | |
818 | unsigned int cpu = (long)hcpu; | |
819 | ||
820 | switch (action) { | |
821 | ||
822 | case CPU_UP_PREPARE: | |
823 | case CPU_UP_PREPARE_FROZEN: | |
824 | perf_init_cpu(cpu); | |
825 | break; | |
826 | ||
827 | case CPU_DOWN_PREPARE: | |
828 | case CPU_DOWN_PREPARE_FROZEN: | |
829 | perf_exit_cpu(cpu); | |
830 | break; | |
831 | ||
832 | default: | |
833 | break; | |
834 | } | |
835 | ||
836 | return NOTIFY_OK; | |
837 | } | |
838 | ||
839 | static struct notifier_block __cpuinitdata perf_cpu_nb = { | |
840 | .notifier_call = perf_cpu_notify, | |
841 | }; | |
842 | ||
843 | static int __init perf_counter_init(void) | |
844 | { | |
845 | perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, | |
846 | (void *)(long)smp_processor_id()); | |
847 | register_cpu_notifier(&perf_cpu_nb); | |
848 | ||
849 | return 0; | |
850 | } | |
851 | early_initcall(perf_counter_init); | |
852 | ||
853 | static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) | |
854 | { | |
855 | return sprintf(buf, "%d\n", perf_reserved_percpu); | |
856 | } | |
857 | ||
858 | static ssize_t | |
859 | perf_set_reserve_percpu(struct sysdev_class *class, | |
860 | const char *buf, | |
861 | size_t count) | |
862 | { | |
863 | struct perf_cpu_context *cpuctx; | |
864 | unsigned long val; | |
865 | int err, cpu, mpt; | |
866 | ||
867 | err = strict_strtoul(buf, 10, &val); | |
868 | if (err) | |
869 | return err; | |
870 | if (val > perf_max_counters) | |
871 | return -EINVAL; | |
872 | ||
873 | mutex_lock(&perf_resource_mutex); | |
874 | perf_reserved_percpu = val; | |
875 | for_each_online_cpu(cpu) { | |
876 | cpuctx = &per_cpu(perf_cpu_context, cpu); | |
877 | spin_lock_irq(&cpuctx->ctx.lock); | |
878 | mpt = min(perf_max_counters - cpuctx->ctx.nr_counters, | |
879 | perf_max_counters - perf_reserved_percpu); | |
880 | cpuctx->max_pertask = mpt; | |
881 | spin_unlock_irq(&cpuctx->ctx.lock); | |
882 | } | |
883 | mutex_unlock(&perf_resource_mutex); | |
884 | ||
885 | return count; | |
886 | } | |
887 | ||
888 | static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) | |
889 | { | |
890 | return sprintf(buf, "%d\n", perf_overcommit); | |
891 | } | |
892 | ||
893 | static ssize_t | |
894 | perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) | |
895 | { | |
896 | unsigned long val; | |
897 | int err; | |
898 | ||
899 | err = strict_strtoul(buf, 10, &val); | |
900 | if (err) | |
901 | return err; | |
902 | if (val > 1) | |
903 | return -EINVAL; | |
904 | ||
905 | mutex_lock(&perf_resource_mutex); | |
906 | perf_overcommit = val; | |
907 | mutex_unlock(&perf_resource_mutex); | |
908 | ||
909 | return count; | |
910 | } | |
911 | ||
912 | static SYSDEV_CLASS_ATTR( | |
913 | reserve_percpu, | |
914 | 0644, | |
915 | perf_show_reserve_percpu, | |
916 | perf_set_reserve_percpu | |
917 | ); | |
918 | ||
919 | static SYSDEV_CLASS_ATTR( | |
920 | overcommit, | |
921 | 0644, | |
922 | perf_show_overcommit, | |
923 | perf_set_overcommit | |
924 | ); | |
925 | ||
926 | static struct attribute *perfclass_attrs[] = { | |
927 | &attr_reserve_percpu.attr, | |
928 | &attr_overcommit.attr, | |
929 | NULL | |
930 | }; | |
931 | ||
932 | static struct attribute_group perfclass_attr_group = { | |
933 | .attrs = perfclass_attrs, | |
934 | .name = "perf_counters", | |
935 | }; | |
936 | ||
937 | static int __init perf_counter_sysfs_init(void) | |
938 | { | |
939 | return sysfs_create_group(&cpu_sysdev_class.kset.kobj, | |
940 | &perfclass_attr_group); | |
941 | } | |
942 | device_initcall(perf_counter_sysfs_init); | |
943 |