Commit | Line | Data |
---|---|---|
0793a61d TG |
1 | /* |
2 | * Performance counter core code | |
3 | * | |
4 | * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de> | |
5 | * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar | |
6 | * | |
7 | * For licensing details see kernel-base/COPYING | |
8 | */ | |
9 | ||
10 | #include <linux/fs.h> | |
11 | #include <linux/cpu.h> | |
12 | #include <linux/smp.h> | |
04289bb9 | 13 | #include <linux/file.h> |
0793a61d TG |
14 | #include <linux/poll.h> |
15 | #include <linux/sysfs.h> | |
16 | #include <linux/ptrace.h> | |
17 | #include <linux/percpu.h> | |
18 | #include <linux/uaccess.h> | |
19 | #include <linux/syscalls.h> | |
20 | #include <linux/anon_inodes.h> | |
21 | #include <linux/perf_counter.h> | |
22 | ||
23 | /* | |
24 | * Each CPU has a list of per-CPU counters: | |
25 | */ | |
26 | DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); | |
27 | ||
088e2852 | 28 | int perf_max_counters __read_mostly = 1; |
0793a61d TG |
29 | static int perf_reserved_percpu __read_mostly; |
30 | static int perf_overcommit __read_mostly = 1; | |
31 | ||
32 | /* | |
33 | * Mutex for (sysadmin-configurable) counter reservations: | |
34 | */ | |
35 | static DEFINE_MUTEX(perf_resource_mutex); | |
36 | ||
37 | /* | |
38 | * Architecture provided APIs - weak aliases: | |
39 | */ | |
5c92d124 | 40 | extern __weak const struct hw_perf_counter_ops * |
621a01ea | 41 | hw_perf_counter_init(struct perf_counter *counter) |
0793a61d | 42 | { |
621a01ea | 43 | return ERR_PTR(-EINVAL); |
0793a61d TG |
44 | } |
45 | ||
01b2838c | 46 | u64 __weak hw_perf_save_disable(void) { return 0; } |
ee06094f | 47 | void __weak hw_perf_restore(u64 ctrl) { } |
5c92d124 | 48 | void __weak hw_perf_counter_setup(void) { } |
0793a61d | 49 | |
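
The weak stubs above let the generic code build and run even when an architecture provides no PMU driver. A minimal sketch of how an arch backend might override them at link time; the `my_arch_*` names are illustrative assumptions, not code from this tree:

```c
/*
 * Sketch of an architecture backend overriding the weak stubs above.
 * The my_arch_* helpers are hypothetical; a real driver would program
 * the PMU registers in them.
 */
#include <linux/perf_counter.h>

static void my_arch_enable(struct perf_counter *counter)	{ /* start PMC */ }
static void my_arch_disable(struct perf_counter *counter)	{ /* stop PMC */ }
static void my_arch_read(struct perf_counter *counter)		{ /* sync count */ }

static const struct hw_perf_counter_ops my_arch_ops = {
	.enable		= my_arch_enable,
	.disable	= my_arch_disable,
	.read		= my_arch_read,
};

/* A strong definition; the linker prefers it over the __weak stub: */
const struct hw_perf_counter_ops *
hw_perf_counter_init(struct perf_counter *counter)
{
	return &my_arch_ops;
}
```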
04289bb9 IM |
50 | static void |
51 | list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) | |
52 | { | |
53 | struct perf_counter *group_leader = counter->group_leader; | |
54 | ||
55 | /* | |
56 | * Depending on whether it is a standalone or sibling counter, | |
57 | * add it straight to the context's counter list, or to the group | |
58 | * leader's sibling list: | |
59 | */ | |
60 | if (counter->group_leader == counter) | |
61 | list_add_tail(&counter->list_entry, &ctx->counter_list); | |
62 | else | |
63 | list_add_tail(&counter->list_entry, &group_leader->sibling_list); | |
64 | } | |
65 | ||
66 | static void | |
67 | list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) | |
68 | { | |
69 | struct perf_counter *sibling, *tmp; | |
70 | ||
71 | list_del_init(&counter->list_entry); | |
72 | ||
04289bb9 IM |
73 | /* |
74 | * If this was a group counter with sibling counters then | |
75 | * upgrade the siblings to singleton counters by adding them | |
76 | * to the context list directly: | |
77 | */ | |
78 | list_for_each_entry_safe(sibling, tmp, | |
79 | &counter->sibling_list, list_entry) { | |
80 | ||
81 | list_del_init(&sibling->list_entry); | |
82 | list_add_tail(&sibling->list_entry, &ctx->counter_list); | |
04289bb9 IM |
83 | sibling->group_leader = sibling; |
84 | } | |
85 | } | |
86 | ||
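
The two helpers above give each context a two-level topology: group leaders sit on ctx->counter_list, and their siblings hang off the leader's sibling_list. A hedged sketch of a full walk over that topology (`visit_all_counters` is a hypothetical helper, mirroring the iteration the scheduling code below performs):

```c
/*
 * Hypothetical helper (not in this file): visit every counter in a
 * context, leaders first, then each leader's siblings - the same
 * two-level iteration used by group_sched_out()/group_sched_in().
 */
static void visit_all_counters(struct perf_counter_context *ctx,
			       void (*visit)(struct perf_counter *counter))
{
	struct perf_counter *leader, *sibling;

	list_for_each_entry(leader, &ctx->counter_list, list_entry) {
		visit(leader);
		list_for_each_entry(sibling, &leader->sibling_list, list_entry)
			visit(sibling);
	}
}
```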
0793a61d TG |
87 | /* |
88 | * Cross CPU call to remove a performance counter | |
89 | * | |
90 | * We disable the counter on the hardware level first. After that we | |
91 | * remove it from the context list. | |
92 | */ | |
04289bb9 | 93 | static void __perf_counter_remove_from_context(void *info) |
0793a61d TG |
94 | { |
95 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | |
96 | struct perf_counter *counter = info; | |
97 | struct perf_counter_context *ctx = counter->ctx; | |
9b51f66d | 98 | unsigned long flags; |
5c92d124 | 99 | u64 perf_flags; |
0793a61d TG |
100 | |
101 | /* | |
102 | * If this is a task context, we need to check whether it is | |
103 | * the current task context of this cpu. If not it has been | |
104 | * scheduled out before the smp call arrived. | |
105 | */ | |
106 | if (ctx->task && cpuctx->task_ctx != ctx) | |
107 | return; | |
108 | ||
9b51f66d | 109 | spin_lock_irqsave(&ctx->lock, flags); |
0793a61d | 110 | |
6a930700 | 111 | if (counter->state == PERF_COUNTER_STATE_ACTIVE) { |
7671581f | 112 | counter->hw_ops->disable(counter); |
6a930700 | 113 | counter->state = PERF_COUNTER_STATE_INACTIVE; |
0793a61d TG |
114 | ctx->nr_active--; |
115 | cpuctx->active_oncpu--; | |
116 | counter->task = NULL; | |
117 | } | |
118 | ctx->nr_counters--; | |
119 | ||
120 | /* | |
121 | * Protect the list operation against NMI by disabling the | |
122 | * counters on a global level. NOP for non-NMI-based counters. | |
123 | */ | |
01b2838c | 124 | perf_flags = hw_perf_save_disable(); |
04289bb9 | 125 | list_del_counter(counter, ctx); |
01b2838c | 126 | hw_perf_restore(perf_flags); |
0793a61d TG |
127 | |
128 | if (!ctx->task) { | |
129 | /* | |
130 | * Allow more per task counters with respect to the | |
131 | * reservation: | |
132 | */ | |
133 | cpuctx->max_pertask = | |
134 | min(perf_max_counters - ctx->nr_counters, | |
135 | perf_max_counters - perf_reserved_percpu); | |
136 | } | |
137 | ||
9b51f66d | 138 | spin_unlock_irqrestore(&ctx->lock, flags); |
0793a61d TG |
139 | } |
140 | ||
141 | ||
142 | /* | |
143 | * Remove the counter from a task's (or a CPU's) list of counters. | |
144 | * | |
145 | * Must be called with counter->mutex held. | |
146 | * | |
147 | * CPU counters are removed with an smp call. For task counters we only | |
148 | * make the call when the task is on a CPU. | |
149 | */ | |
04289bb9 | 150 | static void perf_counter_remove_from_context(struct perf_counter *counter) |
0793a61d TG |
151 | { |
152 | struct perf_counter_context *ctx = counter->ctx; | |
153 | struct task_struct *task = ctx->task; | |
154 | ||
155 | if (!task) { | |
156 | /* | |
157 | * Per cpu counters are removed via an smp call and | |
158 | * the removal is always successful. | |
159 | */ | |
160 | smp_call_function_single(counter->cpu, | |
04289bb9 | 161 | __perf_counter_remove_from_context, |
0793a61d TG |
162 | counter, 1); |
163 | return; | |
164 | } | |
165 | ||
166 | retry: | |
04289bb9 | 167 | task_oncpu_function_call(task, __perf_counter_remove_from_context, |
0793a61d TG |
168 | counter); |
169 | ||
170 | spin_lock_irq(&ctx->lock); | |
171 | /* | |
172 | * If the context is active we need to retry the smp call. | |
173 | */ | |
04289bb9 | 174 | if (ctx->nr_active && !list_empty(&counter->list_entry)) { |
0793a61d TG |
175 | spin_unlock_irq(&ctx->lock); |
176 | goto retry; | |
177 | } | |
178 | ||
179 | /* | |
180 | * The lock prevents this context from being scheduled in, so we | |
04289bb9 | 181 | * can remove the counter safely if the call above did not
0793a61d TG |
182 | * succeed. |
183 | */ | |
04289bb9 | 184 | if (!list_empty(&counter->list_entry)) { |
0793a61d | 185 | ctx->nr_counters--; |
04289bb9 | 186 | list_del_counter(counter, ctx); |
0793a61d TG |
187 | counter->task = NULL; |
188 | } | |
189 | spin_unlock_irq(&ctx->lock); | |
190 | } | |
191 | ||
192 | /* | |
193 | * Cross CPU call to install and enable a performance counter | |
194 | */ | |
195 | static void __perf_install_in_context(void *info) | |
196 | { | |
197 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | |
198 | struct perf_counter *counter = info; | |
199 | struct perf_counter_context *ctx = counter->ctx; | |
200 | int cpu = smp_processor_id(); | |
9b51f66d | 201 | unsigned long flags; |
5c92d124 | 202 | u64 perf_flags; |
0793a61d TG |
203 | |
204 | /* | |
205 | * If this is a task context, we need to check whether it is | |
206 | * the current task context of this cpu. If not it has been | |
207 | * scheduled out before the smp call arrived. | |
208 | */ | |
209 | if (ctx->task && cpuctx->task_ctx != ctx) | |
210 | return; | |
211 | ||
9b51f66d | 212 | spin_lock_irqsave(&ctx->lock, flags); |
0793a61d TG |
213 | |
214 | /* | |
215 | * Protect the list operation against NMI by disabling the | |
216 | * counters on a global level. NOP for non-NMI-based counters. | |
217 | */ | |
01b2838c | 218 | perf_flags = hw_perf_save_disable(); |
04289bb9 | 219 | list_add_counter(counter, ctx); |
01b2838c | 220 | hw_perf_restore(perf_flags); |
0793a61d TG |
221 | |
222 | ctx->nr_counters++; | |
223 | ||
224 | if (cpuctx->active_oncpu < perf_max_counters) { | |
6a930700 | 225 | counter->state = PERF_COUNTER_STATE_ACTIVE; |
0793a61d TG |
226 | counter->oncpu = cpu; |
227 | ctx->nr_active++; | |
228 | cpuctx->active_oncpu++; | |
7671581f | 229 | counter->hw_ops->enable(counter); |
0793a61d TG |
230 | } |
231 | ||
232 | if (!ctx->task && cpuctx->max_pertask) | |
233 | cpuctx->max_pertask--; | |
234 | ||
9b51f66d | 235 | spin_unlock_irqrestore(&ctx->lock, flags); |
0793a61d TG |
236 | } |
237 | ||
238 | /* | |
239 | * Attach a performance counter to a context | |
240 | * | |
241 | * First we add the counter to the list with the hardware enable bit | |
242 | * in counter->hw_config cleared. | |
243 | * | |
244 | * If the counter is attached to a task which is on a CPU we use a smp | |
245 | * call to enable it in the task context. The task might have been | |
246 | * scheduled away, but we check this in the smp call again. | |
247 | */ | |
248 | static void | |
249 | perf_install_in_context(struct perf_counter_context *ctx, | |
250 | struct perf_counter *counter, | |
251 | int cpu) | |
252 | { | |
253 | struct task_struct *task = ctx->task; | |
254 | ||
255 | counter->ctx = ctx; | |
256 | if (!task) { | |
257 | /* | |
258 | * Per cpu counters are installed via an smp call and | |
259 | * the install is always successful. | |
260 | */ | |
261 | smp_call_function_single(cpu, __perf_install_in_context, | |
262 | counter, 1); | |
263 | return; | |
264 | } | |
265 | ||
266 | counter->task = task; | |
267 | retry: | |
268 | task_oncpu_function_call(task, __perf_install_in_context, | |
269 | counter); | |
270 | ||
271 | spin_lock_irq(&ctx->lock); | |
272 | /* | |
0793a61d TG |
273 | * If the context is active we need to retry the smp call. | |
274 | */ | |
04289bb9 | 275 | if (ctx->nr_active && list_empty(&counter->list_entry)) { |
0793a61d TG |
276 | spin_unlock_irq(&ctx->lock); |
277 | goto retry; | |
278 | } | |
279 | ||
280 | /* | |
281 | * The lock prevents this context from being scheduled in, so we | |
282 | * can add the counter safely if the call above did not | |
283 | * succeed. | |
284 | */ | |
04289bb9 IM |
285 | if (list_empty(&counter->list_entry)) { |
286 | list_add_counter(counter, ctx); | |
0793a61d TG |
287 | ctx->nr_counters++; |
288 | } | |
289 | spin_unlock_irq(&ctx->lock); | |
290 | } | |
291 | ||
04289bb9 IM |
292 | static void |
293 | counter_sched_out(struct perf_counter *counter, | |
294 | struct perf_cpu_context *cpuctx, | |
295 | struct perf_counter_context *ctx) | |
296 | { | |
6a930700 | 297 | if (counter->state != PERF_COUNTER_STATE_ACTIVE) |
04289bb9 IM |
298 | return; |
299 | ||
7671581f | 300 | counter->hw_ops->disable(counter); |
6a930700 IM |
301 | counter->state = PERF_COUNTER_STATE_INACTIVE; |
302 | counter->oncpu = -1; | |
04289bb9 IM |
303 | |
304 | cpuctx->active_oncpu--; | |
305 | ctx->nr_active--; | |
306 | } | |
307 | ||
308 | static void | |
309 | group_sched_out(struct perf_counter *group_counter, | |
310 | struct perf_cpu_context *cpuctx, | |
311 | struct perf_counter_context *ctx) | |
312 | { | |
313 | struct perf_counter *counter; | |
314 | ||
315 | counter_sched_out(group_counter, cpuctx, ctx); | |
316 | ||
317 | /* | |
318 | * Schedule out siblings (if any): | |
319 | */ | |
320 | list_for_each_entry(counter, &group_counter->sibling_list, list_entry) | |
321 | counter_sched_out(counter, cpuctx, ctx); | |
322 | } | |
323 | ||
0793a61d TG |
324 | /* |
325 | * Called from scheduler to remove the counters of the current task, | |
326 | * with interrupts disabled. | |
327 | * | |
328 | * We stop each counter and update the counter value in counter->count. | |
329 | * | |
7671581f | 330 | * This does not protect us against NMI, but disable() |
0793a61d TG |
331 | * sets the disabled bit in the control field of counter _before_ |
332 | * accessing the counter control register. If an NMI hits, then it will | |
333 | * not restart the counter. | |
334 | */ | |
335 | void perf_counter_task_sched_out(struct task_struct *task, int cpu) | |
336 | { | |
337 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | |
338 | struct perf_counter_context *ctx = &task->perf_counter_ctx; | |
339 | struct perf_counter *counter; | |
340 | ||
341 | if (likely(!cpuctx->task_ctx)) | |
342 | return; | |
343 | ||
344 | spin_lock(&ctx->lock); | |
04289bb9 IM |
345 | if (ctx->nr_active) { |
346 | list_for_each_entry(counter, &ctx->counter_list, list_entry) | |
347 | group_sched_out(counter, cpuctx, ctx); | |
0793a61d TG |
348 | } |
349 | spin_unlock(&ctx->lock); | |
350 | cpuctx->task_ctx = NULL; | |
351 | } | |
352 | ||
04289bb9 IM |
353 | static void |
354 | counter_sched_in(struct perf_counter *counter, | |
355 | struct perf_cpu_context *cpuctx, | |
356 | struct perf_counter_context *ctx, | |
357 | int cpu) | |
358 | { | |
6a930700 | 359 | if (counter->state == PERF_COUNTER_STATE_OFF) |
1d1c7ddb IM |
360 | return; |
361 | ||
7671581f | 362 | counter->hw_ops->enable(counter); |
6a930700 | 363 | counter->state = PERF_COUNTER_STATE_ACTIVE; |
04289bb9 IM |
364 | counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ |
365 | ||
366 | cpuctx->active_oncpu++; | |
367 | ctx->nr_active++; | |
368 | } | |
369 | ||
7995888f | 370 | static int |
04289bb9 IM |
371 | group_sched_in(struct perf_counter *group_counter, |
372 | struct perf_cpu_context *cpuctx, | |
373 | struct perf_counter_context *ctx, | |
374 | int cpu) | |
375 | { | |
376 | struct perf_counter *counter; | |
7995888f | 377 | int was_group = 0; |
04289bb9 IM |
378 | |
379 | counter_sched_in(group_counter, cpuctx, ctx, cpu); | |
380 | ||
381 | /* | |
382 | * Schedule in siblings as one group (if any): | |
383 | */ | |
7995888f | 384 | list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { |
04289bb9 | 385 | counter_sched_in(counter, cpuctx, ctx, cpu); |
7995888f IM |
386 | was_group = 1; |
387 | } | |
388 | ||
389 | return was_group; | |
04289bb9 IM |
390 | } |
391 | ||
0793a61d TG |
392 | /* |
393 | * Called from scheduler to add the counters of the current task | |
394 | * with interrupts disabled. | |
395 | * | |
396 | * We restore the counter value and then enable it. | |
397 | * | |
7671581f | 398 | * This does not protect us against NMI, but enable() |
0793a61d TG |
399 | * sets the enabled bit in the control field of counter _before_ |
400 | * accessing the counter control register. If an NMI hits, then it will | |
401 | * keep the counter running. | |
402 | */ | |
403 | void perf_counter_task_sched_in(struct task_struct *task, int cpu) | |
404 | { | |
405 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | |
406 | struct perf_counter_context *ctx = &task->perf_counter_ctx; | |
407 | struct perf_counter *counter; | |
408 | ||
409 | if (likely(!ctx->nr_counters)) | |
410 | return; | |
411 | ||
412 | spin_lock(&ctx->lock); | |
04289bb9 | 413 | list_for_each_entry(counter, &ctx->counter_list, list_entry) { |
0793a61d TG |
414 | if (ctx->nr_active == cpuctx->max_pertask) |
415 | break; | |
04289bb9 IM |
416 | |
417 | /* | |
418 | * Honor the 'cpu' scheduling filter constraint | |
419 | * of counters: | |
420 | */ | |
0793a61d TG |
421 | if (counter->cpu != -1 && counter->cpu != cpu) |
422 | continue; | |
423 | ||
7995888f IM |
424 | /* |
425 | * If we scheduled in a group atomically and | |
426 | * exclusively, break out: | |
427 | */ | |
428 | if (group_sched_in(counter, cpuctx, ctx, cpu)) | |
429 | break; | |
0793a61d TG |
430 | } |
431 | spin_unlock(&ctx->lock); | |
04289bb9 | 432 | |
0793a61d TG |
433 | cpuctx->task_ctx = ctx; |
434 | } | |
435 | ||
1d1c7ddb IM |
436 | int perf_counter_task_disable(void) |
437 | { | |
438 | struct task_struct *curr = current; | |
439 | struct perf_counter_context *ctx = &curr->perf_counter_ctx; | |
440 | struct perf_counter *counter; | |
441 | u64 perf_flags; | |
442 | int cpu; | |
443 | ||
444 | if (likely(!ctx->nr_counters)) | |
445 | return 0; | |
446 | ||
447 | local_irq_disable(); | |
448 | cpu = smp_processor_id(); | |
449 | ||
450 | perf_counter_task_sched_out(curr, cpu); | |
451 | ||
452 | spin_lock(&ctx->lock); | |
453 | ||
454 | /* | |
455 | * Disable all the counters: | |
456 | */ | |
457 | perf_flags = hw_perf_save_disable(); | |
458 | ||
9b51f66d | 459 | list_for_each_entry(counter, &ctx->counter_list, list_entry) |
6a930700 | 460 | counter->state = PERF_COUNTER_STATE_OFF; |
9b51f66d | 461 | |
1d1c7ddb IM |
462 | hw_perf_restore(perf_flags); |
463 | ||
464 | spin_unlock(&ctx->lock); | |
465 | ||
466 | local_irq_enable(); | |
467 | ||
468 | return 0; | |
469 | } | |
470 | ||
471 | int perf_counter_task_enable(void) | |
472 | { | |
473 | struct task_struct *curr = current; | |
474 | struct perf_counter_context *ctx = &curr->perf_counter_ctx; | |
475 | struct perf_counter *counter; | |
476 | u64 perf_flags; | |
477 | int cpu; | |
478 | ||
479 | if (likely(!ctx->nr_counters)) | |
480 | return 0; | |
481 | ||
482 | local_irq_disable(); | |
483 | cpu = smp_processor_id(); | |
484 | ||
485 | spin_lock(&ctx->lock); | |
486 | ||
487 | /* | |
488 | * Disable all the counters while we update their state: | |
489 | */ | |
490 | perf_flags = hw_perf_save_disable(); | |
491 | ||
492 | list_for_each_entry(counter, &ctx->counter_list, list_entry) { | |
6a930700 | 493 | if (counter->state != PERF_COUNTER_STATE_OFF) |
1d1c7ddb | 494 | continue; |
6a930700 | 495 | counter->state = PERF_COUNTER_STATE_INACTIVE; |
1d1c7ddb IM |
496 | } |
497 | hw_perf_restore(perf_flags); | |
498 | ||
499 | spin_unlock(&ctx->lock); | |
500 | ||
501 | perf_counter_task_sched_in(curr, cpu); | |
502 | ||
503 | local_irq_enable(); | |
504 | ||
505 | return 0; | |
506 | } | |
507 | ||
0793a61d TG |
508 | void perf_counter_task_tick(struct task_struct *curr, int cpu) |
509 | { | |
510 | struct perf_counter_context *ctx = &curr->perf_counter_ctx; | |
511 | struct perf_counter *counter; | |
5c92d124 | 512 | u64 perf_flags; |
0793a61d TG |
513 | |
514 | if (likely(!ctx->nr_counters)) | |
515 | return; | |
516 | ||
517 | perf_counter_task_sched_out(curr, cpu); | |
518 | ||
519 | spin_lock(&ctx->lock); | |
520 | ||
521 | /* | |
04289bb9 | 522 | * Rotate the first entry last (works just fine for group counters too): |
0793a61d | 523 | */ |
01b2838c | 524 | perf_flags = hw_perf_save_disable(); |
04289bb9 IM |
525 | list_for_each_entry(counter, &ctx->counter_list, list_entry) { |
526 | list_del(&counter->list_entry); | |
527 | list_add_tail(&counter->list_entry, &ctx->counter_list); | |
0793a61d TG |
528 | break; |
529 | } | |
01b2838c | 530 | hw_perf_restore(perf_flags); |
0793a61d TG |
531 | |
532 | spin_unlock(&ctx->lock); | |
533 | ||
534 | perf_counter_task_sched_in(curr, cpu); | |
535 | } | |
536 | ||
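
The rotation loop in perf_counter_task_tick() above moves only the first entry (note the unconditional break), so each tick shifts the round-robin head by one and a different counter group gets first claim on PMU slots. The same effect, expressed with list_move_tail() as a hedged sketch (`rotate_ctx` is a hypothetical name):

```c
/*
 * Equivalent sketch of the rotation above: [A, B, C] becomes [B, C, A]
 * before perf_counter_task_sched_in() runs again.
 */
static inline void rotate_ctx(struct perf_counter_context *ctx)
{
	struct perf_counter *first;

	if (list_empty(&ctx->counter_list))
		return;

	first = list_first_entry(&ctx->counter_list,
				 struct perf_counter, list_entry);
	list_move_tail(&first->list_entry, &ctx->counter_list);
}
```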
0793a61d TG |
537 | /* |
538 | * Cross CPU call to read the hardware counter | |
539 | */ | |
7671581f | 540 | static void __read(void *info) |
0793a61d | 541 | { |
621a01ea IM |
542 | struct perf_counter *counter = info; |
543 | ||
7671581f | 544 | counter->hw_ops->read(counter); |
0793a61d TG |
545 | } |
546 | ||
04289bb9 | 547 | static u64 perf_counter_read(struct perf_counter *counter) |
0793a61d TG |
548 | { |
549 | /* | |
550 | * If counter is enabled and currently active on a CPU, update the | |
551 | * value in the counter structure: | |
552 | */ | |
6a930700 | 553 | if (counter->state == PERF_COUNTER_STATE_ACTIVE) { |
0793a61d | 554 | smp_call_function_single(counter->oncpu, |
7671581f | 555 | __read, counter, 1); |
0793a61d TG |
556 | } |
557 | ||
ee06094f | 558 | return atomic64_read(&counter->count); |
0793a61d TG |
559 | } |
560 | ||
561 | /* | |
562 | * Cross CPU call to switch performance data pointers | |
563 | */ | |
564 | static void __perf_switch_irq_data(void *info) | |
565 | { | |
566 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | |
567 | struct perf_counter *counter = info; | |
568 | struct perf_counter_context *ctx = counter->ctx; | |
569 | struct perf_data *oldirqdata = counter->irqdata; | |
570 | ||
571 | /* | |
572 | * If this is a task context, we need to check whether it is | |
573 | * the current task context of this cpu. If not it has been | |
574 | * scheduled out before the smp call arrived. | |
575 | */ | |
576 | if (ctx->task) { | |
577 | if (cpuctx->task_ctx != ctx) | |
578 | return; | |
579 | spin_lock(&ctx->lock); | |
580 | } | |
581 | ||
582 | /* Change the pointer in an NMI-safe way: */ | |
583 | atomic_long_set((atomic_long_t *)&counter->irqdata, | |
584 | (unsigned long) counter->usrdata); | |
585 | counter->usrdata = oldirqdata; | |
586 | ||
587 | if (ctx->task) | |
588 | spin_unlock(&ctx->lock); | |
589 | } | |
590 | ||
591 | static struct perf_data *perf_switch_irq_data(struct perf_counter *counter) | |
592 | { | |
593 | struct perf_counter_context *ctx = counter->ctx; | |
594 | struct perf_data *oldirqdata = counter->irqdata; | |
595 | struct task_struct *task = ctx->task; | |
596 | ||
597 | if (!task) { | |
598 | smp_call_function_single(counter->cpu, | |
599 | __perf_switch_irq_data, | |
600 | counter, 1); | |
601 | return counter->usrdata; | |
602 | } | |
603 | ||
604 | retry: | |
605 | spin_lock_irq(&ctx->lock); | |
6a930700 | 606 | if (counter->state != PERF_COUNTER_STATE_ACTIVE) { |
0793a61d TG |
607 | counter->irqdata = counter->usrdata; |
608 | counter->usrdata = oldirqdata; | |
609 | spin_unlock_irq(&ctx->lock); | |
610 | return oldirqdata; | |
611 | } | |
612 | spin_unlock_irq(&ctx->lock); | |
613 | task_oncpu_function_call(task, __perf_switch_irq_data, counter); | |
614 | /* Might have failed because the task was scheduled out */ | |
615 | if (counter->irqdata == oldirqdata) | |
616 | goto retry; | |
617 | ||
618 | return counter->usrdata; | |
619 | } | |
620 | ||
621 | static void put_context(struct perf_counter_context *ctx) | |
622 | { | |
623 | if (ctx->task) | |
624 | put_task_struct(ctx->task); | |
625 | } | |
626 | ||
627 | static struct perf_counter_context *find_get_context(pid_t pid, int cpu) | |
628 | { | |
629 | struct perf_cpu_context *cpuctx; | |
630 | struct perf_counter_context *ctx; | |
631 | struct task_struct *task; | |
632 | ||
633 | /* | |
634 | * If cpu is not a wildcard then this is a percpu counter: | |
635 | */ | |
636 | if (cpu != -1) { | |
637 | /* Must be root to operate on a CPU counter: */ | |
638 | if (!capable(CAP_SYS_ADMIN)) | |
639 | return ERR_PTR(-EACCES); | |
640 | ||
641 | if (cpu < 0 || cpu >= num_possible_cpus()) | |
642 | return ERR_PTR(-EINVAL); | |
643 | ||
644 | /* | |
645 | * We could be clever and allow attaching a counter to an | |
646 | * offline CPU and activate it when the CPU comes up, but | |
647 | * that's for later. | |
648 | */ | |
649 | if (!cpu_isset(cpu, cpu_online_map)) | |
650 | return ERR_PTR(-ENODEV); | |
651 | ||
652 | cpuctx = &per_cpu(perf_cpu_context, cpu); | |
653 | ctx = &cpuctx->ctx; | |
654 | ||
0793a61d TG |
655 | return ctx; |
656 | } | |
657 | ||
658 | rcu_read_lock(); | |
659 | if (!pid) | |
660 | task = current; | |
661 | else | |
662 | task = find_task_by_vpid(pid); | |
663 | if (task) | |
664 | get_task_struct(task); | |
665 | rcu_read_unlock(); | |
666 | ||
667 | if (!task) | |
668 | return ERR_PTR(-ESRCH); | |
669 | ||
670 | ctx = &task->perf_counter_ctx; | |
671 | ctx->task = task; | |
672 | ||
673 | /* Reuse ptrace permission checks for now. */ | |
674 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) { | |
675 | put_context(ctx); | |
676 | return ERR_PTR(-EACCES); | |
677 | } | |
678 | ||
679 | return ctx; | |
680 | } | |
681 | ||
682 | /* | |
683 | * Called when the last reference to the file is gone. | |
684 | */ | |
685 | static int perf_release(struct inode *inode, struct file *file) | |
686 | { | |
687 | struct perf_counter *counter = file->private_data; | |
688 | struct perf_counter_context *ctx = counter->ctx; | |
689 | ||
690 | file->private_data = NULL; | |
691 | ||
692 | mutex_lock(&counter->mutex); | |
693 | ||
04289bb9 | 694 | perf_counter_remove_from_context(counter); |
0793a61d TG |
695 | put_context(ctx); |
696 | ||
697 | mutex_unlock(&counter->mutex); | |
698 | ||
699 | kfree(counter); | |
700 | ||
701 | return 0; | |
702 | } | |
703 | ||
704 | /* | |
705 | * Read the performance counter - simple non-blocking version for now | |
706 | */ | |
707 | static ssize_t | |
708 | perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) | |
709 | { | |
710 | u64 cntval; | |
711 | ||
712 | if (count != sizeof(cntval)) | |
713 | return -EINVAL; | |
714 | ||
715 | mutex_lock(&counter->mutex); | |
04289bb9 | 716 | cntval = perf_counter_read(counter); |
0793a61d TG |
717 | mutex_unlock(&counter->mutex); |
718 | ||
719 | return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval); | |
720 | } | |
721 | ||
722 | static ssize_t | |
723 | perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count) | |
724 | { | |
725 | if (!usrdata->len) | |
726 | return 0; | |
727 | ||
728 | count = min(count, (size_t)usrdata->len); | |
729 | if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count)) | |
730 | return -EFAULT; | |
731 | ||
732 | /* Adjust the counters */ | |
733 | usrdata->len -= count; | |
734 | if (!usrdata->len) | |
735 | usrdata->rd_idx = 0; | |
736 | else | |
737 | usrdata->rd_idx += count; | |
738 | ||
739 | return count; | |
740 | } | |
741 | ||
742 | static ssize_t | |
743 | perf_read_irq_data(struct perf_counter *counter, | |
744 | char __user *buf, | |
745 | size_t count, | |
746 | int nonblocking) | |
747 | { | |
748 | struct perf_data *irqdata, *usrdata; | |
749 | DECLARE_WAITQUEUE(wait, current); | |
750 | ssize_t res; | |
751 | ||
752 | irqdata = counter->irqdata; | |
753 | usrdata = counter->usrdata; | |
754 | ||
755 | if (usrdata->len + irqdata->len >= count) | |
756 | goto read_pending; | |
757 | ||
758 | if (nonblocking) | |
759 | return -EAGAIN; | |
760 | ||
761 | spin_lock_irq(&counter->waitq.lock); | |
762 | __add_wait_queue(&counter->waitq, &wait); | |
763 | for (;;) { | |
764 | set_current_state(TASK_INTERRUPTIBLE); | |
765 | if (usrdata->len + irqdata->len >= count) | |
766 | break; | |
767 | ||
768 | if (signal_pending(current)) | |
769 | break; | |
770 | ||
771 | spin_unlock_irq(&counter->waitq.lock); | |
772 | schedule(); | |
773 | spin_lock_irq(&counter->waitq.lock); | |
774 | } | |
775 | __remove_wait_queue(&counter->waitq, &wait); | |
776 | __set_current_state(TASK_RUNNING); | |
777 | spin_unlock_irq(&counter->waitq.lock); | |
778 | ||
779 | if (usrdata->len + irqdata->len < count) | |
780 | return -ERESTARTSYS; | |
781 | read_pending: | |
782 | mutex_lock(&counter->mutex); | |
783 | ||
784 | /* Drain pending data first: */ | |
785 | res = perf_copy_usrdata(usrdata, buf, count); | |
786 | if (res < 0 || res == count) | |
787 | goto out; | |
788 | ||
789 | /* Switch irq buffer: */ | |
790 | usrdata = perf_switch_irq_data(counter); | |
791 | if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) { | |
792 | if (!res) | |
793 | res = -EFAULT; | |
794 | } else { | |
795 | res = count; | |
796 | } | |
797 | out: | |
798 | mutex_unlock(&counter->mutex); | |
799 | ||
800 | return res; | |
801 | } | |
802 | ||
803 | static ssize_t | |
804 | perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |
805 | { | |
806 | struct perf_counter *counter = file->private_data; | |
807 | ||
9f66a381 | 808 | switch (counter->hw_event.record_type) { |
0793a61d TG |
809 | case PERF_RECORD_SIMPLE: |
810 | return perf_read_hw(counter, buf, count); | |
811 | ||
812 | case PERF_RECORD_IRQ: | |
813 | case PERF_RECORD_GROUP: | |
814 | return perf_read_irq_data(counter, buf, count, | |
815 | file->f_flags & O_NONBLOCK); | |
816 | } | |
817 | return -EINVAL; | |
818 | } | |
819 | ||
820 | static unsigned int perf_poll(struct file *file, poll_table *wait) | |
821 | { | |
822 | struct perf_counter *counter = file->private_data; | |
823 | unsigned int events = 0; | |
824 | unsigned long flags; | |
825 | ||
826 | poll_wait(file, &counter->waitq, wait); | |
827 | ||
828 | spin_lock_irqsave(&counter->waitq.lock, flags); | |
829 | if (counter->usrdata->len || counter->irqdata->len) | |
830 | events |= POLLIN; | |
831 | spin_unlock_irqrestore(&counter->waitq.lock, flags); | |
832 | ||
833 | return events; | |
834 | } | |
835 | ||
836 | static const struct file_operations perf_fops = { | |
837 | .release = perf_release, | |
838 | .read = perf_read, | |
839 | .poll = perf_poll, | |
840 | }; | |
841 | ||
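
Since perf_fops wires up .poll, a counter fd whose record_type routes samples into irqdata/usrdata can be waited on with poll(). A user-space sketch; `counter_fd` is assumed to come from the sys_perf_counter_open() syscall defined later in this file:

```c
/*
 * User-space sketch: block until the counter has pending sample data.
 * Assumes counter_fd was opened with record_type PERF_RECORD_IRQ or
 * PERF_RECORD_GROUP, so perf_poll() can report POLLIN.
 */
#include <poll.h>

static int wait_for_counter_data(int counter_fd)
{
	struct pollfd pfd = { .fd = counter_fd, .events = POLLIN };

	return poll(&pfd, 1, -1);	/* returns once data is queued */
}
```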
5c92d124 IM |
842 | static void cpu_clock_perf_counter_enable(struct perf_counter *counter) |
843 | { | |
844 | } | |
845 | ||
846 | static void cpu_clock_perf_counter_disable(struct perf_counter *counter) | |
847 | { | |
848 | } | |
849 | ||
850 | static void cpu_clock_perf_counter_read(struct perf_counter *counter) | |
851 | { | |
852 | int cpu = raw_smp_processor_id(); | |
853 | ||
ee06094f | 854 | atomic64_set(&counter->count, cpu_clock(cpu)); |
5c92d124 IM |
855 | } |
856 | ||
857 | static const struct hw_perf_counter_ops perf_ops_cpu_clock = { | |
7671581f IM |
858 | .enable = cpu_clock_perf_counter_enable, |
859 | .disable = cpu_clock_perf_counter_disable, | |
860 | .read = cpu_clock_perf_counter_read, | |
5c92d124 IM |
861 | }; |
862 | ||
8cb391e8 | 863 | static void task_clock_perf_counter_update(struct perf_counter *counter) |
bae43c99 | 864 | { |
8cb391e8 IM |
865 | u64 prev, now; |
866 | s64 delta; | |
867 | ||
868 | prev = atomic64_read(&counter->hw.prev_count); | |
869 | now = current->se.sum_exec_runtime; | |
870 | ||
871 | atomic64_set(&counter->hw.prev_count, now); | |
872 | ||
873 | delta = now - prev; | |
8cb391e8 IM |
874 | |
875 | atomic64_add(delta, &counter->count); | |
bae43c99 IM |
876 | } |
877 | ||
8cb391e8 | 878 | static void task_clock_perf_counter_read(struct perf_counter *counter) |
bae43c99 | 879 | { |
8cb391e8 | 880 | task_clock_perf_counter_update(counter); |
bae43c99 IM |
881 | } |
882 | ||
8cb391e8 IM |
883 | static void task_clock_perf_counter_enable(struct perf_counter *counter) |
884 | { | |
885 | atomic64_set(&counter->hw.prev_count, current->se.sum_exec_runtime); | |
886 | } | |
887 | ||
888 | static void task_clock_perf_counter_disable(struct perf_counter *counter) | |
bae43c99 | 889 | { |
8cb391e8 | 890 | task_clock_perf_counter_update(counter); |
bae43c99 IM |
891 | } |
892 | ||
893 | static const struct hw_perf_counter_ops perf_ops_task_clock = { | |
7671581f IM |
894 | .enable = task_clock_perf_counter_enable, |
895 | .disable = task_clock_perf_counter_disable, | |
896 | .read = task_clock_perf_counter_read, | |
bae43c99 IM |
897 | }; |
898 | ||
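
The task-clock counter above and the page-fault, context-switch and CPU-migration counters below all repeat one snapshot/delta idiom. A condensed sketch of that shared pattern (`sw_counter_update` is a hypothetical name, not in this file):

```c
/*
 * Condensation of the *_perf_counter_update() functions: given a
 * monotonically growing per-task value 'now', fold only the growth
 * since the previous snapshot into counter->count.
 */
static void sw_counter_update(struct perf_counter *counter, u64 now)
{
	s64 delta = now - atomic64_read(&counter->hw.prev_count);

	atomic64_set(&counter->hw.prev_count, now);
	atomic64_add(delta, &counter->count);
}
```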
e06c61a8 IM |
899 | static u64 get_page_faults(void) |
900 | { | |
901 | struct task_struct *curr = current; | |
902 | ||
903 | return curr->maj_flt + curr->min_flt; | |
904 | } | |
905 | ||
906 | static void page_faults_perf_counter_update(struct perf_counter *counter) | |
907 | { | |
908 | u64 prev, now; | |
909 | s64 delta; | |
910 | ||
911 | prev = atomic64_read(&counter->hw.prev_count); | |
912 | now = get_page_faults(); | |
913 | ||
914 | atomic64_set(&counter->hw.prev_count, now); | |
915 | ||
916 | delta = now - prev; | |
e06c61a8 IM |
917 | |
918 | atomic64_add(delta, &counter->count); | |
919 | } | |
920 | ||
921 | static void page_faults_perf_counter_read(struct perf_counter *counter) | |
922 | { | |
923 | page_faults_perf_counter_update(counter); | |
924 | } | |
925 | ||
926 | static void page_faults_perf_counter_enable(struct perf_counter *counter) | |
927 | { | |
928 | /* | |
929 | * page-faults is a per-task value already, | |
930 | * so we don't have to clear it on switch-in. | |
931 | */ | |
932 | } | |
933 | ||
934 | static void page_faults_perf_counter_disable(struct perf_counter *counter) | |
935 | { | |
936 | page_faults_perf_counter_update(counter); | |
937 | } | |
938 | ||
939 | static const struct hw_perf_counter_ops perf_ops_page_faults = { | |
7671581f IM |
940 | .enable = page_faults_perf_counter_enable, |
941 | .disable = page_faults_perf_counter_disable, | |
942 | .read = page_faults_perf_counter_read, | |
e06c61a8 IM |
943 | }; |
944 | ||
5d6a27d8 IM |
945 | static u64 get_context_switches(void) |
946 | { | |
947 | struct task_struct *curr = current; | |
948 | ||
949 | return curr->nvcsw + curr->nivcsw; | |
950 | } | |
951 | ||
952 | static void context_switches_perf_counter_update(struct perf_counter *counter) | |
953 | { | |
954 | u64 prev, now; | |
955 | s64 delta; | |
956 | ||
957 | prev = atomic64_read(&counter->hw.prev_count); | |
958 | now = get_context_switches(); | |
959 | ||
960 | atomic64_set(&counter->hw.prev_count, now); | |
961 | ||
962 | delta = now - prev; | |
5d6a27d8 IM |
963 | |
964 | atomic64_add(delta, &counter->count); | |
965 | } | |
966 | ||
967 | static void context_switches_perf_counter_read(struct perf_counter *counter) | |
968 | { | |
969 | context_switches_perf_counter_update(counter); | |
970 | } | |
971 | ||
972 | static void context_switches_perf_counter_enable(struct perf_counter *counter) | |
973 | { | |
974 | /* | |
975 | * curr->nvcsw + curr->nivcsw is a per-task value already, | |
976 | * so we don't have to clear it on switch-in. | |
977 | */ | |
978 | } | |
979 | ||
980 | static void context_switches_perf_counter_disable(struct perf_counter *counter) | |
981 | { | |
982 | context_switches_perf_counter_update(counter); | |
983 | } | |
984 | ||
985 | static const struct hw_perf_counter_ops perf_ops_context_switches = { | |
7671581f IM |
986 | .enable = context_switches_perf_counter_enable, |
987 | .disable = context_switches_perf_counter_disable, | |
988 | .read = context_switches_perf_counter_read, | |
5d6a27d8 IM |
989 | }; |
990 | ||
6c594c21 IM |
991 | static inline u64 get_cpu_migrations(void) |
992 | { | |
993 | return current->se.nr_migrations; | |
994 | } | |
995 | ||
996 | static void cpu_migrations_perf_counter_update(struct perf_counter *counter) | |
997 | { | |
998 | u64 prev, now; | |
999 | s64 delta; | |
1000 | ||
1001 | prev = atomic64_read(&counter->hw.prev_count); | |
1002 | now = get_cpu_migrations(); | |
1003 | ||
1004 | atomic64_set(&counter->hw.prev_count, now); | |
1005 | ||
1006 | delta = now - prev; | |
6c594c21 IM |
1007 | |
1008 | atomic64_add(delta, &counter->count); | |
1009 | } | |
1010 | ||
1011 | static void cpu_migrations_perf_counter_read(struct perf_counter *counter) | |
1012 | { | |
1013 | cpu_migrations_perf_counter_update(counter); | |
1014 | } | |
1015 | ||
1016 | static void cpu_migrations_perf_counter_enable(struct perf_counter *counter) | |
1017 | { | |
1018 | /* | |
1019 | * se.nr_migrations is a per-task value already, | |
1020 | * so we dont have to clear it on switch-in. | |
1021 | */ | |
1022 | } | |
1023 | ||
1024 | static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) | |
1025 | { | |
1026 | cpu_migrations_perf_counter_update(counter); | |
1027 | } | |
1028 | ||
1029 | static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { | |
7671581f IM |
1030 | .enable = cpu_migrations_perf_counter_enable, |
1031 | .disable = cpu_migrations_perf_counter_disable, | |
1032 | .read = cpu_migrations_perf_counter_read, | |
6c594c21 IM |
1033 | }; |
1034 | ||
5c92d124 IM |
1035 | static const struct hw_perf_counter_ops * |
1036 | sw_perf_counter_init(struct perf_counter *counter) | |
1037 | { | |
1038 | const struct hw_perf_counter_ops *hw_ops = NULL; | |
1039 | ||
1040 | switch (counter->hw_event.type) { | |
1041 | case PERF_COUNT_CPU_CLOCK: | |
1042 | hw_ops = &perf_ops_cpu_clock; | |
1043 | break; | |
bae43c99 IM |
1044 | case PERF_COUNT_TASK_CLOCK: |
1045 | hw_ops = &perf_ops_task_clock; | |
1046 | break; | |
e06c61a8 IM |
1047 | case PERF_COUNT_PAGE_FAULTS: |
1048 | hw_ops = &perf_ops_page_faults; | |
1049 | break; | |
5d6a27d8 IM |
1050 | case PERF_COUNT_CONTEXT_SWITCHES: |
1051 | hw_ops = &perf_ops_context_switches; | |
1052 | break; | |
6c594c21 IM |
1053 | case PERF_COUNT_CPU_MIGRATIONS: |
1054 | hw_ops = &perf_ops_cpu_migrations; | |
1055 | break; | |
5c92d124 IM |
1056 | default: |
1057 | break; | |
1058 | } | |
1059 | return hw_ops; | |
1060 | } | |
1061 | ||
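
sw_perf_counter_init() only recognizes the kernel-provided software event types; everything else is left to the architecture. In this tree the software types carry negative values, which is why perf_counter_alloc() below consults it only for !raw events with type < 0. A sketch of the combined selection under that assumption (`pick_counter_ops` is a hypothetical helper; the real logic is open-coded in perf_counter_alloc()):

```c
/*
 * Hypothetical condensation of the ops selection in perf_counter_alloc():
 * software events first, then the architecture's PMU driver as fallback.
 */
static const struct hw_perf_counter_ops *
pick_counter_ops(struct perf_counter *counter)
{
	const struct hw_perf_counter_ops *hw_ops = NULL;

	if (!counter->hw_event.raw && counter->hw_event.type < 0)
		hw_ops = sw_perf_counter_init(counter);
	if (!hw_ops)
		hw_ops = hw_perf_counter_init(counter);

	return hw_ops;
}
```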
0793a61d TG |
1062 | /* |
1063 | * Allocate and initialize a counter structure | |
1064 | */ | |
1065 | static struct perf_counter * | |
04289bb9 IM |
1066 | perf_counter_alloc(struct perf_counter_hw_event *hw_event, |
1067 | int cpu, | |
9b51f66d IM |
1068 | struct perf_counter *group_leader, |
1069 | gfp_t gfpflags) | |
0793a61d | 1070 | { |
5c92d124 | 1071 | const struct hw_perf_counter_ops *hw_ops; |
621a01ea | 1072 | struct perf_counter *counter; |
0793a61d | 1073 | |
9b51f66d | 1074 | counter = kzalloc(sizeof(*counter), gfpflags); |
0793a61d TG |
1075 | if (!counter) |
1076 | return NULL; | |
1077 | ||
04289bb9 IM |
1078 | /* |
1079 | * Single counters are their own group leaders, with an | |
1080 | * empty sibling list: | |
1081 | */ | |
1082 | if (!group_leader) | |
1083 | group_leader = counter; | |
1084 | ||
0793a61d | 1085 | mutex_init(&counter->mutex); |
04289bb9 IM |
1086 | INIT_LIST_HEAD(&counter->list_entry); |
1087 | INIT_LIST_HEAD(&counter->sibling_list); | |
0793a61d TG |
1088 | init_waitqueue_head(&counter->waitq); |
1089 | ||
9f66a381 IM |
1090 | counter->irqdata = &counter->data[0]; |
1091 | counter->usrdata = &counter->data[1]; | |
1092 | counter->cpu = cpu; | |
1093 | counter->hw_event = *hw_event; | |
1094 | counter->wakeup_pending = 0; | |
04289bb9 | 1095 | counter->group_leader = group_leader; |
621a01ea IM |
1096 | counter->hw_ops = NULL; |
1097 | ||
a86ed508 IM |
1098 | if (hw_event->disabled) |
1099 | counter->state = PERF_COUNTER_STATE_OFF; | |
1100 | ||
5c92d124 IM |
1101 | hw_ops = NULL; |
1102 | if (!hw_event->raw && hw_event->type < 0) | |
1103 | hw_ops = sw_perf_counter_init(counter); | |
9b51f66d | 1104 | if (!hw_ops) |
5c92d124 | 1105 | hw_ops = hw_perf_counter_init(counter); |
5c92d124 | 1106 | |
621a01ea IM |
1107 | if (!hw_ops || IS_ERR(hw_ops)) { | |
1108 | kfree(counter); | |
1109 | return NULL; | |
1110 | } | |
1111 | counter->hw_ops = hw_ops; | |
0793a61d TG |
1112 | |
1113 | return counter; | |
1114 | } | |
1115 | ||
1116 | /** | |
9f66a381 IM |
1117 | * sys_perf_counter_open - open a performance counter, associate it to a task/cpu | |
1118 | * | |
1119 | * @hw_event_uptr: event type attributes for monitoring/sampling | |
0793a61d | 1120 | * @pid: target pid |
9f66a381 IM |
1121 | * @cpu: target cpu |
1122 | * @group_fd: group leader counter fd | |
0793a61d | 1123 | */ |
1d1c7ddb IM |
1124 | asmlinkage int |
1125 | sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, | |
1126 | pid_t pid, int cpu, int group_fd) | |
0793a61d | 1127 | { |
04289bb9 | 1128 | struct perf_counter *counter, *group_leader; |
9f66a381 | 1129 | struct perf_counter_hw_event hw_event; |
04289bb9 | 1130 | struct perf_counter_context *ctx; |
9b51f66d | 1131 | struct file *counter_file = NULL; |
04289bb9 IM |
1132 | struct file *group_file = NULL; |
1133 | int fput_needed = 0; | |
9b51f66d | 1134 | int fput_needed2 = 0; |
0793a61d TG |
1135 | int ret; |
1136 | ||
9f66a381 | 1137 | if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) |
eab656ae TG |
1138 | return -EFAULT; |
1139 | ||
04289bb9 | 1140 | /* |
ccff286d IM |
1141 | * Get the target context (task or percpu): |
1142 | */ | |
1143 | ctx = find_get_context(pid, cpu); | |
1144 | if (IS_ERR(ctx)) | |
1145 | return PTR_ERR(ctx); | |
1146 | ||
1147 | /* | |
1148 | * Look up the group leader (we will attach this counter to it): | |
04289bb9 IM |
1149 | */ |
1150 | group_leader = NULL; | |
1151 | if (group_fd != -1) { | |
1152 | ret = -EINVAL; | |
1153 | group_file = fget_light(group_fd, &fput_needed); | |
1154 | if (!group_file) | |
ccff286d | 1155 | goto err_put_context; |
04289bb9 | 1156 | if (group_file->f_op != &perf_fops) |
ccff286d | 1157 | goto err_put_context; |
04289bb9 IM |
1158 | |
1159 | group_leader = group_file->private_data; | |
1160 | /* | |
ccff286d IM |
1161 | * Do not allow a recursive hierarchy (this new sibling |
1162 | * becoming part of another group-sibling): | |
1163 | */ | |
1164 | if (group_leader->group_leader != group_leader) | |
1165 | goto err_put_context; | |
1166 | /* | |
1167 | * Do not allow attaching to a group in a different | |
1168 | * task or CPU context: | |
04289bb9 | 1169 | */ |
ccff286d IM |
1170 | if (group_leader->ctx != ctx) |
1171 | goto err_put_context; | |
04289bb9 IM |
1172 | } |
1173 | ||
5c92d124 | 1174 | ret = -EINVAL; |
9b51f66d | 1175 | counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL); |
0793a61d TG |
1176 | if (!counter) |
1177 | goto err_put_context; | |
1178 | ||
0793a61d TG |
1179 | ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); |
1180 | if (ret < 0) | |
9b51f66d IM |
1181 | goto err_free_put_context; |
1182 | ||
1183 | counter_file = fget_light(ret, &fput_needed2); | |
1184 | if (!counter_file) | |
1185 | goto err_free_put_context; | |
1186 | ||
1187 | counter->filp = counter_file; | |
1188 | perf_install_in_context(ctx, counter, cpu); | |
1189 | ||
1190 | fput_light(counter_file, fput_needed2); | |
0793a61d | 1191 | |
04289bb9 IM |
1192 | out_fput: |
1193 | fput_light(group_file, fput_needed); | |
1194 | ||
0793a61d TG |
1195 | return ret; |
1196 | ||
9b51f66d | 1197 | err_free_put_context: |
0793a61d TG |
1198 | kfree(counter); |
1199 | ||
1200 | err_put_context: | |
1201 | put_context(ctx); | |
1202 | ||
04289bb9 | 1203 | goto out_fput; |
0793a61d TG |
1204 | } |
1205 | ||
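
A user-space sketch of driving this syscall. Assumptions not shown in this file: the syscall number __NR_perf_counter_open assigned by the target architecture, and a userspace-visible <linux/perf_counter.h>. It opens one task-clock software counter on the current task (pid 0, any cpu, no group) and reads the 8-byte value, matching perf_read_hw() above:

```c
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_counter.h>

int main(void)
{
	struct perf_counter_hw_event hw_event;
	unsigned long long count;
	int fd;

	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.type = PERF_COUNT_TASK_CLOCK;	/* software counter */

	/* args: hw_event ptr, pid (0 = current), cpu (-1 = any), group_fd */
	fd = syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1);
	if (fd < 0) {
		perror("perf_counter_open");
		return 1;
	}

	/* ... workload to be measured runs here ... */

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("task clock: %llu\n", count);

	close(fd);
	return 0;
}
```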
9b51f66d IM |
1206 | /* |
1207 | * Initialize the perf_counter context in a task_struct: | |
1208 | */ | |
1209 | static void | |
1210 | __perf_counter_init_context(struct perf_counter_context *ctx, | |
1211 | struct task_struct *task) | |
1212 | { | |
1213 | memset(ctx, 0, sizeof(*ctx)); | |
1214 | spin_lock_init(&ctx->lock); | |
1215 | INIT_LIST_HEAD(&ctx->counter_list); | |
1216 | ctx->task = task; | |
1217 | } | |
1218 | ||
1219 | /* | |
1220 | * inherit a counter from parent task to child task: | |
1221 | */ | |
1222 | static int | |
1223 | inherit_counter(struct perf_counter *parent_counter, | |
1224 | struct task_struct *parent, | |
1225 | struct perf_counter_context *parent_ctx, | |
1226 | struct task_struct *child, | |
1227 | struct perf_counter_context *child_ctx) | |
1228 | { | |
1229 | struct perf_counter *child_counter; | |
1230 | ||
1231 | child_counter = perf_counter_alloc(&parent_counter->hw_event, | |
1232 | parent_counter->cpu, NULL, | |
1233 | GFP_ATOMIC); | |
1234 | if (!child_counter) | |
1235 | return -ENOMEM; | |
1236 | ||
1237 | /* | |
1238 | * Link it up in the child's context: | |
1239 | */ | |
1240 | child_counter->ctx = child_ctx; | |
1241 | child_counter->task = child; | |
1242 | list_add_counter(child_counter, child_ctx); | |
1243 | child_ctx->nr_counters++; | |
1244 | ||
1245 | child_counter->parent = parent_counter; | |
1246 | parent_counter->nr_inherited++; | |
1247 | /* | |
1248 | * inherit into child's child as well: | |
1249 | */ | |
1250 | child_counter->hw_event.inherit = 1; | |
1251 | ||
1252 | /* | |
1253 | * Get a reference to the parent filp - we will fput it | |
1254 | * when the child counter exits. This is safe to do because | |
1255 | * we are in the parent and we know that the filp still | |
1256 | * exists and has a nonzero count: | |
1257 | */ | |
1258 | atomic_long_inc(&parent_counter->filp->f_count); | |
1259 | ||
1260 | return 0; | |
1261 | } | |
1262 | ||
1263 | static void | |
1264 | __perf_counter_exit_task(struct task_struct *child, | |
1265 | struct perf_counter *child_counter, | |
1266 | struct perf_counter_context *child_ctx) | |
1267 | { | |
1268 | struct perf_counter *parent_counter; | |
1269 | u64 parent_val, child_val; | |
1270 | u64 perf_flags; | |
1271 | ||
1272 | /* | |
1273 | * Disable and unlink this counter. | |
1274 | * | |
1275 | * Be careful about zapping the list - IRQ/NMI context | |
1276 | * could still be processing it: | |
1277 | */ | |
1278 | local_irq_disable(); | |
1279 | perf_flags = hw_perf_save_disable(); | |
1280 | ||
0cc0c027 IM |
1281 | if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) { |
1282 | struct perf_cpu_context *cpuctx; | |
1283 | ||
1284 | cpuctx = &__get_cpu_var(perf_cpu_context); | |
1285 | ||
7671581f | 1286 | child_counter->hw_ops->disable(child_counter); |
0cc0c027 IM |
1287 | child_counter->state = PERF_COUNTER_STATE_INACTIVE; |
1288 | child_counter->oncpu = -1; | |
1289 | ||
1290 | cpuctx->active_oncpu--; | |
1291 | child_ctx->nr_active--; | |
1292 | } | |
1293 | ||
9b51f66d IM |
1294 | list_del_init(&child_counter->list_entry); |
1295 | ||
1296 | hw_perf_restore(perf_flags); | |
1297 | local_irq_enable(); | |
1298 | ||
1299 | parent_counter = child_counter->parent; | |
1300 | /* | |
1301 | * It can happen that parent exits first, and has counters | |
1302 | * that are still around due to the child reference. These | |
1303 | * counters need to be zapped - but otherwise linger. | |
1304 | */ | |
1305 | if (!parent_counter) | |
1306 | return; | |
1307 | ||
1308 | parent_val = atomic64_read(&parent_counter->count); | |
1309 | child_val = atomic64_read(&child_counter->count); | |
1310 | ||
1311 | /* | |
1312 | * Add back the child's count to the parent's count: | |
1313 | */ | |
1314 | atomic64_add(child_val, &parent_counter->count); | |
1315 | ||
1316 | fput(parent_counter->filp); | |
1317 | ||
1318 | kfree(child_counter); | |
1319 | } | |
1320 | ||
1321 | /* | |
1322 | * When a child task exits, feed back counter values to parent counters. | |
1323 | * | |
1324 | * Note: we are running in child context, but the PID is not hashed | |
1325 | * anymore so new counters will not be added. | |
1326 | */ | |
1327 | void perf_counter_exit_task(struct task_struct *child) | |
1328 | { | |
1329 | struct perf_counter *child_counter, *tmp; | |
1330 | struct perf_counter_context *child_ctx; | |
1331 | ||
1332 | child_ctx = &child->perf_counter_ctx; | |
1333 | ||
1334 | if (likely(!child_ctx->nr_counters)) | |
1335 | return; | |
1336 | ||
1337 | list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, | |
1338 | list_entry) | |
1339 | __perf_counter_exit_task(child, child_counter, child_ctx); | |
1340 | } | |
1341 | ||
1342 | /* | |
1343 | * Initialize the perf_counter context in task_struct | |
1344 | */ | |
1345 | void perf_counter_init_task(struct task_struct *child) | |
1346 | { | |
1347 | struct perf_counter_context *child_ctx, *parent_ctx; | |
1348 | struct perf_counter *counter, *parent_counter; | |
1349 | struct task_struct *parent = current; | |
1350 | unsigned long flags; | |
1351 | ||
1352 | child_ctx = &child->perf_counter_ctx; | |
1353 | parent_ctx = &parent->perf_counter_ctx; | |
1354 | ||
1355 | __perf_counter_init_context(child_ctx, child); | |
1356 | ||
1357 | /* | |
1358 | * This is executed from the parent task context, so inherit | |
1359 | * counters that have been marked for cloning: | |
1360 | */ | |
1361 | ||
1362 | if (likely(!parent_ctx->nr_counters)) | |
1363 | return; | |
1364 | ||
1365 | /* | |
1366 | * Lock the parent list. No need to lock the child - not PID | |
1367 | * hashed yet and not running, so nobody can access it. | |
1368 | */ | |
1369 | spin_lock_irqsave(&parent_ctx->lock, flags); | |
1370 | ||
1371 | /* | |
1372 | * We don't have to disable NMIs - we are only looking at | |
1373 | * the list, not manipulating it: | |
1374 | */ | |
1375 | list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) { | |
1376 | if (!counter->hw_event.inherit || counter->group_leader != counter) | |
1377 | continue; | |
1378 | ||
1379 | /* | |
1380 | * Instead of creating recursive hierarchies of counters, | |
1381 | * we link inherited counters back to the original parent, | |
1382 | * which is guaranteed to have a filp, which we use as the reference | |
1383 | * count: | |
1384 | */ | |
1385 | parent_counter = counter; | |
1386 | if (counter->parent) | |
1387 | parent_counter = counter->parent; | |
1388 | ||
1389 | if (inherit_counter(parent_counter, parent, | |
1390 | parent_ctx, child, child_ctx)) | |
1391 | break; | |
1392 | } | |
1393 | ||
1394 | spin_unlock_irqrestore(&parent_ctx->lock, flags); | |
1395 | } | |
1396 | ||
04289bb9 | 1397 | static void __cpuinit perf_counter_init_cpu(int cpu) |
0793a61d | 1398 | { |
04289bb9 | 1399 | struct perf_cpu_context *cpuctx; |
0793a61d | 1400 | |
04289bb9 IM |
1401 | cpuctx = &per_cpu(perf_cpu_context, cpu); |
1402 | __perf_counter_init_context(&cpuctx->ctx, NULL); | |
0793a61d TG |
1403 | |
1404 | mutex_lock(&perf_resource_mutex); | |
04289bb9 | 1405 | cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu; |
0793a61d | 1406 | mutex_unlock(&perf_resource_mutex); |
04289bb9 | 1407 | |
0793a61d TG |
1408 | hw_perf_counter_setup(); |
1409 | } | |
1410 | ||
1411 | #ifdef CONFIG_HOTPLUG_CPU | |
04289bb9 | 1412 | static void __perf_counter_exit_cpu(void *info) |
0793a61d TG |
1413 | { |
1414 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | |
1415 | struct perf_counter_context *ctx = &cpuctx->ctx; | |
1416 | struct perf_counter *counter, *tmp; | |
1417 | ||
04289bb9 IM |
1418 | list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) |
1419 | __perf_counter_remove_from_context(counter); | |
0793a61d TG |
1420 | |
1421 | } | |
04289bb9 | 1422 | static void perf_counter_exit_cpu(int cpu) |
0793a61d | 1423 | { |
04289bb9 | 1424 | smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1); |
0793a61d TG |
1425 | } |
1426 | #else | |
04289bb9 | 1427 | static inline void perf_counter_exit_cpu(int cpu) { } |
0793a61d TG |
1428 | #endif |
1429 | ||
1430 | static int __cpuinit | |
1431 | perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | |
1432 | { | |
1433 | unsigned int cpu = (long)hcpu; | |
1434 | ||
1435 | switch (action) { | |
1436 | ||
1437 | case CPU_UP_PREPARE: | |
1438 | case CPU_UP_PREPARE_FROZEN: | |
04289bb9 | 1439 | perf_counter_init_cpu(cpu); |
0793a61d TG |
1440 | break; |
1441 | ||
1442 | case CPU_DOWN_PREPARE: | |
1443 | case CPU_DOWN_PREPARE_FROZEN: | |
04289bb9 | 1444 | perf_counter_exit_cpu(cpu); |
0793a61d TG |
1445 | break; |
1446 | ||
1447 | default: | |
1448 | break; | |
1449 | } | |
1450 | ||
1451 | return NOTIFY_OK; | |
1452 | } | |
1453 | ||
1454 | static struct notifier_block __cpuinitdata perf_cpu_nb = { | |
1455 | .notifier_call = perf_cpu_notify, | |
1456 | }; | |
1457 | ||
1458 | static int __init perf_counter_init(void) | |
1459 | { | |
1460 | perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, | |
1461 | (void *)(long)smp_processor_id()); | |
1462 | register_cpu_notifier(&perf_cpu_nb); | |
1463 | ||
1464 | return 0; | |
1465 | } | |
1466 | early_initcall(perf_counter_init); | |
1467 | ||
1468 | static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) | |
1469 | { | |
1470 | return sprintf(buf, "%d\n", perf_reserved_percpu); | |
1471 | } | |
1472 | ||
1473 | static ssize_t | |
1474 | perf_set_reserve_percpu(struct sysdev_class *class, | |
1475 | const char *buf, | |
1476 | size_t count) | |
1477 | { | |
1478 | struct perf_cpu_context *cpuctx; | |
1479 | unsigned long val; | |
1480 | int err, cpu, mpt; | |
1481 | ||
1482 | err = strict_strtoul(buf, 10, &val); | |
1483 | if (err) | |
1484 | return err; | |
1485 | if (val > perf_max_counters) | |
1486 | return -EINVAL; | |
1487 | ||
1488 | mutex_lock(&perf_resource_mutex); | |
1489 | perf_reserved_percpu = val; | |
1490 | for_each_online_cpu(cpu) { | |
1491 | cpuctx = &per_cpu(perf_cpu_context, cpu); | |
1492 | spin_lock_irq(&cpuctx->ctx.lock); | |
1493 | mpt = min(perf_max_counters - cpuctx->ctx.nr_counters, | |
1494 | perf_max_counters - perf_reserved_percpu); | |
1495 | cpuctx->max_pertask = mpt; | |
1496 | spin_unlock_irq(&cpuctx->ctx.lock); | |
1497 | } | |
1498 | mutex_unlock(&perf_resource_mutex); | |
1499 | ||
1500 | return count; | |
1501 | } | |
1502 | ||
1503 | static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) | |
1504 | { | |
1505 | return sprintf(buf, "%d\n", perf_overcommit); | |
1506 | } | |
1507 | ||
1508 | static ssize_t | |
1509 | perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) | |
1510 | { | |
1511 | unsigned long val; | |
1512 | int err; | |
1513 | ||
1514 | err = strict_strtoul(buf, 10, &val); | |
1515 | if (err) | |
1516 | return err; | |
1517 | if (val > 1) | |
1518 | return -EINVAL; | |
1519 | ||
1520 | mutex_lock(&perf_resource_mutex); | |
1521 | perf_overcommit = val; | |
1522 | mutex_unlock(&perf_resource_mutex); | |
1523 | ||
1524 | return count; | |
1525 | } | |
1526 | ||
1527 | static SYSDEV_CLASS_ATTR( | |
1528 | reserve_percpu, | |
1529 | 0644, | |
1530 | perf_show_reserve_percpu, | |
1531 | perf_set_reserve_percpu | |
1532 | ); | |
1533 | ||
1534 | static SYSDEV_CLASS_ATTR( | |
1535 | overcommit, | |
1536 | 0644, | |
1537 | perf_show_overcommit, | |
1538 | perf_set_overcommit | |
1539 | ); | |
1540 | ||
1541 | static struct attribute *perfclass_attrs[] = { | |
1542 | &attr_reserve_percpu.attr, | |
1543 | &attr_overcommit.attr, | |
1544 | NULL | |
1545 | }; | |
1546 | ||
1547 | static struct attribute_group perfclass_attr_group = { | |
1548 | .attrs = perfclass_attrs, | |
1549 | .name = "perf_counters", | |
1550 | }; | |
1551 | ||
1552 | static int __init perf_counter_sysfs_init(void) | |
1553 | { | |
1554 | return sysfs_create_group(&cpu_sysdev_class.kset.kobj, | |
1555 | &perfclass_attr_group); | |
1556 | } | |
1557 | device_initcall(perf_counter_sysfs_init); |