Commit | Line | Data |
---|---|---|
c757249a SN |
1 | /* |
2 | * taskstats.c - Export per-task statistics to userland | |
3 | * | |
4 | * Copyright (C) Shailabh Nagar, IBM Corp. 2006 | |
5 | * (C) Balbir Singh, IBM Corp. 2006 | |
6 | * | |
7 | * This program is free software; you can redistribute it and/or modify | |
8 | * it under the terms of the GNU General Public License as published by | |
9 | * the Free Software Foundation; either version 2 of the License, or | |
10 | * (at your option) any later version. | |
11 | * | |
12 | * This program is distributed in the hope that it will be useful, | |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 | * GNU General Public License for more details. | |
16 | * | |
17 | */ | |
18 | ||
19 | #include <linux/kernel.h> | |
20 | #include <linux/taskstats_kern.h> | |
6f44993f | 21 | #include <linux/delayacct.h> |
f9fd8914 SN |
22 | #include <linux/cpumask.h> |
23 | #include <linux/percpu.h> | |
c757249a SN |
24 | #include <net/genetlink.h> |
25 | #include <asm/atomic.h> | |
26 | ||
f9fd8914 SN |
27 | /* |
28 | * Maximum length of a cpumask that can be specified in | |
29 | * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute | |
30 | */ | |
31 | #define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) | |
32 | ||
c757249a SN |
33 | static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; |
34 | static int family_registered; | |
35 | kmem_cache_t *taskstats_cache; | |
c757249a SN |
36 | |
37 | static struct genl_family family = { | |
38 | .id = GENL_ID_GENERATE, | |
39 | .name = TASKSTATS_GENL_NAME, | |
40 | .version = TASKSTATS_GENL_VERSION, | |
41 | .maxattr = TASKSTATS_CMD_ATTR_MAX, | |
42 | }; | |
43 | ||
44 | static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] | |
45 | __read_mostly = { | |
46 | [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, | |
47 | [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, | |
f9fd8914 SN |
48 | [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, |
49 | [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; | |
50 | ||
51 | struct listener { | |
52 | struct list_head list; | |
53 | pid_t pid; | |
c757249a SN |
54 | }; |
55 | ||
f9fd8914 SN |
56 | struct listener_list { |
57 | struct rw_semaphore sem; | |
58 | struct list_head list; | |
59 | }; | |
60 | static DEFINE_PER_CPU(struct listener_list, listener_array); | |
61 | ||
62 | enum actions { | |
63 | REGISTER, | |
64 | DEREGISTER, | |
65 | CPU_DONT_CARE | |
66 | }; | |
c757249a SN |
67 | |
68 | static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, | |
69 | void **replyp, size_t size) | |
70 | { | |
71 | struct sk_buff *skb; | |
72 | void *reply; | |
73 | ||
74 | /* | |
75 | * If new attributes are added, please revisit this allocation | |
76 | */ | |
77 | skb = nlmsg_new(size); | |
78 | if (!skb) | |
79 | return -ENOMEM; | |
80 | ||
81 | if (!info) { | |
82 | int seq = get_cpu_var(taskstats_seqnum)++; | |
83 | put_cpu_var(taskstats_seqnum); | |
84 | ||
85 | reply = genlmsg_put(skb, 0, seq, | |
86 | family.id, 0, 0, | |
87 | cmd, family.version); | |
88 | } else | |
89 | reply = genlmsg_put(skb, info->snd_pid, info->snd_seq, | |
90 | family.id, 0, 0, | |
91 | cmd, family.version); | |
92 | if (reply == NULL) { | |
93 | nlmsg_free(skb); | |
94 | return -EINVAL; | |
95 | } | |
96 | ||
97 | *skbp = skb; | |
98 | *replyp = reply; | |
99 | return 0; | |
100 | } | |
101 | ||
f9fd8914 SN |
102 | /* |
103 | * Send taskstats data in @skb to listener with nl_pid @pid | |
104 | */ | |
105 | static int send_reply(struct sk_buff *skb, pid_t pid) | |
c757249a SN |
106 | { |
107 | struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); | |
f9fd8914 | 108 | void *reply = genlmsg_data(genlhdr); |
c757249a SN |
109 | int rc; |
110 | ||
c757249a SN |
111 | rc = genlmsg_end(skb, reply); |
112 | if (rc < 0) { | |
113 | nlmsg_free(skb); | |
114 | return rc; | |
115 | } | |
116 | ||
c757249a SN |
117 | return genlmsg_unicast(skb, pid); |
118 | } | |
119 | ||
f9fd8914 SN |
120 | /* |
121 | * Send taskstats data in @skb to listeners registered for @cpu's exit data | |
122 | */ | |
123 | static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) | |
124 | { | |
125 | struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); | |
126 | struct listener_list *listeners; | |
127 | struct listener *s, *tmp; | |
128 | struct sk_buff *skb_next, *skb_cur = skb; | |
129 | void *reply = genlmsg_data(genlhdr); | |
130 | int rc, ret; | |
131 | ||
132 | rc = genlmsg_end(skb, reply); | |
133 | if (rc < 0) { | |
134 | nlmsg_free(skb); | |
135 | return rc; | |
136 | } | |
137 | ||
138 | rc = 0; | |
139 | listeners = &per_cpu(listener_array, cpu); | |
140 | down_write(&listeners->sem); | |
141 | list_for_each_entry_safe(s, tmp, &listeners->list, list) { | |
142 | skb_next = NULL; | |
143 | if (!list_is_last(&s->list, &listeners->list)) { | |
144 | skb_next = skb_clone(skb_cur, GFP_KERNEL); | |
145 | if (!skb_next) { | |
146 | nlmsg_free(skb_cur); | |
147 | rc = -ENOMEM; | |
148 | break; | |
149 | } | |
150 | } | |
151 | ret = genlmsg_unicast(skb_cur, s->pid); | |
152 | if (ret == -ECONNREFUSED) { | |
153 | list_del(&s->list); | |
154 | kfree(s); | |
155 | rc = ret; | |
156 | } | |
157 | skb_cur = skb_next; | |
158 | } | |
159 | up_write(&listeners->sem); | |
160 | ||
161 | return rc; | |
162 | } | |
163 | ||
c757249a SN |
164 | static int fill_pid(pid_t pid, struct task_struct *pidtsk, |
165 | struct taskstats *stats) | |
166 | { | |
167 | int rc; | |
168 | struct task_struct *tsk = pidtsk; | |
169 | ||
170 | if (!pidtsk) { | |
171 | read_lock(&tasklist_lock); | |
172 | tsk = find_task_by_pid(pid); | |
173 | if (!tsk) { | |
174 | read_unlock(&tasklist_lock); | |
175 | return -ESRCH; | |
176 | } | |
177 | get_task_struct(tsk); | |
178 | read_unlock(&tasklist_lock); | |
179 | } else | |
180 | get_task_struct(tsk); | |
181 | ||
182 | /* | |
183 | * Each accounting subsystem adds calls to its functions to | |
184 | * fill in relevant parts of struct taskstsats as follows | |
185 | * | |
186 | * rc = per-task-foo(stats, tsk); | |
187 | * if (rc) | |
188 | * goto err; | |
189 | */ | |
190 | ||
6f44993f SN |
191 | rc = delayacct_add_tsk(stats, tsk); |
192 | stats->version = TASKSTATS_VERSION; | |
193 | ||
194 | /* Define err: label here if needed */ | |
c757249a SN |
195 | put_task_struct(tsk); |
196 | return rc; | |
197 | ||
198 | } | |
199 | ||
200 | static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, | |
201 | struct taskstats *stats) | |
202 | { | |
c757249a | 203 | struct task_struct *tsk, *first; |
ad4ecbcb | 204 | unsigned long flags; |
c757249a | 205 | |
ad4ecbcb SN |
206 | /* |
207 | * Add additional stats from live tasks except zombie thread group | |
208 | * leaders who are already counted with the dead tasks | |
209 | */ | |
c757249a | 210 | first = tgidtsk; |
c757249a | 211 | if (!first) { |
ad4ecbcb | 212 | read_lock(&tasklist_lock); |
c757249a SN |
213 | first = find_task_by_pid(tgid); |
214 | if (!first) { | |
215 | read_unlock(&tasklist_lock); | |
216 | return -ESRCH; | |
217 | } | |
ad4ecbcb SN |
218 | get_task_struct(first); |
219 | read_unlock(&tasklist_lock); | |
220 | } else | |
221 | get_task_struct(first); | |
222 | ||
223 | /* Start with stats from dead tasks */ | |
224 | spin_lock_irqsave(&first->signal->stats_lock, flags); | |
225 | if (first->signal->stats) | |
226 | memcpy(stats, first->signal->stats, sizeof(*stats)); | |
227 | spin_unlock_irqrestore(&first->signal->stats_lock, flags); | |
228 | ||
c757249a | 229 | tsk = first; |
ad4ecbcb | 230 | read_lock(&tasklist_lock); |
c757249a | 231 | do { |
ad4ecbcb SN |
232 | if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk)) |
233 | continue; | |
c757249a | 234 | /* |
ad4ecbcb | 235 | * Accounting subsystem can call its functions here to |
c757249a SN |
236 | * fill in relevant parts of struct taskstsats as follows |
237 | * | |
ad4ecbcb | 238 | * per-task-foo(stats, tsk); |
c757249a | 239 | */ |
ad4ecbcb | 240 | delayacct_add_tsk(stats, tsk); |
6f44993f | 241 | |
c757249a SN |
242 | } while_each_thread(first, tsk); |
243 | read_unlock(&tasklist_lock); | |
6f44993f SN |
244 | stats->version = TASKSTATS_VERSION; |
245 | ||
c757249a | 246 | /* |
ad4ecbcb SN |
247 | * Accounting subsytems can also add calls here to modify |
248 | * fields of taskstats. | |
c757249a SN |
249 | */ |
250 | ||
ad4ecbcb SN |
251 | return 0; |
252 | } | |
253 | ||
254 | ||
255 | static void fill_tgid_exit(struct task_struct *tsk) | |
256 | { | |
257 | unsigned long flags; | |
258 | ||
259 | spin_lock_irqsave(&tsk->signal->stats_lock, flags); | |
260 | if (!tsk->signal->stats) | |
261 | goto ret; | |
262 | ||
263 | /* | |
264 | * Each accounting subsystem calls its functions here to | |
265 | * accumalate its per-task stats for tsk, into the per-tgid structure | |
266 | * | |
267 | * per-task-foo(tsk->signal->stats, tsk); | |
268 | */ | |
269 | delayacct_add_tsk(tsk->signal->stats, tsk); | |
270 | ret: | |
271 | spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); | |
272 | return; | |
c757249a SN |
273 | } |
274 | ||
f9fd8914 SN |
275 | static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) |
276 | { | |
277 | struct listener_list *listeners; | |
278 | struct listener *s, *tmp; | |
279 | unsigned int cpu; | |
280 | cpumask_t mask = *maskp; | |
ad4ecbcb | 281 | |
f9fd8914 SN |
282 | if (!cpus_subset(mask, cpu_possible_map)) |
283 | return -EINVAL; | |
284 | ||
285 | if (isadd == REGISTER) { | |
286 | for_each_cpu_mask(cpu, mask) { | |
287 | s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, | |
288 | cpu_to_node(cpu)); | |
289 | if (!s) | |
290 | goto cleanup; | |
291 | s->pid = pid; | |
292 | INIT_LIST_HEAD(&s->list); | |
293 | ||
294 | listeners = &per_cpu(listener_array, cpu); | |
295 | down_write(&listeners->sem); | |
296 | list_add(&s->list, &listeners->list); | |
297 | up_write(&listeners->sem); | |
298 | } | |
299 | return 0; | |
300 | } | |
301 | ||
302 | /* Deregister or cleanup */ | |
303 | cleanup: | |
304 | for_each_cpu_mask(cpu, mask) { | |
305 | listeners = &per_cpu(listener_array, cpu); | |
306 | down_write(&listeners->sem); | |
307 | list_for_each_entry_safe(s, tmp, &listeners->list, list) { | |
308 | if (s->pid == pid) { | |
309 | list_del(&s->list); | |
310 | kfree(s); | |
311 | break; | |
312 | } | |
313 | } | |
314 | up_write(&listeners->sem); | |
315 | } | |
316 | return 0; | |
317 | } | |
318 | ||
319 | static int parse(struct nlattr *na, cpumask_t *mask) | |
320 | { | |
321 | char *data; | |
322 | int len; | |
323 | int ret; | |
324 | ||
325 | if (na == NULL) | |
326 | return 1; | |
327 | len = nla_len(na); | |
328 | if (len > TASKSTATS_CPUMASK_MAXLEN) | |
329 | return -E2BIG; | |
330 | if (len < 1) | |
331 | return -EINVAL; | |
332 | data = kmalloc(len, GFP_KERNEL); | |
333 | if (!data) | |
334 | return -ENOMEM; | |
335 | nla_strlcpy(data, na, len); | |
336 | ret = cpulist_parse(data, *mask); | |
337 | kfree(data); | |
338 | return ret; | |
339 | } | |
340 | ||
341 | static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) | |
c757249a SN |
342 | { |
343 | int rc = 0; | |
344 | struct sk_buff *rep_skb; | |
345 | struct taskstats stats; | |
346 | void *reply; | |
347 | size_t size; | |
348 | struct nlattr *na; | |
f9fd8914 SN |
349 | cpumask_t mask; |
350 | ||
351 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); | |
352 | if (rc < 0) | |
353 | return rc; | |
354 | if (rc == 0) | |
355 | return add_del_listener(info->snd_pid, &mask, REGISTER); | |
356 | ||
357 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask); | |
358 | if (rc < 0) | |
359 | return rc; | |
360 | if (rc == 0) | |
361 | return add_del_listener(info->snd_pid, &mask, DEREGISTER); | |
c757249a SN |
362 | |
363 | /* | |
364 | * Size includes space for nested attributes | |
365 | */ | |
366 | size = nla_total_size(sizeof(u32)) + | |
367 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); | |
368 | ||
369 | memset(&stats, 0, sizeof(stats)); | |
370 | rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); | |
371 | if (rc < 0) | |
372 | return rc; | |
373 | ||
374 | if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { | |
375 | u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); | |
376 | rc = fill_pid(pid, NULL, &stats); | |
377 | if (rc < 0) | |
378 | goto err; | |
379 | ||
380 | na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); | |
381 | NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid); | |
382 | NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, | |
383 | stats); | |
384 | } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { | |
385 | u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); | |
386 | rc = fill_tgid(tgid, NULL, &stats); | |
387 | if (rc < 0) | |
388 | goto err; | |
389 | ||
390 | na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); | |
391 | NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid); | |
392 | NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, | |
393 | stats); | |
394 | } else { | |
395 | rc = -EINVAL; | |
396 | goto err; | |
397 | } | |
398 | ||
399 | nla_nest_end(rep_skb, na); | |
400 | ||
f9fd8914 | 401 | return send_reply(rep_skb, info->snd_pid); |
c757249a SN |
402 | |
403 | nla_put_failure: | |
404 | return genlmsg_cancel(rep_skb, reply); | |
405 | err: | |
406 | nlmsg_free(rep_skb); | |
407 | return rc; | |
408 | } | |
409 | ||
f9fd8914 SN |
410 | void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) |
411 | { | |
412 | struct listener_list *listeners; | |
413 | struct taskstats *tmp; | |
414 | /* | |
415 | * This is the cpu on which the task is exiting currently and will | |
416 | * be the one for which the exit event is sent, even if the cpu | |
417 | * on which this function is running changes later. | |
418 | */ | |
419 | *mycpu = raw_smp_processor_id(); | |
420 | ||
421 | *ptidstats = NULL; | |
422 | tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); | |
423 | if (!tmp) | |
424 | return; | |
425 | ||
426 | listeners = &per_cpu(listener_array, *mycpu); | |
427 | down_read(&listeners->sem); | |
428 | if (!list_empty(&listeners->list)) { | |
429 | *ptidstats = tmp; | |
430 | tmp = NULL; | |
431 | } | |
432 | up_read(&listeners->sem); | |
433 | kfree(tmp); | |
434 | } | |
435 | ||
c757249a SN |
436 | /* Send pid data out on exit */ |
437 | void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, | |
f9fd8914 | 438 | int group_dead, unsigned int mycpu) |
c757249a SN |
439 | { |
440 | int rc; | |
441 | struct sk_buff *rep_skb; | |
442 | void *reply; | |
443 | size_t size; | |
444 | int is_thread_group; | |
445 | struct nlattr *na; | |
ad4ecbcb | 446 | unsigned long flags; |
c757249a SN |
447 | |
448 | if (!family_registered || !tidstats) | |
449 | return; | |
450 | ||
ad4ecbcb SN |
451 | spin_lock_irqsave(&tsk->signal->stats_lock, flags); |
452 | is_thread_group = tsk->signal->stats ? 1 : 0; | |
453 | spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); | |
c757249a | 454 | |
ad4ecbcb | 455 | rc = 0; |
c757249a SN |
456 | /* |
457 | * Size includes space for nested attributes | |
458 | */ | |
459 | size = nla_total_size(sizeof(u32)) + | |
460 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); | |
461 | ||
462 | if (is_thread_group) | |
463 | size = 2 * size; /* PID + STATS + TGID + STATS */ | |
464 | ||
465 | rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); | |
466 | if (rc < 0) | |
467 | goto ret; | |
468 | ||
469 | rc = fill_pid(tsk->pid, tsk, tidstats); | |
470 | if (rc < 0) | |
471 | goto err_skb; | |
472 | ||
473 | na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); | |
474 | NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid); | |
475 | NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, | |
476 | *tidstats); | |
477 | nla_nest_end(rep_skb, na); | |
478 | ||
ad4ecbcb SN |
479 | if (!is_thread_group) |
480 | goto send; | |
c757249a | 481 | |
c757249a | 482 | /* |
ad4ecbcb SN |
483 | * tsk has/had a thread group so fill the tsk->signal->stats structure |
484 | * Doesn't matter if tsk is the leader or the last group member leaving | |
c757249a | 485 | */ |
ad4ecbcb SN |
486 | |
487 | fill_tgid_exit(tsk); | |
488 | if (!group_dead) | |
489 | goto send; | |
c757249a SN |
490 | |
491 | na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); | |
492 | NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid); | |
ad4ecbcb | 493 | /* No locking needed for tsk->signal->stats since group is dead */ |
c757249a | 494 | NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, |
ad4ecbcb | 495 | *tsk->signal->stats); |
c757249a SN |
496 | nla_nest_end(rep_skb, na); |
497 | ||
ad4ecbcb | 498 | send: |
f9fd8914 | 499 | send_cpu_listeners(rep_skb, mycpu); |
ad4ecbcb | 500 | return; |
c757249a SN |
501 | |
502 | nla_put_failure: | |
503 | genlmsg_cancel(rep_skb, reply); | |
504 | goto ret; | |
505 | err_skb: | |
506 | nlmsg_free(rep_skb); | |
507 | ret: | |
c757249a SN |
508 | return; |
509 | } | |
510 | ||
511 | static struct genl_ops taskstats_ops = { | |
512 | .cmd = TASKSTATS_CMD_GET, | |
f9fd8914 | 513 | .doit = taskstats_user_cmd, |
c757249a SN |
514 | .policy = taskstats_cmd_get_policy, |
515 | }; | |
516 | ||
517 | /* Needed early in initialization */ | |
518 | void __init taskstats_init_early(void) | |
519 | { | |
f9fd8914 SN |
520 | unsigned int i; |
521 | ||
c757249a SN |
522 | taskstats_cache = kmem_cache_create("taskstats_cache", |
523 | sizeof(struct taskstats), | |
524 | 0, SLAB_PANIC, NULL, NULL); | |
f9fd8914 SN |
525 | for_each_possible_cpu(i) { |
526 | INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); | |
527 | init_rwsem(&(per_cpu(listener_array, i).sem)); | |
528 | } | |
c757249a SN |
529 | } |
530 | ||
531 | static int __init taskstats_init(void) | |
532 | { | |
533 | int rc; | |
534 | ||
535 | rc = genl_register_family(&family); | |
536 | if (rc) | |
537 | return rc; | |
538 | ||
539 | rc = genl_register_ops(&family, &taskstats_ops); | |
540 | if (rc < 0) | |
541 | goto err; | |
542 | ||
543 | family_registered = 1; | |
544 | return 0; | |
545 | err: | |
546 | genl_unregister_family(&family); | |
547 | return rc; | |
548 | } | |
549 | ||
550 | /* | |
551 | * late initcall ensures initialization of statistics collection | |
552 | * mechanisms precedes initialization of the taskstats interface | |
553 | */ | |
554 | late_initcall(taskstats_init); |