[PATCH] per-task delay accounting taskstats interface: control exit data through...
[linux-block.git] / kernel / taskstats.c
/*
 * taskstats.c - Export per-task statistics to userland
 *
 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
 *           (C) Balbir Singh, IBM Corp. 2006
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 */

#include <linux/kernel.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <net/genetlink.h>
#include <asm/atomic.h>

/*
 * Maximum length of a cpumask that can be specified in
 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute
 */
#define TASKSTATS_CPUMASK_MAXLEN        (100+6*NR_CPUS)

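/*
 * The cpumask attribute payload is an ASCII cpulist, e.g. "0-3,8" for
 * cpus 0,1,2,3 and 8 (parsed by cpulist_parse(), which wraps
 * bitmap_parselist()). The bound above allows roughly 6 characters per
 * cpu plus slack for separators.
 */
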
static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
static int family_registered;
kmem_cache_t *taskstats_cache;

static struct genl_family family = {
        .id             = GENL_ID_GENERATE,
        .name           = TASKSTATS_GENL_NAME,
        .version        = TASKSTATS_GENL_VERSION,
        .maxattr        = TASKSTATS_CMD_ATTR_MAX,
};

static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1]
__read_mostly = {
        [TASKSTATS_CMD_ATTR_PID]  = { .type = NLA_U32 },
        [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
        [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK]   = { .type = NLA_STRING },
        [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },
};

struct listener {
        struct list_head list;
        pid_t pid;
};

struct listener_list {
        struct rw_semaphore sem;
        struct list_head list;
};
static DEFINE_PER_CPU(struct listener_list, listener_array);

enum actions {
        REGISTER,
        DEREGISTER,
        CPU_DONT_CARE
};

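/*
 * Allocate a new sk_buff and start a genetlink message on it. When
 * @info is NULL the message is kernel-initiated (an exit event), so a
 * per-cpu sequence number is used; otherwise the reply reuses the
 * requester's pid and sequence number from @info.
 */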
static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
                         void **replyp, size_t size)
{
        struct sk_buff *skb;
        void *reply;

        /*
         * If new attributes are added, please revisit this allocation
         */
        skb = nlmsg_new(size);
        if (!skb)
                return -ENOMEM;

        if (!info) {
                int seq = get_cpu_var(taskstats_seqnum)++;
                put_cpu_var(taskstats_seqnum);

                reply = genlmsg_put(skb, 0, seq,
                                    family.id, 0, 0,
                                    cmd, family.version);
        } else
                reply = genlmsg_put(skb, info->snd_pid, info->snd_seq,
                                    family.id, 0, 0,
                                    cmd, family.version);
        if (reply == NULL) {
                nlmsg_free(skb);
                return -EINVAL;
        }

        *skbp = skb;
        *replyp = reply;
        return 0;
}

/*
 * Send taskstats data in @skb to listener with nl_pid @pid
 */
static int send_reply(struct sk_buff *skb, pid_t pid)
{
        struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
        void *reply = genlmsg_data(genlhdr);
        int rc;

        rc = genlmsg_end(skb, reply);
        if (rc < 0) {
                nlmsg_free(skb);
                return rc;
        }

        return genlmsg_unicast(skb, pid);
}

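/*
 * genlmsg_unicast() consumes one skb per call, so the loop below clones
 * the message for every listener except the last; a clone failure frees
 * the current skb and aborts the fanout with -ENOMEM.
 */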
/*
 * Send taskstats data in @skb to listeners registered for @cpu's exit data
 */
static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
{
        struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
        struct listener_list *listeners;
        struct listener *s, *tmp;
        struct sk_buff *skb_next, *skb_cur = skb;
        void *reply = genlmsg_data(genlhdr);
        int rc, ret;

        rc = genlmsg_end(skb, reply);
        if (rc < 0) {
                nlmsg_free(skb);
                return rc;
        }

        rc = 0;
        listeners = &per_cpu(listener_array, cpu);
        down_write(&listeners->sem);
        list_for_each_entry_safe(s, tmp, &listeners->list, list) {
                skb_next = NULL;
                if (!list_is_last(&s->list, &listeners->list)) {
                        skb_next = skb_clone(skb_cur, GFP_KERNEL);
                        if (!skb_next) {
                                nlmsg_free(skb_cur);
                                rc = -ENOMEM;
                                break;
                        }
                }
                ret = genlmsg_unicast(skb_cur, s->pid);
                if (ret == -ECONNREFUSED) {
                        list_del(&s->list);
                        kfree(s);
                        rc = ret;
                }
                skb_cur = skb_next;
        }
        up_write(&listeners->sem);

        return rc;
}

static int fill_pid(pid_t pid, struct task_struct *pidtsk,
                    struct taskstats *stats)
{
        int rc;
        struct task_struct *tsk = pidtsk;

        if (!pidtsk) {
                read_lock(&tasklist_lock);
                tsk = find_task_by_pid(pid);
                if (!tsk) {
                        read_unlock(&tasklist_lock);
                        return -ESRCH;
                }
                get_task_struct(tsk);
                read_unlock(&tasklist_lock);
        } else
                get_task_struct(tsk);

        /*
         * Each accounting subsystem adds calls to its functions to
         * fill in relevant parts of struct taskstats as follows
         *
         *      rc = per-task-foo(stats, tsk);
         *      if (rc)
         *              goto err;
         */

        rc = delayacct_add_tsk(stats, tsk);
        stats->version = TASKSTATS_VERSION;

        /* Define err: label here if needed */
        put_task_struct(tsk);
        return rc;
}

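/*
 * Illustrative only (not part of this file): a new accounting subsystem
 * hooking in via the per-task-foo() convention above might look like
 * the following, where the foo_count fields are hypothetical additions
 * to struct taskstats and struct task_struct:
 *
 *      static int per_task_foo(struct taskstats *stats,
 *                              struct task_struct *tsk)
 *      {
 *              stats->foo_count = tsk->foo_count;
 *              return 0;
 *      }
 *
 * delayacct_add_tsk() in kernel/delayacct.c is the in-tree example.
 */
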
static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
                     struct taskstats *stats)
{
        struct task_struct *tsk, *first;
        unsigned long flags;

        /*
         * Add additional stats from live tasks except zombie thread group
         * leaders who are already counted with the dead tasks
         */
        first = tgidtsk;
        if (!first) {
                read_lock(&tasklist_lock);
                first = find_task_by_pid(tgid);
                if (!first) {
                        read_unlock(&tasklist_lock);
                        return -ESRCH;
                }
                get_task_struct(first);
                read_unlock(&tasklist_lock);
        } else
                get_task_struct(first);

        /* Start with stats from dead tasks */
        spin_lock_irqsave(&first->signal->stats_lock, flags);
        if (first->signal->stats)
                memcpy(stats, first->signal->stats, sizeof(*stats));
        spin_unlock_irqrestore(&first->signal->stats_lock, flags);

        tsk = first;
        read_lock(&tasklist_lock);
        do {
                if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk))
                        continue;
                /*
                 * Accounting subsystem can call its functions here to
                 * fill in relevant parts of struct taskstats as follows
                 *
                 *      per-task-foo(stats, tsk);
                 */
                delayacct_add_tsk(stats, tsk);
        } while_each_thread(first, tsk);
        read_unlock(&tasklist_lock);

        stats->version = TASKSTATS_VERSION;

        /*
         * Accounting subsystems can also add calls here to modify
         * fields of taskstats.
         */

        put_task_struct(first); /* drop the reference taken above */
        return 0;
}

static void fill_tgid_exit(struct task_struct *tsk)
{
        unsigned long flags;

        spin_lock_irqsave(&tsk->signal->stats_lock, flags);
        if (!tsk->signal->stats)
                goto ret;

        /*
         * Each accounting subsystem calls its functions here to
         * accumulate its per-task stats for tsk, into the per-tgid structure
         *
         *      per-task-foo(tsk->signal->stats, tsk);
         */
        delayacct_add_tsk(tsk->signal->stats, tsk);
ret:
        spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
        return;
}

static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
{
        struct listener_list *listeners;
        struct listener *s, *tmp;
        unsigned int cpu;
        cpumask_t mask = *maskp;

        if (!cpus_subset(mask, cpu_possible_map))
                return -EINVAL;

        if (isadd == REGISTER) {
                for_each_cpu_mask(cpu, mask) {
                        s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
                                         cpu_to_node(cpu));
                        if (!s)
                                goto cleanup;
                        s->pid = pid;
                        INIT_LIST_HEAD(&s->list);

                        listeners = &per_cpu(listener_array, cpu);
                        down_write(&listeners->sem);
                        list_add(&s->list, &listeners->list);
                        up_write(&listeners->sem);
                }
                return 0;
        }

        /* Deregister or cleanup */
cleanup:
        for_each_cpu_mask(cpu, mask) {
                listeners = &per_cpu(listener_array, cpu);
                down_write(&listeners->sem);
                list_for_each_entry_safe(s, tmp, &listeners->list, list) {
                        if (s->pid == pid) {
                                list_del(&s->list);
                                kfree(s);
                                break;
                        }
                }
                up_write(&listeners->sem);
        }
        return 0;
}

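/*
 * Returns 1 if @na is absent, 0 on a successfully parsed cpulist
 * (cpulist_parse() returns 0 on success), and a negative errno
 * otherwise; callers treat only the 0 case as a register/deregister
 * request.
 */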
static int parse(struct nlattr *na, cpumask_t *mask)
{
        char *data;
        int len;
        int ret;

        if (na == NULL)
                return 1;
        len = nla_len(na);
        if (len > TASKSTATS_CPUMASK_MAXLEN)
                return -E2BIG;
        if (len < 1)
                return -EINVAL;
        data = kmalloc(len, GFP_KERNEL);
        if (!data)
                return -ENOMEM;
        nla_strlcpy(data, na, len);
        ret = cpulist_parse(data, *mask);
        kfree(data);
        return ret;
}

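/*
 * A single TASKSTATS_CMD_GET request carries exactly one of four
 * attributes: REGISTER_CPUMASK and DEREGISTER_CPUMASK manage exit-data
 * listeners, while PID and TGID trigger an immediate one-shot reply.
 */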
static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
{
        int rc = 0;
        struct sk_buff *rep_skb;
        struct taskstats stats;
        void *reply;
        size_t size;
        struct nlattr *na;
        cpumask_t mask;

        rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
        if (rc < 0)
                return rc;
        if (rc == 0)
                return add_del_listener(info->snd_pid, &mask, REGISTER);

        rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask);
        if (rc < 0)
                return rc;
        if (rc == 0)
                return add_del_listener(info->snd_pid, &mask, DEREGISTER);

        /*
         * Size includes space for nested attributes
         */
        size = nla_total_size(sizeof(u32)) +
                nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);

        memset(&stats, 0, sizeof(stats));
        rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
        if (rc < 0)
                return rc;

        if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
                u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
                rc = fill_pid(pid, NULL, &stats);
                if (rc < 0)
                        goto err;

                na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
                NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid);
                NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
                             stats);
        } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
                u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
                rc = fill_tgid(tgid, NULL, &stats);
                if (rc < 0)
                        goto err;

                na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
                NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid);
                NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
                             stats);
        } else {
                rc = -EINVAL;
                goto err;
        }

        nla_nest_end(rep_skb, na);

        return send_reply(rep_skb, info->snd_pid);

nla_put_failure:
        return genlmsg_cancel(rep_skb, reply);
err:
        nlmsg_free(rep_skb);
        return rc;
}

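/*
 * A minimal userspace sketch of the PID query handled above, assuming a
 * bound NETLINK_GENERIC socket nl_sd and a family_id already resolved
 * via CTRL_CMD_GETFAMILY (error handling omitted;
 * Documentation/accounting/getdelays.c is the complete in-tree example):
 *
 *      struct {
 *              struct nlmsghdr n;
 *              struct genlmsghdr g;
 *              char buf[256];
 *      } req;
 *      struct nlattr *na;
 *
 *      memset(&req, 0, sizeof(req));
 *      req.n.nlmsg_type = family_id;
 *      req.n.nlmsg_flags = NLM_F_REQUEST;
 *      req.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
 *      req.g.cmd = TASKSTATS_CMD_GET;
 *      req.g.version = TASKSTATS_GENL_VERSION;
 *
 *      na = (struct nlattr *)((char *)NLMSG_DATA(&req.n) + GENL_HDRLEN);
 *      na->nla_type = TASKSTATS_CMD_ATTR_PID;
 *      na->nla_len = NLA_HDRLEN + sizeof(__u32);
 *      *(__u32 *)((char *)na + NLA_HDRLEN) = pid;
 *      req.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
 *
 *      send(nl_sd, &req, req.n.nlmsg_len, 0);
 */
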
void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
{
        struct listener_list *listeners;
        struct taskstats *tmp;

        /*
         * This is the cpu on which the task is exiting currently and will
         * be the one for which the exit event is sent, even if the cpu
         * on which this function is running changes later.
         */
        *mycpu = raw_smp_processor_id();

        *ptidstats = NULL;
        tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
        if (!tmp)
                return;

        listeners = &per_cpu(listener_array, *mycpu);
        down_read(&listeners->sem);
        if (!list_empty(&listeners->list)) {
                *ptidstats = tmp;
                tmp = NULL;
        }
        up_read(&listeners->sem);
        kfree(tmp);
}

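/*
 * taskstats_exit_alloc() above allocates before checking for listeners:
 * the buffer is handed back only if some listener is registered on the
 * exiting task's cpu and is freed straight away otherwise, so exits on
 * unwatched cpus cost just one slab alloc/free.
 */
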
/* Send pid data out on exit */
void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
                         int group_dead, unsigned int mycpu)
{
        int rc;
        struct sk_buff *rep_skb;
        void *reply;
        size_t size;
        int is_thread_group;
        struct nlattr *na;
        unsigned long flags;

        if (!family_registered || !tidstats)
                return;

        spin_lock_irqsave(&tsk->signal->stats_lock, flags);
        is_thread_group = tsk->signal->stats ? 1 : 0;
        spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);

        rc = 0;
        /*
         * Size includes space for nested attributes
         */
        size = nla_total_size(sizeof(u32)) +
                nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);

        if (is_thread_group)
                size = 2 * size;        /* PID + STATS + TGID + STATS */

        rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
        if (rc < 0)
                goto ret;

        rc = fill_pid(tsk->pid, tsk, tidstats);
        if (rc < 0)
                goto err_skb;

        na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
        NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid);
        NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
                     *tidstats);
        nla_nest_end(rep_skb, na);

        if (!is_thread_group)
                goto send;

        /*
         * tsk has/had a thread group so fill the tsk->signal->stats structure
         * Doesn't matter if tsk is the leader or the last group member leaving
         */
        fill_tgid_exit(tsk);
        if (!group_dead)
                goto send;

        na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
        NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid);
        /* No locking needed for tsk->signal->stats since group is dead */
        NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
                     *tsk->signal->stats);
        nla_nest_end(rep_skb, na);

send:
        send_cpu_listeners(rep_skb, mycpu);
        return;

nla_put_failure:
        genlmsg_cancel(rep_skb, reply);
        goto ret;
err_skb:
        nlmsg_free(rep_skb);
ret:
        return;
}

static struct genl_ops taskstats_ops = {
        .cmd            = TASKSTATS_CMD_GET,
        .doit           = taskstats_user_cmd,
        .policy         = taskstats_cmd_get_policy,
};

/* Needed early in initialization */
void __init taskstats_init_early(void)
{
        unsigned int i;

        taskstats_cache = kmem_cache_create("taskstats_cache",
                                            sizeof(struct taskstats),
                                            0, SLAB_PANIC, NULL, NULL);
        for_each_possible_cpu(i) {
                INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
                init_rwsem(&(per_cpu(listener_array, i).sem));
        }
}

static int __init taskstats_init(void)
{
        int rc;

        rc = genl_register_family(&family);
        if (rc)
                return rc;

        rc = genl_register_ops(&family, &taskstats_ops);
        if (rc < 0)
                goto err;

        family_registered = 1;
        return 0;
err:
        genl_unregister_family(&family);
        return rc;
}

/*
 * late initcall ensures initialization of statistics collection
 * mechanisms precedes initialization of the taskstats interface
 */
late_initcall(taskstats_init);