/* kernel/trace/trace_syscalls.c */
#include <trace/syscall.h>
#include <trace/events/syscalls.h>
#include <linux/kernel.h>
#include <linux/ftrace.h>
#include <linux/perf_event.h>
#include <asm/syscall.h>

#include "trace_output.h"
#include "trace.h"

static DEFINE_MUTEX(syscall_trace_lock);
static int sys_refcount_enter;
static int sys_refcount_exit;
static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);

extern unsigned long __start_syscalls_metadata[];
extern unsigned long __stop_syscalls_metadata[];

static struct syscall_metadata **syscalls_metadata;

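/*
 * Walk the __syscalls_metadata section and return the entry whose
 * symbol name matches the symbol found at @syscall.
 */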
static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
{
        struct syscall_metadata *start;
        struct syscall_metadata *stop;
        char str[KSYM_SYMBOL_LEN];

        start = (struct syscall_metadata *)__start_syscalls_metadata;
        stop = (struct syscall_metadata *)__stop_syscalls_metadata;
        kallsyms_lookup(syscall, NULL, NULL, NULL, str);

        for ( ; start < stop; start++) {
                /*
                 * Only compare after the "sys" prefix. Archs that use
                 * syscall wrappers may have syscall symbol aliases
                 * prefixed with "SyS" instead of "sys", leading to an
                 * unwanted mismatch.
                 */
                if (start->name && !strcmp(start->name + 3, str + 3))
                        return start;
        }
        return NULL;
}

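/* Map a syscall number to its metadata, with bounds checking. */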
static struct syscall_metadata *syscall_nr_to_meta(int nr)
{
        if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
                return NULL;

        return syscalls_metadata[nr];
}

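/*
 * Render a raw sys_enter event as "name(arg: value, ...)" in the
 * trace output. With TRACE_ITER_VERBOSE set, the argument types are
 * printed as well.
 */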
enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags)
{
        struct trace_seq *s = &iter->seq;
        struct trace_entry *ent = iter->ent;
        struct syscall_trace_enter *trace;
        struct syscall_metadata *entry;
        int i, ret, syscall;

        trace = (typeof(trace))ent;
        syscall = trace->nr;
        entry = syscall_nr_to_meta(syscall);

        if (!entry)
                goto end;

        if (entry->enter_event->id != ent->type) {
                WARN_ON_ONCE(1);
                goto end;
        }

        ret = trace_seq_printf(s, "%s(", entry->name);
        if (!ret)
                return TRACE_TYPE_PARTIAL_LINE;

        for (i = 0; i < entry->nb_args; i++) {
                /* parameter types */
                if (trace_flags & TRACE_ITER_VERBOSE) {
                        ret = trace_seq_printf(s, "%s ", entry->types[i]);
                        if (!ret)
                                return TRACE_TYPE_PARTIAL_LINE;
                }
                /* parameter values */
                ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
                                       trace->args[i],
                                       i == entry->nb_args - 1 ? "" : ", ");
                if (!ret)
                        return TRACE_TYPE_PARTIAL_LINE;
        }

        ret = trace_seq_putc(s, ')');
        if (!ret)
                return TRACE_TYPE_PARTIAL_LINE;

end:
        ret = trace_seq_putc(s, '\n');
        if (!ret)
                return TRACE_TYPE_PARTIAL_LINE;

        return TRACE_TYPE_HANDLED;
}

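/* Render a raw sys_exit event as "name -> 0x<return value>". */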
enum print_line_t
print_syscall_exit(struct trace_iterator *iter, int flags)
{
        struct trace_seq *s = &iter->seq;
        struct trace_entry *ent = iter->ent;
        struct syscall_trace_exit *trace;
        int syscall;
        struct syscall_metadata *entry;
        int ret;

        trace = (typeof(trace))ent;
        syscall = trace->nr;
        entry = syscall_nr_to_meta(syscall);

        if (!entry) {
                trace_seq_printf(s, "\n");
                return TRACE_TYPE_HANDLED;
        }

        if (entry->exit_event->id != ent->type) {
                WARN_ON_ONCE(1);
                return TRACE_TYPE_UNHANDLED;
        }

        ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
                                trace->ret);
        if (!ret)
                return TRACE_TYPE_PARTIAL_LINE;

        return TRACE_TYPE_HANDLED;
}

extern char *__bad_type_size(void);

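/*
 * Expand to the "type, name, offset, size, is_signed" argument list
 * expected by trace_define_field() and the format printers below.
 * A sizeof() mismatch leaves a call to the undefined
 * __bad_type_size() in place, turning a bad type into a link error.
 */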
#define SYSCALL_FIELD(type, name)                                       \
        sizeof(type) != sizeof(trace.name) ?                            \
                __bad_type_size() :                                     \
                #type, #name, offsetof(typeof(trace), name),            \
                sizeof(trace.name), is_signed_type(type)

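/*
 * Write the "format" file contents for a syscall entry event:
 * one field line per argument, followed by the print fmt string.
 */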
int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
{
        int i;
        int ret;
        struct syscall_metadata *entry = call->data;
        struct syscall_trace_enter trace;
        int offset = offsetof(struct syscall_trace_enter, args);

        ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
                               "\tsigned:%u;\n",
                               SYSCALL_FIELD(int, nr));
        if (!ret)
                return 0;

        for (i = 0; i < entry->nb_args; i++) {
                ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i],
                                        entry->args[i]);
                if (!ret)
                        return 0;
                ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;"
                                       "\tsigned:%u;\n", offset,
                                       sizeof(unsigned long),
                                       is_signed_type(unsigned long));
                if (!ret)
                        return 0;
                offset += sizeof(unsigned long);
        }

        trace_seq_puts(s, "\nprint fmt: \"");
        for (i = 0; i < entry->nb_args; i++) {
                ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i],
                                        sizeof(unsigned long),
                                        i == entry->nb_args - 1 ? "" : ", ");
                if (!ret)
                        return 0;
        }
        trace_seq_putc(s, '"');

        for (i = 0; i < entry->nb_args; i++) {
                ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))",
                                       entry->args[i]);
                if (!ret)
                        return 0;
        }

        return trace_seq_putc(s, '\n');
}

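/*
 * Build the print_fmt string for a syscall entry event. Called once
 * with len == 0 to size the buffer, then again to fill it in.
 */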
static
int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
        int i;
        int pos = 0;

        /* When len=0, we just calculate the needed length */
#define LEN_OR_ZERO (len ? len - pos : 0)

        pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
        for (i = 0; i < entry->nb_args; i++) {
                pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
                                entry->args[i], sizeof(unsigned long),
                                i == entry->nb_args - 1 ? "" : ", ");
        }
        pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");

        for (i = 0; i < entry->nb_args; i++) {
                pos += snprintf(buf + pos, LEN_OR_ZERO,
                                ", ((unsigned long)(REC->%s))", entry->args[i]);
        }

#undef LEN_OR_ZERO

        /* return the length of print_fmt */
        return pos;
}

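/*
 * Attach a print_fmt to @call. Exit events share a static string;
 * entry events get one built from their argument list.
 */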
static int set_syscall_print_fmt(struct ftrace_event_call *call)
{
        char *print_fmt;
        int len;
        struct syscall_metadata *entry = call->data;

        if (entry->enter_event != call) {
                call->print_fmt = "\"0x%lx\", REC->ret";
                return 0;
        }

        /* First: called with 0 length to calculate the needed length */
        len = __set_enter_print_fmt(entry, NULL, 0);

        print_fmt = kmalloc(len + 1, GFP_KERNEL);
        if (!print_fmt)
                return -ENOMEM;

        /* Second: actually write the @print_fmt */
        __set_enter_print_fmt(entry, print_fmt, len + 1);
        call->print_fmt = print_fmt;

        return 0;
}

static void free_syscall_print_fmt(struct ftrace_event_call *call)
{
        struct syscall_metadata *entry = call->data;

        if (entry->enter_event == call)
                kfree(call->print_fmt);
}

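/* Write the "format" file contents for a syscall exit event. */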
int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
{
        int ret;
        struct syscall_trace_exit trace;

        ret = trace_seq_printf(s,
                               "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
                               "\tsigned:%u;\n"
                               "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
                               "\tsigned:%u;\n",
                               SYSCALL_FIELD(int, nr),
                               SYSCALL_FIELD(long, ret));
        if (!ret)
                return 0;

        return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n");
}

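/*
 * Register the fields of a syscall entry event with the event
 * filtering code: the syscall number plus one unsigned long per
 * argument.
 */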
int syscall_enter_define_fields(struct ftrace_event_call *call)
{
        struct syscall_trace_enter trace;
        struct syscall_metadata *meta = call->data;
        int ret;
        int i;
        int offset = offsetof(typeof(trace), args);

        ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
        if (ret)
                return ret;

        for (i = 0; i < meta->nb_args; i++) {
                ret = trace_define_field(call, meta->types[i],
                                         meta->args[i], offset,
                                         sizeof(unsigned long), 0,
                                         FILTER_OTHER);
                /* stop on the first field that fails to register */
                if (ret)
                        break;
                offset += sizeof(unsigned long);
        }

        return ret;
}

int syscall_exit_define_fields(struct ftrace_event_call *call)
{
        struct syscall_trace_exit trace;
        int ret;

        ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
        if (ret)
                return ret;

        ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
                                 FILTER_OTHER);

        return ret;
}

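/*
 * Tracepoint probe for sys_enter: record the syscall number and its
 * arguments into the ftrace ring buffer, honoring event filters.
 */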
void ftrace_syscall_enter(struct pt_regs *regs, long id)
{
        struct syscall_trace_enter *entry;
        struct syscall_metadata *sys_data;
        struct ring_buffer_event *event;
        struct ring_buffer *buffer;
        int size;
        int syscall_nr;

        syscall_nr = syscall_get_nr(current, regs);
        if (syscall_nr < 0)
                return;
        if (!test_bit(syscall_nr, enabled_enter_syscalls))
                return;

        sys_data = syscall_nr_to_meta(syscall_nr);
        if (!sys_data)
                return;

        size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;

        event = trace_current_buffer_lock_reserve(&buffer,
                        sys_data->enter_event->id, size, 0, 0);
        if (!event)
                return;

        entry = ring_buffer_event_data(event);
        entry->nr = syscall_nr;
        syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);

        if (!filter_current_check_discard(buffer, sys_data->enter_event,
                                          entry, event))
                trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}

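/*
 * Tracepoint probe for sys_exit: record the syscall number and its
 * return value into the ftrace ring buffer.
 */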
void ftrace_syscall_exit(struct pt_regs *regs, long ret)
{
        struct syscall_trace_exit *entry;
        struct syscall_metadata *sys_data;
        struct ring_buffer_event *event;
        struct ring_buffer *buffer;
        int syscall_nr;

        syscall_nr = syscall_get_nr(current, regs);
        if (syscall_nr < 0)
                return;
        if (!test_bit(syscall_nr, enabled_exit_syscalls))
                return;

        sys_data = syscall_nr_to_meta(syscall_nr);
        if (!sys_data)
                return;

        event = trace_current_buffer_lock_reserve(&buffer,
                        sys_data->exit_event->id, sizeof(*entry), 0, 0);
        if (!event)
                return;

        entry = ring_buffer_event_data(event);
        entry->nr = syscall_nr;
        entry->ret = syscall_get_return_value(current, regs);

        if (!filter_current_check_discard(buffer, sys_data->exit_event,
                                          entry, event))
                trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}

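/*
 * Enable/disable helpers: the sys_enter/sys_exit tracepoints are
 * registered on first use and unregistered on last, while per-syscall
 * bitmaps select which syscall numbers actually get traced.
 */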
int reg_event_syscall_enter(struct ftrace_event_call *call)
{
        int ret = 0;
        int num;

        num = ((struct syscall_metadata *)call->data)->syscall_nr;
        if (num < 0 || num >= NR_syscalls)
                return -ENOSYS;
        mutex_lock(&syscall_trace_lock);
        if (!sys_refcount_enter)
                ret = register_trace_sys_enter(ftrace_syscall_enter);
        if (!ret) {
                set_bit(num, enabled_enter_syscalls);
                sys_refcount_enter++;
        }
        mutex_unlock(&syscall_trace_lock);
        return ret;
}

void unreg_event_syscall_enter(struct ftrace_event_call *call)
{
        int num;

        num = ((struct syscall_metadata *)call->data)->syscall_nr;
        if (num < 0 || num >= NR_syscalls)
                return;
        mutex_lock(&syscall_trace_lock);
        sys_refcount_enter--;
        clear_bit(num, enabled_enter_syscalls);
        if (!sys_refcount_enter)
                unregister_trace_sys_enter(ftrace_syscall_enter);
        mutex_unlock(&syscall_trace_lock);
}

int reg_event_syscall_exit(struct ftrace_event_call *call)
{
        int ret = 0;
        int num;

        num = ((struct syscall_metadata *)call->data)->syscall_nr;
        if (num < 0 || num >= NR_syscalls)
                return -ENOSYS;
        mutex_lock(&syscall_trace_lock);
        if (!sys_refcount_exit)
                ret = register_trace_sys_exit(ftrace_syscall_exit);
        if (!ret) {
                set_bit(num, enabled_exit_syscalls);
                sys_refcount_exit++;
        }
        mutex_unlock(&syscall_trace_lock);
        return ret;
}

void unreg_event_syscall_exit(struct ftrace_event_call *call)
{
        int num;

        num = ((struct syscall_metadata *)call->data)->syscall_nr;
        if (num < 0 || num >= NR_syscalls)
                return;
        mutex_lock(&syscall_trace_lock);
        sys_refcount_exit--;
        clear_bit(num, enabled_exit_syscalls);
        if (!sys_refcount_exit)
                unregister_trace_sys_exit(ftrace_syscall_exit);
        mutex_unlock(&syscall_trace_lock);
}

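/* Per-event init: build the print_fmt, then register the event. */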
int init_syscall_trace(struct ftrace_event_call *call)
{
        int id;

        if (set_syscall_print_fmt(call) < 0)
                return -ENOMEM;

        id = trace_event_raw_init(call);
        if (id < 0) {
                free_syscall_print_fmt(call);
                return id;
        }

        return id;
}

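/*
 * Build the syscall number -> metadata table at boot by resolving
 * each entry of the syscall table through kallsyms.
 */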
int __init init_ftrace_syscalls(void)
{
        struct syscall_metadata *meta;
        unsigned long addr;
        int i;

        syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
                                        NR_syscalls, GFP_KERNEL);
        if (!syscalls_metadata) {
                WARN_ON(1);
                return -ENOMEM;
        }

        for (i = 0; i < NR_syscalls; i++) {
                addr = arch_syscall_addr(i);
                meta = find_syscall_meta(addr);
                if (!meta)
                        continue;

                meta->syscall_nr = i;
                syscalls_metadata[i] = meta;
        }

        return 0;
}
core_initcall(init_ftrace_syscalls);

#ifdef CONFIG_EVENT_PROFILE

static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls);
static int sys_prof_refcount_enter;
static int sys_prof_refcount_exit;

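/*
 * Tracepoint probe for sys_enter on the perf side: build the record
 * in the per-cpu perf trace buffer and hand it to perf_tp_event().
 */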
static void prof_syscall_enter(struct pt_regs *regs, long id)
{
        struct syscall_metadata *sys_data;
        struct syscall_trace_enter *rec;
        unsigned long flags;
        char *trace_buf;
        char *raw_data;
        int syscall_nr;
        int rctx;
        int size;
        int cpu;

        syscall_nr = syscall_get_nr(current, regs);
        if (syscall_nr < 0)
                return;
        if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
                return;

        sys_data = syscall_nr_to_meta(syscall_nr);
        if (!sys_data)
                return;

        /* get the size after alignment with the u32 buffer size field */
        size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
        size = ALIGN(size + sizeof(u32), sizeof(u64));
        size -= sizeof(u32);

        if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
                      "profile buffer not large enough"))
                return;

        /* Protect the per cpu buffer, begin the rcu read side */
        local_irq_save(flags);

        rctx = perf_swevent_get_recursion_context();
        if (rctx < 0)
                goto end_recursion;

        cpu = smp_processor_id();

        trace_buf = rcu_dereference(perf_trace_buf);
        if (!trace_buf)
                goto end;

        raw_data = per_cpu_ptr(trace_buf, cpu);

        /* zero the dead bytes from align to not leak stack to user */
        *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;

        rec = (struct syscall_trace_enter *) raw_data;
        tracing_generic_entry_update(&rec->ent, 0, 0);
        rec->ent.type = sys_data->enter_event->id;
        rec->nr = syscall_nr;
        syscall_get_arguments(current, regs, 0, sys_data->nb_args,
                               (unsigned long *)&rec->args);
        perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size);

end:
        perf_swevent_put_recursion_context(rctx);
end_recursion:
        local_irq_restore(flags);
}

int prof_sysenter_enable(struct ftrace_event_call *call)
{
        int ret = 0;
        int num;

        num = ((struct syscall_metadata *)call->data)->syscall_nr;

        mutex_lock(&syscall_trace_lock);
        if (!sys_prof_refcount_enter)
                ret = register_trace_sys_enter(prof_syscall_enter);
        if (ret) {
                pr_info("event trace: Could not activate "
                                "syscall entry trace point");
        } else {
                set_bit(num, enabled_prof_enter_syscalls);
                sys_prof_refcount_enter++;
        }
        mutex_unlock(&syscall_trace_lock);
        return ret;
}

void prof_sysenter_disable(struct ftrace_event_call *call)
{
        int num;

        num = ((struct syscall_metadata *)call->data)->syscall_nr;

        mutex_lock(&syscall_trace_lock);
        sys_prof_refcount_enter--;
        clear_bit(num, enabled_prof_enter_syscalls);
        if (!sys_prof_refcount_enter)
                unregister_trace_sys_enter(prof_syscall_enter);
        mutex_unlock(&syscall_trace_lock);
}

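/*
 * Tracepoint probe for sys_exit on the perf side: same scheme as
 * prof_syscall_enter(), but the record is fixed-size.
 */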
static void prof_syscall_exit(struct pt_regs *regs, long ret)
{
        struct syscall_metadata *sys_data;
        struct syscall_trace_exit *rec;
        unsigned long flags;
        int syscall_nr;
        char *trace_buf;
        char *raw_data;
        int rctx;
        int size;
        int cpu;

        syscall_nr = syscall_get_nr(current, regs);
        if (syscall_nr < 0)
                return;
        if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
                return;

        sys_data = syscall_nr_to_meta(syscall_nr);
        if (!sys_data)
                return;

        /* We can probably do that at build time */
        size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
        size -= sizeof(u32);

        /*
         * Impossible, but be paranoid about the future.
         * Can this check be moved out of the runtime path?
         */
        if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
                "exit event has grown above profile buffer size"))
                return;

        /* Protect the per cpu buffer, begin the rcu read side */
        local_irq_save(flags);

        rctx = perf_swevent_get_recursion_context();
        if (rctx < 0)
                goto end_recursion;

        cpu = smp_processor_id();

        trace_buf = rcu_dereference(perf_trace_buf);
        if (!trace_buf)
                goto end;

        raw_data = per_cpu_ptr(trace_buf, cpu);

        /* zero the dead bytes from align to not leak stack to user */
        *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;

        rec = (struct syscall_trace_exit *)raw_data;

        tracing_generic_entry_update(&rec->ent, 0, 0);
        rec->ent.type = sys_data->exit_event->id;
        rec->nr = syscall_nr;
        rec->ret = syscall_get_return_value(current, regs);

        perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size);

end:
        perf_swevent_put_recursion_context(rctx);
end_recursion:
        local_irq_restore(flags);
}

int prof_sysexit_enable(struct ftrace_event_call *call)
{
        int ret = 0;
        int num;

        num = ((struct syscall_metadata *)call->data)->syscall_nr;

        mutex_lock(&syscall_trace_lock);
        if (!sys_prof_refcount_exit)
                ret = register_trace_sys_exit(prof_syscall_exit);
        if (ret) {
                pr_info("event trace: Could not activate "
                                "syscall exit trace point");
        } else {
                set_bit(num, enabled_prof_exit_syscalls);
                sys_prof_refcount_exit++;
        }
        mutex_unlock(&syscall_trace_lock);
        return ret;
}

void prof_sysexit_disable(struct ftrace_event_call *call)
{
        int num;

        num = ((struct syscall_metadata *)call->data)->syscall_nr;

        mutex_lock(&syscall_trace_lock);
        sys_prof_refcount_exit--;
        clear_bit(num, enabled_prof_exit_syscalls);
        if (!sys_prof_refcount_exit)
                unregister_trace_sys_exit(prof_syscall_exit);
        mutex_unlock(&syscall_trace_lock);
}

#endif /* CONFIG_EVENT_PROFILE */