Merge branch 'master' of ssh://axboe@master.kernel.org/pub/scm/linux/kernel/git/axboe...
[blktrace.git] / blktrace.c
1 /*
2  * block queue tracing application
3  *
4  * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
5  *
6  *  This program is free software; you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License as published by
8  *  the Free Software Foundation; either version 2 of the License, or
9  *  (at your option) any later version.
10  *
11  *  This program is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *  GNU General Public License for more details.
15  *
16  *  You should have received a copy of the GNU General Public License
17  *  along with this program; if not, write to the Free Software
18  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  *
20  */
21 #include <pthread.h>
22 #include <sys/types.h>
23 #include <sys/stat.h>
24 #include <unistd.h>
25 #include <locale.h>
26 #include <signal.h>
27 #include <fcntl.h>
28 #include <string.h>
29 #include <sys/ioctl.h>
30 #include <sys/param.h>
31 #include <sys/statfs.h>
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <sched.h>
35 #include <ctype.h>
36 #include <getopt.h>
37 #include <sys/mman.h>
38
39 #include "blktrace.h"
40
/* Relay buffer geometry: BUF_NR sub-buffers of BUF_SIZE bytes each */
#define BUF_SIZE	(128 *1024)
#define BUF_NR		(4)

/* f_type value statfs() has to report for the relay mount point */
#define RELAYFS_TYPE	0xF0B4A981

/* Expands to one mask_maps[] entry: { BLK_TC_<mask>, "<mask>", "BLK_TC_<mask>" } */
#define DECLARE_MASK_MAP(mask)		{ BLK_TC_##mask, #mask, "BLK_TC_"#mask }

/* True when str equals either name of *mmp, compared without regard to case */
#define COMPARE_MASK_MAP(mmp, str)					\
	(strcasecmp((mmp)->short_form, (str)) == 0 ||			\
	 strcasecmp((mmp)->long_form, (str)) == 0)

/* True for values in the range [1, 1 << BLK_TC_SHIFT) */
#define VALID_SET(x)	((1 <= (x)) && ((x) < (1 << BLK_TC_SHIFT)))
52
/*
 * One action-mask name table entry: the mask value together with the two
 * spellings that select it on the command line.
 */
struct mask_map {
	int mask;		/* value returned by find_mask_map() */
	char *short_form;	/* e.g. "READ" */
	char *long_form;	/* e.g. "BLK_TC_READ" */
};
58
59 struct mask_map mask_maps[] = {
60         DECLARE_MASK_MAP(READ),
61         DECLARE_MASK_MAP(WRITE),
62         DECLARE_MASK_MAP(BARRIER),
63         DECLARE_MASK_MAP(SYNC),
64         DECLARE_MASK_MAP(QUEUE),
65         DECLARE_MASK_MAP(REQUEUE),
66         DECLARE_MASK_MAP(ISSUE),
67         DECLARE_MASK_MAP(COMPLETE),
68         DECLARE_MASK_MAP(FS),
69         DECLARE_MASK_MAP(PC),
70 };
71
/*
 * Command line options.  S_OPTS is the short option string and l_opts the
 * matching long option table handed to getopt_long().
 */
#define S_OPTS	"d:a:A:r:o:kw:"
static struct option l_opts[] = {
	{
		.name = "dev",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'd'
	},
	{
		.name = "act-mask",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'a'
	},
	{
		.name = "set-mask",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'A'
	},
	{
		.name = "relay",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'r'
	},
	{
		.name = "output",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'o'
	},
	{
		.name = "kill",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 'k'
	},
	{
		.name = "stopwatch",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'w'
	},
	/*
	 * getopt_long() finds the end of the table by its all-zero element;
	 * without it an unknown long option makes it read past the array.
	 */
	{
		.name = NULL,
		.has_arg = 0,
		.flag = NULL,
		.val = 0
	},
};
117
/*
 * State of one reader thread.  There is one of these per (device, CPU)
 * pair; each thread drains a single relay trace file into its output.
 */
struct thread_information {
	int cpu;			/* CPU index; also selects trace<cpu> */
	pthread_t thread;		/* handle from pthread_create() */

	int fd;				/* relay trace file being read */
	char fn[MAXPATHLEN + 64];	/* path of that relay file */
	void *buf;			/* mapping of the relay file, when mmap is used */
	unsigned long buf_offset;	/* next read position inside buf */
	unsigned int buf_subbuf;	/* sub-buffer buf_offset currently lies in */
	unsigned int sequence;		/* sequence of the last trace accepted */

	pthread_mutex_t *fd_lock;	/* guards ofd when it is shared, else NULL */
	int ofd;			/* descriptor the traces are written to */

	unsigned long events_processed;	/* traces written so far */
	struct device_information *device;	/* device this thread serves */
};
135
/* Per-device tracing state */
struct device_information {
	int fd;				/* open descriptor on the block device */
	char *path;			/* device path as given on the command line */
	char buts_name[32];		/* name copied back from BLKSTARTTRACE */
	int trace_started;		/* non-zero once BLKSTARTTRACE succeeded */
	struct thread_information *threads;	/* this device's ncpus reader threads */
};
143
/* number of reader threads per device, and the array backing all of them */
static int ncpus;
static struct thread_information *thread_information;

/* number of traced devices, and their state array */
static int ndevs;
static struct device_information *device_information;

/* command line option globals */
static char *relay_path;		/* -r: relay mount point */
static char *output_name;		/* -o: output base name, "-" for stdout */
static int act_mask = ~0U;		/* -a/-A: action mask, default everything */
static int kill_running_trace;		/* -k: only stop a running trace */
static int use_mmap;			/* read relay files through mmap() */

/* set by the signal handler; polled by main and every reader thread */
#define is_done()	(*(volatile int *)(&done))
static volatile int done;

/* serialises writers when all threads share stdout */
static pthread_mutex_t stdout_mutex = PTHREAD_MUTEX_INITIALIZER;

static void exit_trace(int status);
162
163 static int find_mask_map(char *string)
164 {
165         int i;
166
167         for (i = 0; i < sizeof(mask_maps)/sizeof(mask_maps[0]); i++)
168                 if (COMPARE_MASK_MAP(&mask_maps[i], string))
169                         return mask_maps[i].mask;
170
171         return -1;
172 }
173
174 static int start_trace(struct device_information *dip)
175 {
176         struct blk_user_trace_setup buts;
177
178         memset(&buts, 0, sizeof(buts));
179         buts.buf_size = BUF_SIZE;
180         buts.buf_nr = BUF_NR;
181         buts.act_mask = act_mask;
182
183         if (ioctl(dip->fd, BLKSTARTTRACE, &buts) < 0) {
184                 perror("BLKSTARTTRACE");
185                 return 1;
186         }
187
188         memcpy(dip->buts_name, buts.name, sizeof(dip->buts_name));
189         dip->trace_started = 1;
190         return 0;
191 }
192
193 static void stop_trace(struct device_information *dip)
194 {
195         if (dip->trace_started || kill_running_trace) {
196                 if (ioctl(dip->fd, BLKSTOPTRACE) < 0)
197                         perror("BLKSTOPTRACE");
198                 close(dip->fd);
199                 dip->trace_started = 0;
200         }
201 }
202
203 static void stop_all_traces(void)
204 {
205         struct device_information *dip;
206         int i;
207
208         for (dip = device_information, i = 0; i < ndevs; i++, dip++)
209                 stop_trace(dip);
210 }
211
212 static int get_data_read(struct thread_information *tip, void *buf, int len)
213 {
214         char *p = buf;
215         int ret, bytes_left = len;
216
217         while (!is_done() && bytes_left > 0) {
218                 ret = read(tip->fd, p, bytes_left);
219                 if (ret == len)
220                         return 0;
221
222                 if (ret < 0) {
223                         perror(tip->fn);
224                         fprintf(stderr,"Thread %d failed read of %s\n",
225                                 tip->cpu, tip->fn);
226                         exit_trace(1);
227                 } else if (ret > 0) {
228                         fprintf(stderr,"Thread %d misread %s %d,%d\n",
229                                 tip->cpu, tip->fn, ret, len);
230                         exit_trace(1);
231                 } else {
232                         p += ret;
233                         bytes_left -= ret;
234                 }
235
236                 usleep(10000);
237         }
238
239         return -1;
240 }
241
242 static int get_data_mmap(struct thread_information *tip, void *buf, int len,
243                          int check_magic)
244 {
245         if (len > (BUF_SIZE * (tip->buf_subbuf + 1)) - tip->buf_offset) {
246                 tip->buf_subbuf++;
247                 if (tip->buf_subbuf == BUF_NR)
248                         tip->buf_subbuf = 0;
249
250                 tip->buf_offset = tip->buf_subbuf * BUF_SIZE;
251         }
252
253         while (1) {
254                 struct blk_io_trace *t = buf;
255
256                 memcpy(buf, tip->buf + tip->buf_offset, len);
257
258                 if (!check_magic)
259                         break;
260
261                 if (CHECK_MAGIC(t) && t->sequence >= tip->sequence) {
262                         tip->sequence = t->sequence;
263                         break;
264                 }
265         
266                 if (is_done())
267                         return -1;
268
269                 usleep(10000);
270         }
271
272         tip->buf_offset += len;
273         return 0;
274 }
275
276 static int get_data(struct thread_information *tip, void *buf, int len,
277                     int check_magic)
278 {
279         if (tip->buf)
280                 return get_data_mmap(tip, buf, len, check_magic);
281         else
282                 return get_data_read(tip, buf, len);
283 }
284
285 static void *extract_data(struct thread_information *tip, char *ofn, int nb)
286 {
287         unsigned char *buf;
288
289         buf = malloc(nb);
290         if (!get_data(tip, buf, nb, 0))
291                 return buf;
292
293         free(buf);
294         exit_trace(1);
295         return NULL;
296 }
297
298 static inline void tip_fd_unlock(struct thread_information *tip)
299 {
300         if (tip->fd_lock)
301                 pthread_mutex_unlock(tip->fd_lock);
302 }
303
304 static inline void tip_fd_lock(struct thread_information *tip)
305 {
306         if (tip->fd_lock)
307                 pthread_mutex_lock(tip->fd_lock);
308 }
309
310 static void *extract(void *arg)
311 {
312         struct thread_information *tip = arg;
313         int ret, pdu_len;
314         char dp[64], *pdu_data;
315         struct blk_io_trace t;
316         pid_t pid = getpid();
317         cpu_set_t cpu_mask;
318
319         CPU_ZERO(&cpu_mask);
320         CPU_SET((tip->cpu), &cpu_mask);
321
322         if (sched_setaffinity(pid, sizeof(cpu_mask), &cpu_mask) == -1) {
323                 perror("sched_setaffinity");
324                 exit_trace(1);
325         }
326
327         snprintf(tip->fn, sizeof(tip->fn), "%s/block/%s/trace%d",
328                         relay_path, tip->device->buts_name, tip->cpu);
329         tip->fd = open(tip->fn, O_RDONLY);
330         if (tip->fd < 0) {
331                 perror(tip->fn);
332                 fprintf(stderr,"Thread %d failed open of %s\n", tip->cpu,
333                         tip->fn);
334                 exit_trace(1);
335         }
336
337         if (use_mmap) {
338                 tip->buf = mmap(NULL, BUF_SIZE * BUF_NR, PROT_READ,
339                                         MAP_PRIVATE | MAP_POPULATE, tip->fd, 0);
340                 if (tip->buf == MAP_FAILED) {
341                         perror("mmap");
342                         exit_trace(1);
343                 }
344         }
345
346         pdu_data = NULL;
347         while (!is_done()) {
348                 if (get_data(tip, &t, sizeof(t), 1))
349                         break;
350
351                 if (verify_trace(&t))
352                         exit_trace(1);
353
354                 pdu_len = t.pdu_len;
355
356                 trace_to_be(&t);
357
358                 if (pdu_len)
359                         pdu_data = extract_data(tip, dp, pdu_len);
360
361                 /*
362                  * now we have both trace and payload, get a lock on the
363                  * output descriptor and send it off
364                  */
365                 tip_fd_lock(tip);
366
367                 ret = write(tip->ofd, &t, sizeof(t));
368                 if (ret < 0) {
369                         fprintf(stderr,"Thread %d failed write\n", tip->cpu);
370                         tip_fd_unlock(tip);
371                         exit_trace(1);
372                 }
373
374                 if (pdu_data) {
375                         ret = write(tip->ofd, pdu_data, pdu_len);
376                         if (ret != pdu_len) {
377                                 perror("write pdu data");
378                                 tip_fd_unlock(tip);
379                                 exit_trace(1);
380                         }
381
382                         free(pdu_data);
383                         pdu_data = NULL;
384                 }
385
386                 tip_fd_unlock(tip);
387                 tip->events_processed++;
388         }
389
390         return NULL;
391 }
392
393 static int start_threads(struct device_information *dip)
394 {
395         struct thread_information *tip;
396         char op[64];
397         int j, pipeline = output_name && !strcmp(output_name, "-");
398
399         for (tip = dip->threads, j = 0; j < ncpus; j++, tip++) {
400                 tip->cpu = j;
401                 tip->device = dip;
402                 tip->fd_lock = NULL;
403                 tip->events_processed = 0;
404
405                 if (pipeline) {
406                         tip->ofd = dup(STDOUT_FILENO);
407                         tip->fd_lock = &stdout_mutex;
408                 } else {
409                         if (output_name) {
410                                 sprintf(op, "%s.blktrace.%d", output_name,
411                                         tip->cpu);
412                         } else {
413                                 sprintf(op, "%s.blktrace.%d",
414                                         dip->buts_name, tip->cpu);
415                         }
416                         tip->ofd = open(op, O_CREAT|O_TRUNC|O_WRONLY, 0644);
417                 }
418
419                 if (tip->ofd < 0) {
420                         perror(op);
421                         return 1;
422                 }
423
424                 if (pthread_create(&tip->thread, NULL, extract, tip)) {
425                         perror("pthread_create");
426                         close(tip->ofd);
427                         return 1;
428                 }
429         }
430
431         return 0;
432 }
433
434 static void close_thread(struct thread_information *tip)
435 {
436         if (tip->buf)
437                 munmap(tip->buf, BUF_SIZE * BUF_NR);
438
439         if (tip->fd != -1)
440                 close(tip->fd);
441         if (tip->ofd != -1)
442                 close(tip->ofd);
443
444         tip->fd = tip->ofd = -1;
445 }
446
447 static void stop_threads(struct device_information *dip)
448 {
449         struct thread_information *tip;
450         long ret;
451         int j;
452
453         for (tip = dip->threads, j = 0; j < ncpus; j++, tip++) {
454                 if (pthread_join(tip->thread, (void *) &ret))
455                         perror("thread_join");
456                 close_thread(tip);
457         }
458 }
459
460 static void stop_all_threads(void)
461 {
462         struct device_information *dip;
463         int i;
464
465         for (dip = device_information, i = 0; i < ndevs; i++, dip++)
466                 stop_threads(dip);
467 }
468
469 static void stop_all_tracing(void)
470 {
471         struct device_information *dip;
472         struct thread_information *tip;
473         int i, j;
474
475         for (dip = device_information, i = 0; i < ndevs; i++, dip++) {
476                 for (tip = dip->threads, j = 0; j < ncpus; j++, tip++)
477                         close_thread(tip);
478                 stop_trace(dip);
479         }
480 }
481
/* Shut down all tracing and terminate the process with the given status */
static void exit_trace(int status)
{
	/* release relay files and kernel traces before leaving */
	stop_all_tracing();

	exit(status);
}
487
488 static int resize_devices(char *path)
489 {
490         int size = (ndevs + 1) * sizeof(struct device_information);
491
492         device_information = realloc(device_information, size);
493         if (!device_information) {
494                 fprintf(stderr, "Out of memory, device %s (%d)\n", path, size);
495                 return 1;
496         }
497         device_information[ndevs].path = path;
498         ndevs++;
499         return 0;
500 }
501
502 static int open_devices(void)
503 {
504         struct device_information *dip;
505         int i;
506
507         for (dip = device_information, i = 0; i < ndevs; i++, dip++) {
508                 dip->fd = open(dip->path, O_RDONLY);
509                 if (dip->fd < 0) {
510                         perror(dip->path);
511                         return 1;
512                 }
513         }
514         return 0;
515 }
516
517 static int start_devices(void)
518 {
519         struct device_information *dip;
520         int i, j, size;
521
522         size = ncpus * sizeof(struct thread_information);
523         thread_information = malloc(size * ndevs);
524         if (!thread_information) {
525                 fprintf(stderr, "Out of memory, threads (%d)\n", size * ndevs);
526                 return 1;
527         }
528
529         for (dip = device_information, i = 0; i < ndevs; i++, dip++) {
530                 if (start_trace(dip)) {
531                         close(dip->fd);
532                         fprintf(stderr, "Failed to start trace on %s\n",
533                                 dip->path);
534                         break;
535                 }
536         }
537         if (i != ndevs) {
538                 for (dip = device_information, j = 0; j < i; j++, dip++)
539                         stop_trace(dip);
540                 return 1;
541         }
542
543         for (dip = device_information, i = 0; i < ndevs; i++, dip++) {
544                 dip->threads = thread_information + (i * ncpus);
545                 if (start_threads(dip)) {
546                         fprintf(stderr, "Failed to start worker threads\n");
547                         break;
548                 }
549         }
550         if (i != ndevs) {
551                 for (dip = device_information, j = 0; j < i; j++, dip++)
552                         stop_threads(dip);
553                 for (dip = device_information, i = 0; i < ndevs; i++, dip++)
554                         stop_trace(dip);
555                 return 1;
556         }
557
558         return 0;
559 }
560
561 static void show_stats(void)
562 {
563         int i, j;
564         struct device_information *dip;
565         struct thread_information *tip;
566         unsigned long long events_processed;
567
568         if (output_name && !strcmp(output_name, "-"))
569                 return;
570
571         for (dip = device_information, i = 0; i < ndevs; i++, dip++) {
572                 printf("Device: %s\n", dip->path);
573                 events_processed = 0;
574                 for (tip = dip->threads, j = 0; j < ncpus; j++, tip++) {
575                         printf("  CPU%3d: %20ld events\n",
576                                tip->cpu, tip->events_processed);
577                         events_processed += tip->events_processed;
578                 }
579                 printf("  Total:  %20lld events\n", events_processed);
580         }
581 }
582   
/*
 * Print a one-line synopsis on stderr, covering every option the short
 * option string accepts.  program is the name to show, normally argv[0].
 */
static void show_usage(char *program)
{
	fprintf(stderr,"Usage: %s [-d <dev>] "
		       "[-a <trace> [-a <trace>]] [-A <hex mask>] "
		       "[-r <relay path>] [-o <output>] "
		       "[-w <seconds>] [-k] <dev>\n",
		program);
}
589
590 static void handle_sigint(int sig)
591 {
592         done = 1;
593 }
594
595 int main(int argc, char *argv[])
596 {
597         static char default_relay_path[] = "/relay";
598         struct statfs st;
599         int i, c;
600         int stop_watch = 0;
601         int act_mask_tmp = 0;
602
603         while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) >= 0) {
604                 switch (c) {
605                 case 'a':
606                         i = find_mask_map(optarg);
607                         if (i < 0) {
608                                 fprintf(stderr,"Invalid action mask %s\n",
609                                         optarg);
610                                 return 1;
611                         }
612                         act_mask_tmp |= i;
613                         break;
614
615                 case 'A':
616                         if ((sscanf(optarg, "%x", &i) != 1) || !VALID_SET(i)) {
617                                 fprintf(stderr,
618                                         "Invalid set action mask %s/0x%x\n",
619                                         optarg, i);
620                                 return 1;
621                         }
622                         act_mask_tmp = i;
623                         break;
624
625                 case 'd':
626                         if (resize_devices(optarg) != 0)
627                                 return 1;
628                         break;
629
630                 case 'r':
631                         relay_path = optarg;
632                         break;
633
634                 case 'o':
635                         output_name = optarg;
636                         break;
637                 case 'k':
638                         kill_running_trace = 1;
639                         break;
640                 case 'w':
641                         stop_watch = atoi(optarg);
642                         if (stop_watch <= 0) {
643                                 fprintf(stderr,
644                                         "Invalid stopwatch value (%d secs)\n",
645                                         stop_watch);
646                                 return 1;
647                         }
648                         break;
649
650                 default:
651                         show_usage(argv[0]);
652                         return 1;
653                 }
654         }
655
656         while (optind < argc) {
657                 if (resize_devices(argv[optind++]) != 0)
658                         return 1;
659         }
660
661         if (ndevs == 0) {
662                 show_usage(argv[0]);
663                 return 1;
664         }
665
666         if (!relay_path)
667                 relay_path = default_relay_path;
668
669         if (act_mask_tmp != 0)
670                 act_mask = act_mask_tmp;
671
672         if (statfs(relay_path, &st) < 0) {
673                 perror("statfs");
674                 fprintf(stderr,"%s does not appear to be a valid path\n",
675                         relay_path);
676                 return 1;
677         } else if (st.f_type != RELAYFS_TYPE) {
678                 fprintf(stderr,"%s does not appear to be a relay filesystem\n",
679                         relay_path);
680                 return 1;
681         }
682
683         if (open_devices() != 0)
684                 return 1;
685
686         if (kill_running_trace) {
687                 stop_all_traces();
688                 return 0;
689         }
690
691         setlocale(LC_NUMERIC, "en_US");
692
693         ncpus = sysconf(_SC_NPROCESSORS_ONLN);
694         if (ncpus < 0) {
695                 fprintf(stderr, "sysconf(_SC_NPROCESSORS_ONLN) failed\n");
696                 return 1;
697         }
698
699         if (start_devices() != 0)
700                 return 1;
701
702         signal(SIGINT, handle_sigint);
703         signal(SIGHUP, handle_sigint);
704         signal(SIGTERM, handle_sigint);
705         signal(SIGALRM, handle_sigint);
706
707         atexit(stop_all_tracing);
708
709         if (stop_watch)
710                 alarm(stop_watch);
711
712         while (!is_done())
713                 sleep(1);
714
715         stop_all_threads();
716         stop_all_traces();
717         show_stats();
718
719         return 0;
720 }
721