[PATCH] blktrace: note that the -b option is in KiB
[blktrace.git] / blktrace.c
1 /*
2  * block queue tracing application
3  *
4  * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
5  *
6  *  This program is free software; you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License as published by
8  *  the Free Software Foundation; either version 2 of the License, or
9  *  (at your option) any later version.
10  *
11  *  This program is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *  GNU General Public License for more details.
15  *
16  *  You should have received a copy of the GNU General Public License
17  *  along with this program; if not, write to the Free Software
18  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  *
20  */
21 #include <pthread.h>
22 #include <sys/types.h>
23 #include <sys/stat.h>
24 #include <unistd.h>
25 #include <locale.h>
26 #include <signal.h>
27 #include <fcntl.h>
28 #include <string.h>
29 #include <sys/ioctl.h>
30 #include <sys/param.h>
31 #include <sys/statfs.h>
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <sched.h>
35 #include <ctype.h>
36 #include <getopt.h>
37
38 #include "blktrace.h"
39
/* Version string printed by -v and as part of the usage banner */
static char blktrace_version[] = "0.90";

/*
 * Defaults for the sub buffer size (bytes) and the number of sub buffers
 * passed to the kernel at trace setup; overridable with -b and -n
 */
#define BUF_SIZE        (128 *1024)
#define BUF_NR          (4)

/* f_type value statfs() must report for the relay mount point */
#define RELAYFS_TYPE    0xF0B4A981
46
/*
 * Build one struct mask_map initializer from a bare category name:
 * the BLK_TC_<mask> value, the short name "<mask>" and the long name
 * "BLK_TC_<mask>"
 */
#define DECLARE_MASK_MAP(mask)          { BLK_TC_##mask, #mask, "BLK_TC_"#mask }
/*
 * True if 'str' equals either the short or the long form of the map
 * entry, ignoring case
 */
#define COMPARE_MASK_MAP(mmp, str)                                      \
        (!strcasecmp((mmp)->short_form, (str)) ||                      \
         !strcasecmp((mmp)->long_form, (str)))

/* True if 'x' lies in the range [1, 1 << BLK_TC_SHIFT) */
#define VALID_SET(x)    ((1 <= (x)) && ((x) < (1 << BLK_TC_SHIFT)))
53
/*
 * Maps the textual names accepted by -a to an action mask value
 */
struct mask_map {
        int mask;               /* BLK_TC_* value */
        char *short_form;       /* name without prefix, e.g. "READ" */
        char *long_form;        /* name with prefix, e.g. "BLK_TC_READ" */
};
59
/*
 * Table of action categories that find_mask_map() searches
 */
static struct mask_map mask_maps[] = {
        DECLARE_MASK_MAP(READ),
        DECLARE_MASK_MAP(WRITE),
        DECLARE_MASK_MAP(BARRIER),
        DECLARE_MASK_MAP(SYNC),
        DECLARE_MASK_MAP(QUEUE),
        DECLARE_MASK_MAP(REQUEUE),
        DECLARE_MASK_MAP(ISSUE),
        DECLARE_MASK_MAP(COMPLETE),
        DECLARE_MASK_MAP(FS),
        DECLARE_MASK_MAP(PC),
};
72
/*
 * Short option string and matching long option table handed to
 * getopt_long() in main(). Each long option maps onto the short option
 * character given in .val.
 */
#define S_OPTS  "d:a:A:r:o:kw:vb:n:"
static struct option l_opts[] = {
        {
                .name = "dev",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'd'
        },
        {
                .name = "act-mask",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'a'
        },
        {
                .name = "set-mask",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'A'
        },
        {
                .name = "relay",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'r'
        },
        {
                .name = "output",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'o'
        },
        {
                .name = "kill",
                .has_arg = no_argument,
                .flag = NULL,
                .val = 'k'
        },
        {
                .name = "stopwatch",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'w'
        },
        {
                .name = "version",
                .has_arg = no_argument,
                .flag = NULL,
                .val = 'v'
        },
        {
                .name = "buffer size (in KiB)",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'b'
        },
        {
                .name = "nr of sub buffers",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'n'
        },
        /*
         * getopt_long() finds the end of the table by its all-zero
         * entry; without it an unknown long option makes it read past
         * the array
         */
        {
                .name = NULL,
                .has_arg = 0,
                .flag = NULL,
                .val = 0
        },
};
136
/*
 * Per-device, per-CPU state of one extraction thread
 */
struct thread_information {
        int cpu;                        /* CPU index this thread serves */
        pthread_t thread;               /* handle filled in by pthread_create() */

        int fd;                         /* relay trace file, opened in extract() */
        char fn[MAXPATHLEN + 64];       /* path of that relay trace file */
        /*
         * NOTE(review): the next four members are not referenced by the
         * code visible in this file section - confirm they are still needed
         */
        void *buf;
        unsigned long buf_offset;
        unsigned int buf_subbuf;
        unsigned int sequence;

        pthread_mutex_t *fd_lock;       /* held around writes to ofd; NULL if unused */
        int ofd;                        /* output descriptor events are written to */

        unsigned long events_processed; /* events written out so far */
        struct device_information *device;      /* device this thread belongs to */
};
154
/*
 * State of one traced block device
 */
struct device_information {
        int fd;                 /* descriptor of the device, from open_devices() */
        char *path;             /* device path as given on the command line */
        char buts_name[32];     /* name copied back from the BLKSTARTTRACE setup */
        int trace_started;      /* set once BLKSTARTTRACE succeeded */
        struct thread_information *threads;     /* first of ncpus thread slots */
};
162
/* number of CPUs as reported by sysconf(), one worker per CPU and device */
static int ncpus;
/* ndevs * ncpus thread slots, allocated in start_devices() */
static struct thread_information *thread_information;
/* number of entries in device_information */
static int ndevs;
/* device table, grown by resize_devices() */
static struct device_information *device_information;

/* command line option globals */
static char *relay_path;
static char *output_name;
/* all bits set unless -a/-A supplied a mask */
static int act_mask = ~0U;
static int kill_running_trace;
static unsigned int buf_size = BUF_SIZE;
static unsigned int buf_nr = BUF_NR;

/* read the stop flag through a volatile lvalue */
#define is_done()       (*(volatile int *)(&done))
/* set to 1 by handle_sigint() */
static volatile int done;

/* shared by all workers as fd_lock when output goes to stdout */
static pthread_mutex_t stdout_mutex = PTHREAD_MUTEX_INITIALIZER;

/* defined further down, needed by extract() */
static void exit_trace(int status);
182
183 static int find_mask_map(char *string)
184 {
185         unsigned int i;
186
187         for (i = 0; i < sizeof(mask_maps)/sizeof(mask_maps[0]); i++)
188                 if (COMPARE_MASK_MAP(&mask_maps[i], string))
189                         return mask_maps[i].mask;
190
191         return -1;
192 }
193
194 static int start_trace(struct device_information *dip)
195 {
196         struct blk_user_trace_setup buts;
197
198         memset(&buts, 0, sizeof(buts));
199         buts.buf_size = buf_size;
200         buts.buf_nr = buf_nr;
201         buts.act_mask = act_mask;
202
203         if (ioctl(dip->fd, BLKSTARTTRACE, &buts) < 0) {
204                 perror("BLKSTARTTRACE");
205                 return 1;
206         }
207
208         memcpy(dip->buts_name, buts.name, sizeof(dip->buts_name));
209         dip->trace_started = 1;
210         return 0;
211 }
212
213 static void stop_trace(struct device_information *dip)
214 {
215         if (dip->trace_started || kill_running_trace) {
216                 if (ioctl(dip->fd, BLKSTOPTRACE) < 0)
217                         perror("BLKSTOPTRACE");
218                 close(dip->fd);
219                 dip->trace_started = 0;
220         }
221 }
222
223 static void stop_all_traces(void)
224 {
225         struct device_information *dip;
226         int i;
227
228         for (dip = device_information, i = 0; i < ndevs; i++, dip++)
229                 stop_trace(dip);
230 }
231
232 static int read_data(struct thread_information *tip, void *buf, int len)
233 {
234         char *p = buf;
235         int ret, bytes_left = len;
236
237         while (!is_done() && bytes_left > 0) {
238                 ret = read(tip->fd, p, bytes_left);
239                 if (ret == bytes_left)
240                         return 0;
241
242                 if (ret < 0) {
243                         perror(tip->fn);
244                         fprintf(stderr,"Thread %d failed read of %s\n",
245                                 tip->cpu, tip->fn);
246                         break;
247                 } else if (ret > 0) {
248                         p += ret;
249                         bytes_left -= ret;
250                 } else
251                         usleep(1000);
252         }
253
254         return -1;
255 }
256
/*
 * Write all 'buf_len' bytes of 'buf' to 'fd', continuing after short
 * writes. Returns 0 on success, 1 if write() failed or made no
 * progress.
 */
static int write_data(int fd, void *buf, unsigned int buf_len)
{
        char *src = buf;
        int remaining = buf_len;

        while (remaining > 0) {
                int put = write(fd, src, remaining);

                if (put < 0) {
                        perror("write");
                        return 1;
                }
                if (put == 0) {
                        fprintf(stderr, "Zero write?\n");
                        return 1;
                }

                src += put;
                remaining -= put;
        }

        return 0;
}
282
283 static void *extract_data(struct thread_information *tip, int nb)
284 {
285         unsigned char *buf;
286
287         buf = malloc(nb);
288         if (!read_data(tip, buf, nb))
289                 return buf;
290
291         free(buf);
292         return NULL;
293 }
294
295 /*
296  * trace may start inside 'bit' or may need to be gotten further on
297  */
298 static int get_event_slow(struct thread_information *tip,
299                           struct blk_io_trace *bit)
300 {
301         const int inc = sizeof(__u32);
302         struct blk_io_trace foo;
303         int offset;
304         void *p;
305
306         /*
307          * check is trace is inside
308          */
309         offset = 0;
310         p = bit;
311         while (offset < sizeof(*bit)) {
312                 p += inc;
313                 offset += inc;
314
315                 memcpy(&foo, p, inc);
316
317                 if (CHECK_MAGIC(&foo))
318                         break;
319         }
320
321         /*
322          * part trace found inside, read the rest
323          */
324         if (offset < sizeof(*bit)) {
325                 int good_bytes = sizeof(*bit) - offset;
326
327                 memmove(bit, p, good_bytes);
328                 p = (void *) bit + good_bytes;
329
330                 return read_data(tip, p, offset);
331         }
332
333         /*
334          * nothing found, keep looking for start of trace
335          */
336         do {
337                 if (read_data(tip, bit, sizeof(bit->magic)))
338                         return -1;
339         } while (!CHECK_MAGIC(bit));
340
341         /*
342          * now get the rest of it
343          */
344         p = &bit->sequence;
345         if (!read_data(tip, p, sizeof(*bit) - inc))
346                 return -1;
347
348         return 0;
349 }
350
351 /*
352  * Sometimes relayfs screws us a little, if an event crosses a sub buffer
353  * boundary. So keep looking forward in the trace data until an event
354  * is found
355  */
356 static int get_event(struct thread_information *tip, struct blk_io_trace *bit)
357 {
358         /*
359          * optimize for the common fast case, a full trace read that
360          * succeeds
361          */
362         if (read_data(tip, bit, sizeof(*bit)))
363                 return -1;
364
365         if (CHECK_MAGIC(bit))
366                 return 0;
367
368         /*
369          * ok that didn't work, the event may start somewhere inside the
370          * trace itself
371          */
372         return get_event_slow(tip, bit);
373 }
374
375 static inline void tip_fd_unlock(struct thread_information *tip)
376 {
377         if (tip->fd_lock)
378                 pthread_mutex_unlock(tip->fd_lock);
379 }
380
381 static inline void tip_fd_lock(struct thread_information *tip)
382 {
383         if (tip->fd_lock)
384                 pthread_mutex_lock(tip->fd_lock);
385 }
386
387 static void *extract(void *arg)
388 {
389         struct thread_information *tip = arg;
390         int pdu_len;
391         char *pdu_data;
392         struct blk_io_trace t;
393         pid_t pid = getpid();
394         cpu_set_t cpu_mask;
395
396         CPU_ZERO(&cpu_mask);
397         CPU_SET((tip->cpu), &cpu_mask);
398
399         if (sched_setaffinity(pid, sizeof(cpu_mask), &cpu_mask) == -1) {
400                 perror("sched_setaffinity");
401                 exit_trace(1);
402         }
403
404         snprintf(tip->fn, sizeof(tip->fn), "%s/block/%s/trace%d",
405                         relay_path, tip->device->buts_name, tip->cpu);
406         tip->fd = open(tip->fn, O_RDONLY);
407         if (tip->fd < 0) {
408                 perror(tip->fn);
409                 fprintf(stderr,"Thread %d failed open of %s\n", tip->cpu,
410                         tip->fn);
411                 exit_trace(1);
412         }
413
414         pdu_data = NULL;
415         while (!is_done()) {
416                 if (get_event(tip, &t))
417                         break;
418
419                 if (verify_trace(&t))
420                         break;
421
422                 pdu_len = t.pdu_len;
423
424                 trace_to_be(&t);
425
426                 if (pdu_len) {
427                         pdu_data = extract_data(tip, pdu_len);
428                         if (!pdu_data)
429                                 break;
430                 }
431
432                 /*
433                  * now we have both trace and payload, get a lock on the
434                  * output descriptor and send it off
435                  */
436                 tip_fd_lock(tip);
437
438                 if (write_data(tip->ofd, &t, sizeof(t))) {
439                         tip_fd_unlock(tip);
440                         break;
441                 }
442
443                 if (pdu_data && write_data(tip->ofd, pdu_data, pdu_len)) {
444                         tip_fd_unlock(tip);
445                         break;
446                 }
447
448                 tip_fd_unlock(tip);
449
450                 if (pdu_data) {
451                         free(pdu_data);
452                         pdu_data = NULL;
453                 }
454
455                 tip->events_processed++;
456         }
457
458         exit_trace(1);
459         return NULL;
460 }
461
462 static int start_threads(struct device_information *dip)
463 {
464         struct thread_information *tip;
465         char op[64];
466         int j, pipeline = output_name && !strcmp(output_name, "-");
467
468         for (tip = dip->threads, j = 0; j < ncpus; j++, tip++) {
469                 tip->cpu = j;
470                 tip->device = dip;
471                 tip->fd_lock = NULL;
472                 tip->events_processed = 0;
473
474                 if (pipeline) {
475                         tip->ofd = dup(STDOUT_FILENO);
476                         tip->fd_lock = &stdout_mutex;
477                 } else {
478                         if (output_name) {
479                                 sprintf(op, "%s.blktrace.%d", output_name,
480                                         tip->cpu);
481                         } else {
482                                 sprintf(op, "%s.blktrace.%d",
483                                         dip->buts_name, tip->cpu);
484                         }
485                         tip->ofd = open(op, O_CREAT|O_TRUNC|O_WRONLY, 0644);
486                 }
487
488                 if (tip->ofd < 0) {
489                         perror(op);
490                         return 1;
491                 }
492
493                 if (pthread_create(&tip->thread, NULL, extract, tip)) {
494                         perror("pthread_create");
495                         close(tip->ofd);
496                         return 1;
497                 }
498         }
499
500         return 0;
501 }
502
503 static void close_thread(struct thread_information *tip)
504 {
505         if (tip->fd != -1)
506                 close(tip->fd);
507         if (tip->ofd != -1)
508                 close(tip->ofd);
509
510         tip->fd = tip->ofd = -1;
511 }
512
513 static void stop_threads(struct device_information *dip)
514 {
515         struct thread_information *tip;
516         long ret;
517         int j;
518
519         for (tip = dip->threads, j = 0; j < ncpus; j++, tip++) {
520                 if (pthread_join(tip->thread, (void *) &ret))
521                         perror("thread_join");
522                 close_thread(tip);
523         }
524 }
525
526 static void stop_all_threads(void)
527 {
528         struct device_information *dip;
529         int i;
530
531         for (dip = device_information, i = 0; i < ndevs; i++, dip++)
532                 stop_threads(dip);
533 }
534
535 static void stop_all_tracing(void)
536 {
537         struct device_information *dip;
538         struct thread_information *tip;
539         int i, j;
540
541         for (dip = device_information, i = 0; i < ndevs; i++, dip++) {
542                 for (tip = dip->threads, j = 0; j < ncpus; j++, tip++)
543                         close_thread(tip);
544                 stop_trace(dip);
545         }
546 }
547
/*
 * Shut down all tracing and terminate the process with 'status'.
 * Does not return.
 */
static void exit_trace(int status)
{
        /* close worker descriptors and stop the traces first */
        stop_all_tracing();

        exit(status);
}
553
554 static int resize_devices(char *path)
555 {
556         int size = (ndevs + 1) * sizeof(struct device_information);
557
558         device_information = realloc(device_information, size);
559         if (!device_information) {
560                 fprintf(stderr, "Out of memory, device %s (%d)\n", path, size);
561                 return 1;
562         }
563         device_information[ndevs].path = path;
564         ndevs++;
565         return 0;
566 }
567
568 static int open_devices(void)
569 {
570         struct device_information *dip;
571         int i;
572
573         for (dip = device_information, i = 0; i < ndevs; i++, dip++) {
574                 dip->fd = open(dip->path, O_RDONLY);
575                 if (dip->fd < 0) {
576                         perror(dip->path);
577                         return 1;
578                 }
579         }
580         return 0;
581 }
582
583 static int start_devices(void)
584 {
585         struct device_information *dip;
586         int i, j, size;
587
588         size = ncpus * sizeof(struct thread_information);
589         thread_information = malloc(size * ndevs);
590         if (!thread_information) {
591                 fprintf(stderr, "Out of memory, threads (%d)\n", size * ndevs);
592                 return 1;
593         }
594
595         for (dip = device_information, i = 0; i < ndevs; i++, dip++) {
596                 if (start_trace(dip)) {
597                         close(dip->fd);
598                         fprintf(stderr, "Failed to start trace on %s\n",
599                                 dip->path);
600                         break;
601                 }
602         }
603         if (i != ndevs) {
604                 for (dip = device_information, j = 0; j < i; j++, dip++)
605                         stop_trace(dip);
606                 return 1;
607         }
608
609         for (dip = device_information, i = 0; i < ndevs; i++, dip++) {
610                 dip->threads = thread_information + (i * ncpus);
611                 if (start_threads(dip)) {
612                         fprintf(stderr, "Failed to start worker threads\n");
613                         break;
614                 }
615         }
616         if (i != ndevs) {
617                 for (dip = device_information, j = 0; j < i; j++, dip++)
618                         stop_threads(dip);
619                 for (dip = device_information, i = 0; i < ndevs; i++, dip++)
620                         stop_trace(dip);
621                 return 1;
622         }
623
624         return 0;
625 }
626
627 static void show_stats(void)
628 {
629         int i, j;
630         struct device_information *dip;
631         struct thread_information *tip;
632         unsigned long long events_processed;
633
634         if (output_name && !strcmp(output_name, "-"))
635                 return;
636
637         for (dip = device_information, i = 0; i < ndevs; i++, dip++) {
638                 printf("Device: %s\n", dip->path);
639                 events_processed = 0;
640                 for (tip = dip->threads, j = 0; j < ncpus; j++, tip++) {
641                         printf("  CPU%3d: %20ld events\n",
642                                tip->cpu, tip->events_processed);
643                         events_processed += tip->events_processed;
644                 }
645                 printf("  Total:  %20lld events\n", events_processed);
646         }
647 }
648
/*
 * Usage text printed by show_usage(): a synopsis that names every
 * option, followed by one descriptive line per option
 */
static char usage_str[] =
        "-d <dev> [ -r relay path ] [ -o <output> ] [-k ] [ -w time ]\n"
        "[ -a action ] [ -A action mask ] [ -b size ] [ -n number ] [ -v ]\n\n"
        "\t-d Use specified device. May also be given last after options\n"
        "\t-r Path to mounted relayfs, defaults to /relay\n"
        "\t-o File(s) to send output to\n"
        "\t-k Kill a running trace\n"
        "\t-w Stop after defined time, in seconds\n"
        "\t-a Only trace specified actions. See documentation\n"
        "\t-A Give trace mask as a single value. See documentation\n"
        "\t-b Sub buffer size in KiB\n"
        "\t-n Number of sub buffers\n"
        "\t-v Print program version info\n\n";
662
663 static void show_usage(char *program)
664 {
665         fprintf(stderr, "Usage: %s %s %s",program, blktrace_version, usage_str);
666 }
667
668 static void handle_sigint(__attribute__((__unused__)) int sig)
669 {
670         done = 1;
671 }
672
673 int main(int argc, char *argv[])
674 {
675         static char default_relay_path[] = "/relay";
676         struct statfs st;
677         int i, c;
678         int stop_watch = 0;
679         int act_mask_tmp = 0;
680
681         while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) >= 0) {
682                 switch (c) {
683                 case 'a':
684                         i = find_mask_map(optarg);
685                         if (i < 0) {
686                                 fprintf(stderr,"Invalid action mask %s\n",
687                                         optarg);
688                                 return 1;
689                         }
690                         act_mask_tmp |= i;
691                         break;
692
693                 case 'A':
694                         if ((sscanf(optarg, "%x", &i) != 1) || !VALID_SET(i)) {
695                                 fprintf(stderr,
696                                         "Invalid set action mask %s/0x%x\n",
697                                         optarg, i);
698                                 return 1;
699                         }
700                         act_mask_tmp = i;
701                         break;
702
703                 case 'd':
704                         if (resize_devices(optarg) != 0)
705                                 return 1;
706                         break;
707
708                 case 'r':
709                         relay_path = optarg;
710                         break;
711
712                 case 'o':
713                         output_name = optarg;
714                         break;
715                 case 'k':
716                         kill_running_trace = 1;
717                         break;
718                 case 'w':
719                         stop_watch = atoi(optarg);
720                         if (stop_watch <= 0) {
721                                 fprintf(stderr,
722                                         "Invalid stopwatch value (%d secs)\n",
723                                         stop_watch);
724                                 return 1;
725                         }
726                         break;
727                 case 'v':
728                         printf("%s version %s\n", argv[0], blktrace_version);
729                         return 0;
730                 case 'b':
731                         buf_size = atoi(optarg);
732                         if (buf_size <= 0) {
733                                 fprintf(stderr,
734                                         "Invalid buffer size (%d)\n", buf_size);
735                                 return 1;
736                         }
737                         buf_size <<= 10;
738                         break;
739                 case 'n':
740                         buf_nr = atoi(optarg);
741                         if (buf_nr <= 0) {
742                                 fprintf(stderr,
743                                         "Invalid buffer nr (%d)\n", buf_nr);
744                                 return 1;
745                         }
746                         break;
747                 default:
748                         show_usage(argv[0]);
749                         return 1;
750                 }
751         }
752
753         while (optind < argc) {
754                 if (resize_devices(argv[optind++]) != 0)
755                         return 1;
756         }
757
758         if (ndevs == 0) {
759                 show_usage(argv[0]);
760                 return 1;
761         }
762
763         if (!relay_path)
764                 relay_path = default_relay_path;
765
766         if (act_mask_tmp != 0)
767                 act_mask = act_mask_tmp;
768
769         if (statfs(relay_path, &st) < 0) {
770                 perror("statfs");
771                 fprintf(stderr,"%s does not appear to be a valid path\n",
772                         relay_path);
773                 return 1;
774         } else if (st.f_type != RELAYFS_TYPE) {
775                 fprintf(stderr,"%s does not appear to be a relay filesystem\n",
776                         relay_path);
777                 return 1;
778         }
779
780         if (open_devices() != 0)
781                 return 1;
782
783         if (kill_running_trace) {
784                 stop_all_traces();
785                 return 0;
786         }
787
788         setlocale(LC_NUMERIC, "en_US");
789
790         ncpus = sysconf(_SC_NPROCESSORS_ONLN);
791         if (ncpus < 0) {
792                 fprintf(stderr, "sysconf(_SC_NPROCESSORS_ONLN) failed\n");
793                 return 1;
794         }
795
796         if (start_devices() != 0)
797                 return 1;
798
799         signal(SIGINT, handle_sigint);
800         signal(SIGHUP, handle_sigint);
801         signal(SIGTERM, handle_sigint);
802         signal(SIGALRM, handle_sigint);
803
804         atexit(stop_all_tracing);
805
806         if (stop_watch)
807                 alarm(stop_watch);
808
809         while (!is_done())
810                 sleep(1);
811
812         stop_all_threads();
813         stop_all_traces();
814         show_stats();
815
816         return 0;
817 }
818