[PATCH] blktrace: show stats on sigint as well
[blktrace.git] / blktrace.c
1 /*
2  * block queue tracing application
3  *
4  * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
5  *
6  *  This program is free software; you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License as published by
8  *  the Free Software Foundation; either version 2 of the License, or
9  *  (at your option) any later version.
10  *
11  *  This program is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *  GNU General Public License for more details.
15  *
16  *  You should have received a copy of the GNU General Public License
17  *  along with this program; if not, write to the Free Software
18  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  *
20  */
21 #include <pthread.h>
22 #include <sys/types.h>
23 #include <sys/stat.h>
24 #include <unistd.h>
25 #include <locale.h>
26 #include <signal.h>
27 #include <fcntl.h>
28 #include <string.h>
29 #include <sys/ioctl.h>
30 #include <sys/param.h>
31 #include <sys/statfs.h>
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <sched.h>
35 #include <ctype.h>
36 #include <getopt.h>
37 #include <errno.h>
38
39 #include "blktrace.h"
40
41 static char blktrace_version[] = "0.99";
42
43 /*
44  * You may want to increase this even more, if you are logging at a high
45  * rate and see skipped/missed events
46  */
47 #define BUF_SIZE        (512 * 1024)
48 #define BUF_NR          (4)
49
50 #define OFILE_BUF       (128 * 1024)
51
52 #define RELAYFS_TYPE    0xF0B4A981
53
54 #define S_OPTS  "d:a:A:r:o:kw:Vb:n:D:"
55 static struct option l_opts[] = {
56         {
57                 .name = "dev",
58                 .has_arg = required_argument,
59                 .flag = NULL,
60                 .val = 'd'
61         },
62         {
63                 .name = "act-mask",
64                 .has_arg = required_argument,
65                 .flag = NULL,
66                 .val = 'a'
67         },
68         {
69                 .name = "set-mask",
70                 .has_arg = required_argument,
71                 .flag = NULL,
72                 .val = 'A'
73         },
74         {
75                 .name = "relay",
76                 .has_arg = required_argument,
77                 .flag = NULL,
78                 .val = 'r'
79         },
80         {
81                 .name = "output",
82                 .has_arg = required_argument,
83                 .flag = NULL,
84                 .val = 'o'
85         },
86         {
87                 .name = "kill",
88                 .has_arg = no_argument,
89                 .flag = NULL,
90                 .val = 'k'
91         },
92         {
93                 .name = "stopwatch",
94                 .has_arg = required_argument,
95                 .flag = NULL,
96                 .val = 'w'
97         },
98         {
99                 .name = "version",
100                 .has_arg = no_argument,
101                 .flag = NULL,
102                 .val = 'V'
103         },
104         {
105                 .name = "buffer-size",
106                 .has_arg = required_argument,
107                 .flag = NULL,
108                 .val = 'b'
109         },
110         {
111                 .name = "num-sub-buffers",
112                 .has_arg = required_argument,
113                 .flag = NULL,
114                 .val = 'n'
115         },
116         {
117                 .name = "output-dir",
118                 .has_arg = required_argument,
119                 .flag = NULL,
120                 .val = 'D'
121         },
122         {
123                 .name = NULL,
124         }
125 };
126
127 struct thread_information {
128         int cpu;
129         pthread_t thread;
130
131         int fd;
132         char fn[MAXPATHLEN + 64];
133         void *buf;
134         unsigned long buf_offset;
135         unsigned int buf_subbuf;
136         unsigned int sequence;
137
138         pthread_mutex_t *fd_lock;
139         FILE *ofile;
140         char *ofile_buffer;
141
142         volatile int closed;
143
144         unsigned long events_processed;
145         struct device_information *device;
146 };
147
148 struct device_information {
149         int fd;
150         char *path;
151         char buts_name[32];
152         volatile int trace_started;
153         struct thread_information *threads;
154 };
155
156 static int ncpus;
157 static struct thread_information *thread_information;
158 static int ndevs;
159 static struct device_information *device_information;
160
161 /* command line option globals */
162 static char *relay_path;
163 static char *output_name;
164 static char *output_dir;
165 static int act_mask = ~0U;
166 static int kill_running_trace;
167 static unsigned int buf_size = BUF_SIZE;
168 static unsigned int buf_nr = BUF_NR;
169
170 #define is_done()       (*(volatile int *)(&done))
171 static volatile int done;
172
173 static pthread_mutex_t stdout_mutex = PTHREAD_MUTEX_INITIALIZER;
174
175 static void exit_trace(int status);
176
177 #define tip_closed(tip)         (*(volatile int *)(&(tip)->closed))
178 #define set_tip_closed(tip)     ((tip)->closed = 1)
179
180 #define dip_tracing(dip)        (*(volatile int *)(&(dip)->trace_started))
181 #define dip_set_tracing(dip, v) ((dip)->trace_started = (v))
182
183 #define __for_each_dip(__d, __i, __e)   \
184         for (__i = 0, __d = device_information; __i < __e; __i++, __d++)
185
186 #define for_each_dip(__d, __i)  __for_each_dip(__d, __i, ndevs)
187 #define for_each_tip(__d, __t, __i)     \
188         for (__i = 0, __t = (__d)->threads; __i < ncpus; __i++, __t++)
189
190 static int start_trace(struct device_information *dip)
191 {
192         struct blk_user_trace_setup buts;
193
194         memset(&buts, 0, sizeof(buts));
195         buts.buf_size = buf_size;
196         buts.buf_nr = buf_nr;
197         buts.act_mask = act_mask;
198
199         if (ioctl(dip->fd, BLKSTARTTRACE, &buts) < 0) {
200                 perror("BLKSTARTTRACE");
201                 return 1;
202         }
203
204         memcpy(dip->buts_name, buts.name, sizeof(dip->buts_name));
205         dip_set_tracing(dip, 1);
206         return 0;
207 }
208
209 static void stop_trace(struct device_information *dip)
210 {
211         if (dip_tracing(dip) || kill_running_trace) {
212                 dip_set_tracing(dip, 0);
213
214                 if (ioctl(dip->fd, BLKSTOPTRACE) < 0)
215                         perror("BLKSTOPTRACE");
216
217                 close(dip->fd);
218                 dip->fd = -1;
219         }
220 }
221
222 static void stop_all_traces(void)
223 {
224         struct device_information *dip;
225         int i;
226
227         for_each_dip(dip, i)
228                 stop_trace(dip);
229 }
230
231 static int read_data(struct thread_information *tip, void *buf, int len)
232 {
233         char *p = buf;
234         int ret, bytes_left = len;
235
236         while (!is_done() && bytes_left > 0) {
237                 ret = read(tip->fd, p, bytes_left);
238                 if (ret == bytes_left)
239                         return 0;
240
241                 if (ret < 0) {
242                         perror(tip->fn);
243                         fprintf(stderr,"Thread %d failed read of %s\n",
244                                 tip->cpu, tip->fn);
245                         break;
246                 } else if (ret > 0) {
247                         p += ret;
248                         bytes_left -= ret;
249                 } else
250                         usleep(1000);
251         }
252
253         return -1;
254 }
255
256 static int write_data(FILE *file, void *buf, unsigned int buf_len)
257 {
258         int ret, bytes_left;
259         char *p = buf;
260
261         bytes_left = buf_len;
262         while (bytes_left > 0) {
263                 ret = fwrite(p, bytes_left, 1, file);
264                 if (ret == 1)
265                         break;
266
267                 if (ret < 0) {
268                         perror("write");
269                         return 1;
270                 }
271         }
272
273         return 0;
274 }
275
276 static void *extract_data(struct thread_information *tip, int nb)
277 {
278         unsigned char *buf;
279
280         buf = malloc(nb);
281         if (!read_data(tip, buf, nb))
282                 return buf;
283
284         free(buf);
285         return NULL;
286 }
287
288 /*
289  * trace may start inside 'bit' or may need to be gotten further on
290  */
291 static int get_event_slow(struct thread_information *tip,
292                           struct blk_io_trace *bit)
293 {
294         const int inc = sizeof(__u32);
295         struct blk_io_trace foo;
296         unsigned int offset;
297         void *p;
298
299         /*
300          * check is trace is inside
301          */
302         offset = 0;
303         p = bit;
304         while (offset < sizeof(*bit)) {
305                 p += inc;
306                 offset += inc;
307
308                 memcpy(&foo, p, inc);
309
310                 if (CHECK_MAGIC(&foo))
311                         break;
312         }
313
314         /*
315          * part trace found inside, read the rest
316          */
317         if (offset < sizeof(*bit)) {
318                 int good_bytes = sizeof(*bit) - offset;
319
320                 memmove(bit, p, good_bytes);
321                 p = (void *) bit + good_bytes;
322
323                 return read_data(tip, p, offset);
324         }
325
326         /*
327          * nothing found, keep looking for start of trace
328          */
329         do {
330                 if (read_data(tip, bit, sizeof(bit->magic)))
331                         return -1;
332         } while (!CHECK_MAGIC(bit));
333
334         /*
335          * now get the rest of it
336          */
337         p = &bit->sequence;
338         if (!read_data(tip, p, sizeof(*bit) - inc))
339                 return -1;
340
341         return 0;
342 }
343
344 /*
345  * Sometimes relayfs screws us a little, if an event crosses a sub buffer
346  * boundary. So keep looking forward in the trace data until an event
347  * is found
348  */
349 static int get_event(struct thread_information *tip, struct blk_io_trace *bit)
350 {
351         /*
352          * optimize for the common fast case, a full trace read that
353          * succeeds
354          */
355         if (read_data(tip, bit, sizeof(*bit)))
356                 return -1;
357
358         if (CHECK_MAGIC(bit))
359                 return 0;
360
361         /*
362          * ok that didn't work, the event may start somewhere inside the
363          * trace itself
364          */
365         return get_event_slow(tip, bit);
366 }
367
368 static inline void tip_fd_unlock(struct thread_information *tip)
369 {
370         if (tip->fd_lock)
371                 pthread_mutex_unlock(tip->fd_lock);
372 }
373
374 static inline void tip_fd_lock(struct thread_information *tip)
375 {
376         if (tip->fd_lock)
377                 pthread_mutex_lock(tip->fd_lock);
378 }
379
380 static void *extract(void *arg)
381 {
382         struct thread_information *tip = arg;
383         int pdu_len;
384         char *pdu_data;
385         struct blk_io_trace t;
386         pid_t pid = getpid();
387         cpu_set_t cpu_mask;
388
389         CPU_ZERO(&cpu_mask);
390         CPU_SET((tip->cpu), &cpu_mask);
391
392         if (sched_setaffinity(pid, sizeof(cpu_mask), &cpu_mask) == -1) {
393                 perror("sched_setaffinity");
394                 exit_trace(1);
395         }
396
397         snprintf(tip->fn, sizeof(tip->fn), "%s/block/%s/trace%d",
398                         relay_path, tip->device->buts_name, tip->cpu);
399         tip->fd = open(tip->fn, O_RDONLY);
400         if (tip->fd < 0) {
401                 perror(tip->fn);
402                 fprintf(stderr,"Thread %d failed open of %s\n", tip->cpu,
403                         tip->fn);
404                 exit_trace(1);
405         }
406
407         pdu_data = NULL;
408         while (!is_done()) {
409                 if (get_event(tip, &t))
410                         break;
411
412                 if (verify_trace(&t))
413                         break;
414
415                 pdu_len = t.pdu_len;
416
417                 trace_to_be(&t);
418
419                 if (pdu_len) {
420                         pdu_data = extract_data(tip, pdu_len);
421                         if (!pdu_data)
422                                 break;
423                 }
424
425                 /*
426                  * now we have both trace and payload, get a lock on the
427                  * output descriptor and send it off
428                  */
429                 tip_fd_lock(tip);
430
431                 if (write_data(tip->ofile, &t, sizeof(t))) {
432                         tip_fd_unlock(tip);
433                         break;
434                 }
435
436                 if (pdu_data && write_data(tip->ofile, pdu_data, pdu_len)) {
437                         tip_fd_unlock(tip);
438                         break;
439                 }
440
441                 tip_fd_unlock(tip);
442
443                 if (pdu_data) {
444                         free(pdu_data);
445                         pdu_data = NULL;
446                 }
447
448                 tip->events_processed++;
449         }
450
451         exit_trace(1);
452         return NULL;
453 }
454
455 static void close_thread(struct thread_information *tip)
456 {
457         if (tip_closed(tip))
458                 return;
459
460         set_tip_closed(tip);
461
462         if (tip->fd != -1)
463                 close(tip->fd);
464         if (tip->ofile)
465                 fclose(tip->ofile);
466         if (tip->ofile_buffer)
467                 free(tip->ofile_buffer);
468
469         tip->fd = -1;
470         tip->ofile = NULL;
471         tip->ofile_buffer = NULL;
472 }
473
474 static int start_threads(struct device_information *dip)
475 {
476         struct thread_information *tip;
477         char op[64];
478         int j, pipeline = output_name && !strcmp(output_name, "-");
479         int len, mode;
480
481         for_each_tip(dip, tip, j) {
482                 tip->cpu = j;
483                 tip->device = dip;
484                 tip->fd_lock = NULL;
485                 tip->events_processed = 0;
486
487                 if (pipeline) {
488                         tip->ofile = fdopen(STDOUT_FILENO, "w");
489                         tip->fd_lock = &stdout_mutex;
490                         mode = _IOLBF;
491                         buf_size = 512;
492                 } else {
493                         len = 0;
494
495                         if (output_dir)
496                                 len = sprintf(op, "%s/", output_dir);
497
498                         if (output_name) {
499                                 sprintf(op + len, "%s.blktrace.%d", output_name,
500                                         tip->cpu);
501                         } else {
502                                 sprintf(op + len, "%s.blktrace.%d",
503                                         dip->buts_name, tip->cpu);
504                         }
505                         tip->ofile = fopen(op, "w");
506                         mode = _IOFBF;
507                         buf_size = OFILE_BUF;
508                 }
509
510                 if (tip->ofile == NULL) {
511                         perror(op);
512                         return 1;
513                 }
514
515                 tip->ofile_buffer = malloc(buf_size);
516                 if (setvbuf(tip->ofile, tip->ofile_buffer, mode, buf_size)) {
517                         perror("setvbuf");
518                         close_thread(tip);
519                         return 1;
520                 }
521
522                 if (pthread_create(&tip->thread, NULL, extract, tip)) {
523                         perror("pthread_create");
524                         close_thread(tip);
525                         return 1;
526                 }
527         }
528
529         return 0;
530 }
531
532 static void stop_threads(struct device_information *dip)
533 {
534         struct thread_information *tip;
535         long ret;
536         int i;
537
538         for_each_tip(dip, tip, i) {
539                 if (pthread_join(tip->thread, (void *) &ret))
540                         perror("thread_join");
541
542                 close_thread(tip);
543         }
544 }
545
546 static void stop_all_threads(void)
547 {
548         struct device_information *dip;
549         int i;
550
551         for_each_dip(dip, i)
552                 stop_threads(dip);
553 }
554
555 static void stop_all_tracing(void)
556 {
557         struct device_information *dip;
558         struct thread_information *tip;
559         int i, j;
560
561         for_each_dip(dip, i) {
562                 for_each_tip(dip, tip, j)
563                         close_thread(tip);
564
565                 stop_trace(dip);
566         }
567 }
568
569 static void exit_trace(int status)
570 {
571         stop_all_tracing();
572         exit(status);
573 }
574
575 static int resize_devices(char *path)
576 {
577         int size = (ndevs + 1) * sizeof(struct device_information);
578
579         device_information = realloc(device_information, size);
580         if (!device_information) {
581                 fprintf(stderr, "Out of memory, device %s (%d)\n", path, size);
582                 return 1;
583         }
584         device_information[ndevs].path = path;
585         ndevs++;
586         return 0;
587 }
588
589 static int open_devices(void)
590 {
591         struct device_information *dip;
592         int i;
593
594         for_each_dip(dip, i) {
595                 dip->fd = open(dip->path, O_RDONLY | O_NONBLOCK);
596                 if (dip->fd < 0) {
597                         perror(dip->path);
598                         return 1;
599                 }
600         }
601
602         return 0;
603 }
604
605 static int start_devices(void)
606 {
607         struct device_information *dip;
608         int i, j, size;
609
610         size = ncpus * sizeof(struct thread_information);
611         thread_information = malloc(size * ndevs);
612         if (!thread_information) {
613                 fprintf(stderr, "Out of memory, threads (%d)\n", size * ndevs);
614                 return 1;
615         }
616
617         for_each_dip(dip, i) {
618                 if (start_trace(dip)) {
619                         close(dip->fd);
620                         fprintf(stderr, "Failed to start trace on %s\n",
621                                 dip->path);
622                         break;
623                 }
624         }
625
626         if (i != ndevs) {
627                 __for_each_dip(dip, j, i)
628                         stop_trace(dip);
629
630                 return 1;
631         }
632
633         for_each_dip(dip, i) {
634                 dip->threads = thread_information + (i * ncpus);
635                 if (start_threads(dip)) {
636                         fprintf(stderr, "Failed to start worker threads\n");
637                         break;
638                 }
639         }
640
641         if (i != ndevs) {
642                 __for_each_dip(dip, j, i)
643                         stop_threads(dip);
644                 for_each_dip(dip, i)
645                         stop_trace(dip);
646
647                 return 1;
648         }
649
650         return 0;
651 }
652
653 static int get_dropped_count(const char *buts_name)
654 {
655         int fd;
656         char tmp[MAXPATHLEN + 64];
657
658         snprintf(tmp, sizeof(tmp), "%s/block/%s/dropped",
659                  relay_path, buts_name);
660
661         fd = open(tmp, O_RDONLY);
662         if (fd < 0) {
663                 /*
664                  * this may be ok, if the kernel doesn't support dropped counts
665                  */
666                 if (errno == ENOENT)
667                         return 0;
668
669                 fprintf(stderr, "Couldn't open dropped file %s\n", tmp);
670                 return -1;
671         }
672
673         if (read(fd, tmp, sizeof(tmp)) < 0) {
674                 perror(tmp);
675                 close(fd);
676                 return -1;
677         }
678
679         close(fd);
680
681         return atoi(tmp);
682 }
683
684 static void show_stats(void)
685 {
686         int i, j, dropped;
687         struct device_information *dip;
688         struct thread_information *tip;
689         unsigned long long events_processed;
690
691         if (output_name && !strcmp(output_name, "-"))
692                 return;
693
694         for_each_dip(dip, i) {
695                 printf("Device: %s\n", dip->path);
696                 events_processed = 0;
697                 for_each_tip(dip, tip, j) {
698                         printf("  CPU%3d: %20ld events\n",
699                                tip->cpu, tip->events_processed);
700                         events_processed += tip->events_processed;
701                 }
702                 dropped = get_dropped_count(dip->buts_name);
703                 printf("  Total:  %20lld events (dropped %d)\n",
704                                 events_processed, dropped);
705         }
706 }
707
708 static char usage_str[] = \
709         "-d <dev> [ -r relay path ] [ -o <output> ] [-k ] [ -w time ]\n" \
710         "[ -a action ] [ -A action mask ] [ -v ]\n\n" \
711         "\t-d Use specified device. May also be given last after options\n" \
712         "\t-r Path to mounted relayfs, defaults to /relay\n" \
713         "\t-o File(s) to send output to\n" \
714         "\t-D Directory to prepend to output file names\n" \
715         "\t-k Kill a running trace\n" \
716         "\t-w Stop after defined time, in seconds\n" \
717         "\t-a Only trace specified actions. See documentation\n" \
718         "\t-A Give trace mask as a single value. See documentation\n" \
719         "\t-b Sub buffer size in KiB\n" \
720         "\t-n Number of sub buffers\n" \
721         "\t-v Print program version info\n\n";
722
723 static void show_usage(char *program)
724 {
725         fprintf(stderr, "Usage: %s %s %s",program, blktrace_version, usage_str);
726 }
727
728 static void handle_sigint(__attribute__((__unused__)) int sig)
729 {
730         done = 1;
731         show_stats();
732 }
733
734 int main(int argc, char *argv[])
735 {
736         static char default_relay_path[] = "/relay";
737         struct statfs st;
738         int i, c;
739         int stop_watch = 0;
740         int act_mask_tmp = 0;
741
742         while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) >= 0) {
743                 switch (c) {
744                 case 'a':
745                         i = find_mask_map(optarg);
746                         if (i < 0) {
747                                 fprintf(stderr,"Invalid action mask %s\n",
748                                         optarg);
749                                 return 1;
750                         }
751                         act_mask_tmp |= i;
752                         break;
753
754                 case 'A':
755                         if ((sscanf(optarg, "%x", &i) != 1) || 
756                                                         !valid_act_opt(i)) {
757                                 fprintf(stderr,
758                                         "Invalid set action mask %s/0x%x\n",
759                                         optarg, i);
760                                 return 1;
761                         }
762                         act_mask_tmp = i;
763                         break;
764
765                 case 'd':
766                         if (resize_devices(optarg) != 0)
767                                 return 1;
768                         break;
769
770                 case 'r':
771                         relay_path = optarg;
772                         break;
773
774                 case 'o':
775                         output_name = optarg;
776                         break;
777                 case 'k':
778                         kill_running_trace = 1;
779                         break;
780                 case 'w':
781                         stop_watch = atoi(optarg);
782                         if (stop_watch <= 0) {
783                                 fprintf(stderr,
784                                         "Invalid stopwatch value (%d secs)\n",
785                                         stop_watch);
786                                 return 1;
787                         }
788                         break;
789                 case 'V':
790                         printf("%s version %s\n", argv[0], blktrace_version);
791                         return 0;
792                 case 'b':
793                         buf_size = atoi(optarg);
794                         if (buf_size <= 0 || buf_size > 16*1024) {
795                                 fprintf(stderr,
796                                         "Invalid buffer size (%d)\n", buf_size);
797                                 return 1;
798                         }
799                         buf_size <<= 10;
800                         break;
801                 case 'n':
802                         buf_nr = atoi(optarg);
803                         if (buf_nr <= 0) {
804                                 fprintf(stderr,
805                                         "Invalid buffer nr (%d)\n", buf_nr);
806                                 return 1;
807                         }
808                         break;
809                 case 'D':
810                         output_dir = optarg;
811                         break;
812                 default:
813                         show_usage(argv[0]);
814                         return 1;
815                 }
816         }
817
818         while (optind < argc) {
819                 if (resize_devices(argv[optind++]) != 0)
820                         return 1;
821         }
822
823         if (ndevs == 0) {
824                 show_usage(argv[0]);
825                 return 1;
826         }
827
828         if (!relay_path)
829                 relay_path = default_relay_path;
830
831         if (act_mask_tmp != 0)
832                 act_mask = act_mask_tmp;
833
834         if (statfs(relay_path, &st) < 0) {
835                 perror("statfs");
836                 fprintf(stderr,"%s does not appear to be a valid path\n",
837                         relay_path);
838                 return 1;
839         } else if (st.f_type != (long) RELAYFS_TYPE) {
840                 fprintf(stderr,"%s does not appear to be a relay filesystem\n",
841                         relay_path);
842                 return 1;
843         }
844
845         if (open_devices() != 0)
846                 return 1;
847
848         if (kill_running_trace) {
849                 stop_all_traces();
850                 return 0;
851         }
852
853         setlocale(LC_NUMERIC, "en_US");
854
855         ncpus = sysconf(_SC_NPROCESSORS_ONLN);
856         if (ncpus < 0) {
857                 fprintf(stderr, "sysconf(_SC_NPROCESSORS_ONLN) failed\n");
858                 return 1;
859         }
860
861         if (start_devices() != 0)
862                 return 1;
863
864         signal(SIGINT, handle_sigint);
865         signal(SIGHUP, handle_sigint);
866         signal(SIGTERM, handle_sigint);
867         signal(SIGALRM, handle_sigint);
868
869         atexit(stop_all_tracing);
870
871         if (stop_watch)
872                 alarm(stop_watch);
873
874         while (!is_done())
875                 sleep(1);
876
877         stop_all_threads();
878         stop_all_traces();
879         show_stats();
880
881         return 0;
882 }
883