[PATCH] blktrace: smaller code cleanups
[blktrace.git] / blktrace.c
1 /*
2  * block queue tracing application
3  *
4  * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
5  *
6  *  This program is free software; you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License as published by
8  *  the Free Software Foundation; either version 2 of the License, or
9  *  (at your option) any later version.
10  *
11  *  This program is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *  GNU General Public License for more details.
15  *
16  *  You should have received a copy of the GNU General Public License
17  *  along with this program; if not, write to the Free Software
18  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  *
20  */
21 #include <pthread.h>
22 #include <sys/types.h>
23 #include <sys/stat.h>
24 #include <unistd.h>
25 #include <locale.h>
26 #include <signal.h>
27 #include <fcntl.h>
28 #include <string.h>
29 #include <sys/ioctl.h>
30 #include <sys/param.h>
31 #include <sys/statfs.h>
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <sched.h>
35 #include <ctype.h>
36 #include <getopt.h>
37
38 #include "blktrace.h"
39
/* Program version, printed by -V and in the usage banner. */
static char blktrace_version[] = "0.99";

/*
 * Defaults for the relay sub buffer size (bytes) and the number of sub
 * buffers; -b and -n override them.
 *
 * You may want to increase this even more, if you are logging at a high
 * rate and see skipped/missed events
 */
#define BUF_SIZE        (512 * 1024)
#define BUF_NR          (4)

/* Size of the stdio buffer attached to each per-CPU output file. */
#define OFILE_BUF       (128 * 1024)

/* Value main() expects in statfs f_type for the relay mount point. */
#define RELAYFS_TYPE    0xF0B4A981

/* Short option string handed to getopt_long(); keep in sync with l_opts. */
#define S_OPTS  "d:a:A:r:o:kw:Vb:n:D:"
/*
 * Long option table handed to getopt_long(). Every entry maps a long name
 * onto the short option character that main() switches on; the entry with
 * a NULL name terminates the table.
 */
static struct option l_opts[] = {
	/* name               has_arg            flag  val */
	{ "dev",              required_argument, NULL, 'd' },
	{ "act-mask",         required_argument, NULL, 'a' },
	{ "set-mask",         required_argument, NULL, 'A' },
	{ "relay",            required_argument, NULL, 'r' },
	{ "output",           required_argument, NULL, 'o' },
	{ "kill",             no_argument,       NULL, 'k' },
	{ "stopwatch",        required_argument, NULL, 'w' },
	{ "version",          no_argument,       NULL, 'V' },
	{ "buffer-size",      required_argument, NULL, 'b' },
	{ "num-sub-buffers",  required_argument, NULL, 'n' },
	{ "output-dir",       required_argument, NULL, 'D' },
	{ NULL,               0,                 NULL, 0   }
};
125
/*
 * Per-CPU, per-device worker state. One of these exists for every
 * (device, cpu) pair; the extract() thread owns it while running.
 */
struct thread_information {
        int cpu;                        /* CPU index; used for affinity and in file names */
        pthread_t thread;               /* handle filled in by pthread_create() */

        int fd;                         /* relay trace file opened by extract(); -1 once closed */
        char fn[MAXPATHLEN + 64];       /* path of that relay trace file */
        /* the next four members are not referenced by the code visible here */
        void *buf;
        unsigned long buf_offset;
        unsigned int buf_subbuf;
        unsigned int sequence;

        pthread_mutex_t *fd_lock;       /* NULL, or &stdout_mutex when output goes to stdout */
        FILE *ofile;                    /* stream the trace records are written to */
        char *ofile_buffer;             /* buffer handed to setvbuf() for ofile */

        volatile int closed;            /* set by close_thread() so it only runs once */

        unsigned long events_processed; /* events written by this worker */
        struct device_information *device;      /* device this worker belongs to */
};
146
/*
 * State for one traced block device.
 */
struct device_information {
        int fd;                         /* descriptor from open_devices(); -1 after stop_trace() */
        char *path;                     /* device path as given on the command line */
        char buts_name[32];             /* name copied back from the BLKSTARTTRACE setup */
        volatile int trace_started;     /* non-zero while the trace is running */
        struct thread_information *threads;     /* first of the ncpus worker slots */
};
154
/* Number of CPUs as reported by sysconf(_SC_NPROCESSORS_ONLN) in main(). */
static int ncpus;
/* Backing array for all worker slots (ndevs * ncpus), set up in start_devices(). */
static struct thread_information *thread_information;
/* Number of entries in device_information, grown by resize_devices(). */
static int ndevs;
static struct device_information *device_information;

/* command line option globals */
static char *relay_path;                /* -r, defaults to "/relay" in main() */
static char *output_name;               /* -o, "-" selects stdout */
static char *output_dir;                /* -D, prefix for output file names */
static int act_mask = ~0U;              /* -a / -A, default traces everything */
static int kill_running_trace;          /* -k */
static unsigned int buf_size = BUF_SIZE;        /* -b, stored in bytes */
static unsigned int buf_nr = BUF_NR;            /* -n */
168
/* Shutdown flag: set from the signal handler, polled through is_done(). */
#define is_done()       (*(volatile int *)(&done))
static volatile int done;

/* Serialises writers when every worker sends its output to stdout. */
static pthread_mutex_t stdout_mutex = PTHREAD_MUTEX_INITIALIZER;

static void exit_trace(int status);

/* Accessors for the per-thread "already closed" flag. */
#define tip_closed(tip)         (*(volatile int *)(&(tip)->closed))
#define set_tip_closed(tip)     ((tip)->closed = 1)

/* Accessors for the per-device "trace running" flag. */
#define dip_tracing(dip)        (*(volatile int *)(&(dip)->trace_started))
#define dip_set_tracing(dip, v) ((dip)->trace_started = (v))

/* Walk the first __e devices; __d is the cursor, __i the index. */
#define __for_each_dip(__d, __i, __e)   \
        for (__i = 0, __d = device_information; __i < __e; __i++, __d++)

/* Walk all devices, or all ncpus worker slots of device __d. */
#define for_each_dip(__d, __i)  __for_each_dip(__d, __i, ndevs)
#define for_each_tip(__d, __t, __i)     \
        for (__i = 0, __t = (__d)->threads; __i < ncpus; __i++, __t++)
188
189 static int start_trace(struct device_information *dip)
190 {
191         struct blk_user_trace_setup buts;
192
193         memset(&buts, 0, sizeof(buts));
194         buts.buf_size = buf_size;
195         buts.buf_nr = buf_nr;
196         buts.act_mask = act_mask;
197
198         if (ioctl(dip->fd, BLKSTARTTRACE, &buts) < 0) {
199                 perror("BLKSTARTTRACE");
200                 return 1;
201         }
202
203         memcpy(dip->buts_name, buts.name, sizeof(dip->buts_name));
204         dip_set_tracing(dip, 1);
205         return 0;
206 }
207
208 static void stop_trace(struct device_information *dip)
209 {
210         if (dip_tracing(dip) || kill_running_trace) {
211                 dip_set_tracing(dip, 0);
212
213                 if (ioctl(dip->fd, BLKSTOPTRACE) < 0)
214                         perror("BLKSTOPTRACE");
215
216                 close(dip->fd);
217                 dip->fd = -1;
218         }
219 }
220
221 static void stop_all_traces(void)
222 {
223         struct device_information *dip;
224         int i;
225
226         for_each_dip(dip, i)
227                 stop_trace(dip);
228 }
229
230 static int read_data(struct thread_information *tip, void *buf, int len)
231 {
232         char *p = buf;
233         int ret, bytes_left = len;
234
235         while (!is_done() && bytes_left > 0) {
236                 ret = read(tip->fd, p, bytes_left);
237                 if (ret == bytes_left)
238                         return 0;
239
240                 if (ret < 0) {
241                         perror(tip->fn);
242                         fprintf(stderr,"Thread %d failed read of %s\n",
243                                 tip->cpu, tip->fn);
244                         break;
245                 } else if (ret > 0) {
246                         p += ret;
247                         bytes_left -= ret;
248                 } else
249                         usleep(1000);
250         }
251
252         return -1;
253 }
254
/*
 * Write buf_len bytes from buf to file as a single item.
 *
 * fwrite() reports the number of complete items written and never a
 * negative value, so anything other than 1 here means the write failed.
 * Treating it as an error (instead of retrying the same call) avoids
 * spinning forever on a stream that is in an error state.
 *
 * Returns 0 on success or when buf_len is 0, 1 on a write error.
 */
static int write_data(FILE *file, void *buf, unsigned int buf_len)
{
	if (buf_len == 0)
		return 0;

	if (fwrite(buf, buf_len, 1, file) != 1) {
		perror("write");
		return 1;
	}

	return 0;
}
274
275 static void *extract_data(struct thread_information *tip, int nb)
276 {
277         unsigned char *buf;
278
279         buf = malloc(nb);
280         if (!read_data(tip, buf, nb))
281                 return buf;
282
283         free(buf);
284         return NULL;
285 }
286
287 /*
288  * trace may start inside 'bit' or may need to be gotten further on
289  */
290 static int get_event_slow(struct thread_information *tip,
291                           struct blk_io_trace *bit)
292 {
293         const int inc = sizeof(__u32);
294         struct blk_io_trace foo;
295         unsigned int offset;
296         void *p;
297
298         /*
299          * check is trace is inside
300          */
301         offset = 0;
302         p = bit;
303         while (offset < sizeof(*bit)) {
304                 p += inc;
305                 offset += inc;
306
307                 memcpy(&foo, p, inc);
308
309                 if (CHECK_MAGIC(&foo))
310                         break;
311         }
312
313         /*
314          * part trace found inside, read the rest
315          */
316         if (offset < sizeof(*bit)) {
317                 int good_bytes = sizeof(*bit) - offset;
318
319                 memmove(bit, p, good_bytes);
320                 p = (void *) bit + good_bytes;
321
322                 return read_data(tip, p, offset);
323         }
324
325         /*
326          * nothing found, keep looking for start of trace
327          */
328         do {
329                 if (read_data(tip, bit, sizeof(bit->magic)))
330                         return -1;
331         } while (!CHECK_MAGIC(bit));
332
333         /*
334          * now get the rest of it
335          */
336         p = &bit->sequence;
337         if (!read_data(tip, p, sizeof(*bit) - inc))
338                 return -1;
339
340         return 0;
341 }
342
343 /*
344  * Sometimes relayfs screws us a little, if an event crosses a sub buffer
345  * boundary. So keep looking forward in the trace data until an event
346  * is found
347  */
348 static int get_event(struct thread_information *tip, struct blk_io_trace *bit)
349 {
350         /*
351          * optimize for the common fast case, a full trace read that
352          * succeeds
353          */
354         if (read_data(tip, bit, sizeof(*bit)))
355                 return -1;
356
357         if (CHECK_MAGIC(bit))
358                 return 0;
359
360         /*
361          * ok that didn't work, the event may start somewhere inside the
362          * trace itself
363          */
364         return get_event_slow(tip, bit);
365 }
366
367 static inline void tip_fd_unlock(struct thread_information *tip)
368 {
369         if (tip->fd_lock)
370                 pthread_mutex_unlock(tip->fd_lock);
371 }
372
373 static inline void tip_fd_lock(struct thread_information *tip)
374 {
375         if (tip->fd_lock)
376                 pthread_mutex_lock(tip->fd_lock);
377 }
378
379 static void *extract(void *arg)
380 {
381         struct thread_information *tip = arg;
382         int pdu_len;
383         char *pdu_data;
384         struct blk_io_trace t;
385         pid_t pid = getpid();
386         cpu_set_t cpu_mask;
387
388         CPU_ZERO(&cpu_mask);
389         CPU_SET((tip->cpu), &cpu_mask);
390
391         if (sched_setaffinity(pid, sizeof(cpu_mask), &cpu_mask) == -1) {
392                 perror("sched_setaffinity");
393                 exit_trace(1);
394         }
395
396         snprintf(tip->fn, sizeof(tip->fn), "%s/block/%s/trace%d",
397                         relay_path, tip->device->buts_name, tip->cpu);
398         tip->fd = open(tip->fn, O_RDONLY);
399         if (tip->fd < 0) {
400                 perror(tip->fn);
401                 fprintf(stderr,"Thread %d failed open of %s\n", tip->cpu,
402                         tip->fn);
403                 exit_trace(1);
404         }
405
406         pdu_data = NULL;
407         while (!is_done()) {
408                 if (get_event(tip, &t))
409                         break;
410
411                 if (verify_trace(&t))
412                         break;
413
414                 pdu_len = t.pdu_len;
415
416                 trace_to_be(&t);
417
418                 if (pdu_len) {
419                         pdu_data = extract_data(tip, pdu_len);
420                         if (!pdu_data)
421                                 break;
422                 }
423
424                 /*
425                  * now we have both trace and payload, get a lock on the
426                  * output descriptor and send it off
427                  */
428                 tip_fd_lock(tip);
429
430                 if (write_data(tip->ofile, &t, sizeof(t))) {
431                         tip_fd_unlock(tip);
432                         break;
433                 }
434
435                 if (pdu_data && write_data(tip->ofile, pdu_data, pdu_len)) {
436                         tip_fd_unlock(tip);
437                         break;
438                 }
439
440                 tip_fd_unlock(tip);
441
442                 if (pdu_data) {
443                         free(pdu_data);
444                         pdu_data = NULL;
445                 }
446
447                 tip->events_processed++;
448         }
449
450         exit_trace(1);
451         return NULL;
452 }
453
454 static void close_thread(struct thread_information *tip)
455 {
456         if (tip_closed(tip))
457                 return;
458
459         set_tip_closed(tip);
460
461         if (tip->fd != -1)
462                 close(tip->fd);
463         if (tip->ofile)
464                 fclose(tip->ofile);
465         if (tip->ofile_buffer)
466                 free(tip->ofile_buffer);
467
468         tip->fd = -1;
469         tip->ofile = NULL;
470         tip->ofile_buffer = NULL;
471 }
472
473 static int start_threads(struct device_information *dip)
474 {
475         struct thread_information *tip;
476         char op[64];
477         int j, pipeline = output_name && !strcmp(output_name, "-");
478         int len, mode;
479
480         for_each_tip(dip, tip, j) {
481                 tip->cpu = j;
482                 tip->device = dip;
483                 tip->fd_lock = NULL;
484                 tip->events_processed = 0;
485
486                 if (pipeline) {
487                         tip->ofile = fdopen(STDOUT_FILENO, "w");
488                         tip->fd_lock = &stdout_mutex;
489                         mode = _IOLBF;
490                         buf_size = 512;
491                 } else {
492                         len = 0;
493
494                         if (output_dir)
495                                 len = sprintf(op, "%s/", output_dir);
496
497                         if (output_name) {
498                                 sprintf(op + len, "%s.blktrace.%d", output_name,
499                                         tip->cpu);
500                         } else {
501                                 sprintf(op + len, "%s.blktrace.%d",
502                                         dip->buts_name, tip->cpu);
503                         }
504                         tip->ofile = fopen(op, "w");
505                         mode = _IOFBF;
506                         buf_size = OFILE_BUF;
507                 }
508
509                 if (tip->ofile == NULL) {
510                         perror(op);
511                         return 1;
512                 }
513
514                 tip->ofile_buffer = malloc(buf_size);
515                 if (setvbuf(tip->ofile, tip->ofile_buffer, mode, buf_size)) {
516                         perror("setvbuf");
517                         close_thread(tip);
518                         return 1;
519                 }
520
521                 if (pthread_create(&tip->thread, NULL, extract, tip)) {
522                         perror("pthread_create");
523                         close_thread(tip);
524                         return 1;
525                 }
526         }
527
528         return 0;
529 }
530
531 static void stop_threads(struct device_information *dip)
532 {
533         struct thread_information *tip;
534         long ret;
535         int i;
536
537         for_each_tip(dip, tip, i) {
538                 if (pthread_join(tip->thread, (void *) &ret))
539                         perror("thread_join");
540
541                 close_thread(tip);
542         }
543 }
544
545 static void stop_all_threads(void)
546 {
547         struct device_information *dip;
548         int i;
549
550         for_each_dip(dip, i)
551                 stop_threads(dip);
552 }
553
554 static void stop_all_tracing(void)
555 {
556         struct device_information *dip;
557         struct thread_information *tip;
558         int i, j;
559
560         for_each_dip(dip, i) {
561                 for_each_tip(dip, tip, j)
562                         close_thread(tip);
563
564                 stop_trace(dip);
565         }
566 }
567
/*
 * Shut down all tracing, then terminate the process with the given exit
 * status. Does not return.
 */
static void exit_trace(int status)
{
	stop_all_tracing();

	exit(status);
}
573
574 static int resize_devices(char *path)
575 {
576         int size = (ndevs + 1) * sizeof(struct device_information);
577
578         device_information = realloc(device_information, size);
579         if (!device_information) {
580                 fprintf(stderr, "Out of memory, device %s (%d)\n", path, size);
581                 return 1;
582         }
583         device_information[ndevs].path = path;
584         ndevs++;
585         return 0;
586 }
587
588 static int open_devices(void)
589 {
590         struct device_information *dip;
591         int i;
592
593         for_each_dip(dip, i) {
594                 dip->fd = open(dip->path, O_RDONLY | O_NONBLOCK);
595                 if (dip->fd < 0) {
596                         perror(dip->path);
597                         return 1;
598                 }
599         }
600
601         return 0;
602 }
603
604 static int start_devices(void)
605 {
606         struct device_information *dip;
607         int i, j, size;
608
609         size = ncpus * sizeof(struct thread_information);
610         thread_information = malloc(size * ndevs);
611         if (!thread_information) {
612                 fprintf(stderr, "Out of memory, threads (%d)\n", size * ndevs);
613                 return 1;
614         }
615
616         for_each_dip(dip, i) {
617                 if (start_trace(dip)) {
618                         close(dip->fd);
619                         fprintf(stderr, "Failed to start trace on %s\n",
620                                 dip->path);
621                         break;
622                 }
623         }
624
625         if (i != ndevs) {
626                 __for_each_dip(dip, j, i)
627                         stop_trace(dip);
628
629                 return 1;
630         }
631
632         for_each_dip(dip, i) {
633                 dip->threads = thread_information + (i * ncpus);
634                 if (start_threads(dip)) {
635                         fprintf(stderr, "Failed to start worker threads\n");
636                         break;
637                 }
638         }
639
640         if (i != ndevs) {
641                 __for_each_dip(dip, j, i)
642                         stop_threads(dip);
643                 for_each_dip(dip, i)
644                         stop_trace(dip);
645
646                 return 1;
647         }
648
649         return 0;
650 }
651
652 static void show_stats(void)
653 {
654         int i, j;
655         struct device_information *dip;
656         struct thread_information *tip;
657         unsigned long long events_processed;
658
659         if (output_name && !strcmp(output_name, "-"))
660                 return;
661
662         for_each_dip(dip, i) {
663                 printf("Device: %s\n", dip->path);
664                 events_processed = 0;
665                 for_each_tip(dip, tip, j) {
666                         printf("  CPU%3d: %20ld events\n",
667                                tip->cpu, tip->events_processed);
668                         events_processed += tip->events_processed;
669                 }
670                 printf("  Total:  %20lld events\n", events_processed);
671         }
672 }
673
/*
 * Help text printed by show_usage(). The version option is the upper case
 * -V accepted by the option parser.
 */
static char usage_str[] =
	"-d <dev> [ -r relay path ] [ -o <output> ] [-k ] [ -w time ]\n"
	"[ -a action ] [ -A action mask ] [ -V ]\n\n"
	"\t-d Use specified device. May also be given last after options\n"
	"\t-r Path to mounted relayfs, defaults to /relay\n"
	"\t-o File(s) to send output to\n"
	"\t-D Directory to prepend to output file names\n"
	"\t-k Kill a running trace\n"
	"\t-w Stop after defined time, in seconds\n"
	"\t-a Only trace specified actions. See documentation\n"
	"\t-A Give trace mask as a single value. See documentation\n"
	"\t-b Sub buffer size in KiB\n"
	"\t-n Number of sub buffers\n"
	"\t-V Print program version info\n\n";
688
689 static void show_usage(char *program)
690 {
691         fprintf(stderr, "Usage: %s %s %s",program, blktrace_version, usage_str);
692 }
693
694 static void handle_sigint(__attribute__((__unused__)) int sig)
695 {
696         done = 1;
697 }
698
699 int main(int argc, char *argv[])
700 {
701         static char default_relay_path[] = "/relay";
702         struct statfs st;
703         int i, c;
704         int stop_watch = 0;
705         int act_mask_tmp = 0;
706
707         while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) >= 0) {
708                 switch (c) {
709                 case 'a':
710                         i = find_mask_map(optarg);
711                         if (i < 0) {
712                                 fprintf(stderr,"Invalid action mask %s\n",
713                                         optarg);
714                                 return 1;
715                         }
716                         act_mask_tmp |= i;
717                         break;
718
719                 case 'A':
720                         if ((sscanf(optarg, "%x", &i) != 1) || 
721                                                         !valid_act_opt(i)) {
722                                 fprintf(stderr,
723                                         "Invalid set action mask %s/0x%x\n",
724                                         optarg, i);
725                                 return 1;
726                         }
727                         act_mask_tmp = i;
728                         break;
729
730                 case 'd':
731                         if (resize_devices(optarg) != 0)
732                                 return 1;
733                         break;
734
735                 case 'r':
736                         relay_path = optarg;
737                         break;
738
739                 case 'o':
740                         output_name = optarg;
741                         break;
742                 case 'k':
743                         kill_running_trace = 1;
744                         break;
745                 case 'w':
746                         stop_watch = atoi(optarg);
747                         if (stop_watch <= 0) {
748                                 fprintf(stderr,
749                                         "Invalid stopwatch value (%d secs)\n",
750                                         stop_watch);
751                                 return 1;
752                         }
753                         break;
754                 case 'V':
755                         printf("%s version %s\n", argv[0], blktrace_version);
756                         return 0;
757                 case 'b':
758                         buf_size = atoi(optarg);
759                         if (buf_size <= 0 || buf_size > 16*1024) {
760                                 fprintf(stderr,
761                                         "Invalid buffer size (%d)\n", buf_size);
762                                 return 1;
763                         }
764                         buf_size <<= 10;
765                         break;
766                 case 'n':
767                         buf_nr = atoi(optarg);
768                         if (buf_nr <= 0) {
769                                 fprintf(stderr,
770                                         "Invalid buffer nr (%d)\n", buf_nr);
771                                 return 1;
772                         }
773                         break;
774                 case 'D':
775                         output_dir = optarg;
776                         break;
777                 default:
778                         show_usage(argv[0]);
779                         return 1;
780                 }
781         }
782
783         while (optind < argc) {
784                 if (resize_devices(argv[optind++]) != 0)
785                         return 1;
786         }
787
788         if (ndevs == 0) {
789                 show_usage(argv[0]);
790                 return 1;
791         }
792
793         if (!relay_path)
794                 relay_path = default_relay_path;
795
796         if (act_mask_tmp != 0)
797                 act_mask = act_mask_tmp;
798
799         if (statfs(relay_path, &st) < 0) {
800                 perror("statfs");
801                 fprintf(stderr,"%s does not appear to be a valid path\n",
802                         relay_path);
803                 return 1;
804         } else if (st.f_type != (long) RELAYFS_TYPE) {
805                 fprintf(stderr,"%s does not appear to be a relay filesystem\n",
806                         relay_path);
807                 return 1;
808         }
809
810         if (open_devices() != 0)
811                 return 1;
812
813         if (kill_running_trace) {
814                 stop_all_traces();
815                 return 0;
816         }
817
818         setlocale(LC_NUMERIC, "en_US");
819
820         ncpus = sysconf(_SC_NPROCESSORS_ONLN);
821         if (ncpus < 0) {
822                 fprintf(stderr, "sysconf(_SC_NPROCESSORS_ONLN) failed\n");
823                 return 1;
824         }
825
826         if (start_devices() != 0)
827                 return 1;
828
829         signal(SIGINT, handle_sigint);
830         signal(SIGHUP, handle_sigint);
831         signal(SIGTERM, handle_sigint);
832         signal(SIGALRM, handle_sigint);
833
834         atexit(stop_all_tracing);
835
836         if (stop_watch)
837                 alarm(stop_watch);
838
839         while (!is_done())
840                 sleep(1);
841
842         stop_all_threads();
843         stop_all_traces();
844         show_stats();
845
846         return 0;
847 }
848