2 * block queue tracing application
4 * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 #include <sys/types.h>
29 #include <sys/ioctl.h>
30 #include <sys/param.h>
31 #include <sys/statfs.h>
/* Version string printed by show_usage() and by the version option in main(). */
44 static char blktrace_version[] = "0.99";
47 * You may want to increase this even more, if you are logging at a high
48 * rate and see skipped/missed events
/* Default sub-buffer size in bytes; initial value of the buf_size global. */
50 #define BUF_SIZE (512 * 1024)
/* Used as the stdio buffer size (vbuf_size) passed to setvbuf() in start_threads(). */
53 #define OFILE_BUF (128 * 1024)
/* statfs() f_type that main() compares against the relay path's filesystem type. */
55 #define RELAYFS_TYPE 0xF0B4A981
/* Short-option string for getopt_long(); a ':' after a letter means it takes an argument. */
57 #define S_OPTS "d:a:A:r:o:kw:Vb:n:D:"
/*
 * Long-option table handed to getopt_long() in main() alongside S_OPTS.
 * required_argument entries consume a value; no_argument entries are flags.
 */
58 static struct option l_opts[] = {
61 .has_arg = required_argument,
67 .has_arg = required_argument,
73 .has_arg = required_argument,
79 .has_arg = required_argument,
85 .has_arg = required_argument,
91 .has_arg = no_argument,
97 .has_arg = required_argument,
103 .has_arg = no_argument,
108 .name = "buffer-size",
109 .has_arg = required_argument,
114 .name = "num-sub-buffers",
115 .has_arg = required_argument,
120 .name = "output-dir",
121 .has_arg = required_argument,
133 unsigned int max_len;
/*
 * FIFO_SIZE is the number of slots in the ring below. The queue/dequeue
 * helpers wrap indices with "& (FIFO_SIZE - 1)", so it must stay a power of two.
 */
136 #define FIFO_SIZE (1024) /* should be plenty big! */
137 #define CL_SIZE (128) /* cache line, any bigger? */
/*
 * Ring of sub-buffer pointers. subbuf_fifo_queue() stores at tail and
 * advances it; subbuf_fifo_dequeue() reads at head and advances it.
 * head and tail are each aligned to CL_SIZE bytes.
 */
139 struct tip_subbuf_fifo {
140 int tail __attribute__((aligned(CL_SIZE)));
141 int head __attribute__((aligned(CL_SIZE)));
142 struct tip_subbuf *q[FIFO_SIZE];
/*
 * State of one reader thread. start_devices() allocates ncpus of these per
 * traced device.
 */
145 struct thread_information {
/* path of the relay trace file, built and opened in thread_main() */
151 char fn[MAXPATHLEN + 64];
/* counters accumulated while reading/flushing; printed by show_stats() */
157 unsigned long events_processed;
158 unsigned long long data_read;
/* device this thread reads for (its buts_name is used to build fn) */
159 struct device_information *device;
/* sub-buffers queued by get_subbuf() and drained by write_tip_events() */
166 struct tip_subbuf_fifo fifo;
/* sub-buffer holding trailing bytes that flush_subbuf() kept for the next call */
167 struct tip_subbuf *leftover_ts;
170 * mmap controlled output files
/* fs_size: positions the mapping in mmap_subbuf(); final ftruncate() length in thread_main() */
172 unsigned long long fs_size;
/* fs_max_size: length the output file has been extended to via ftruncate() */
173 unsigned long long fs_max_size;
/* fs_off: offset inside the current mapping where the next read lands */
174 unsigned long fs_off;
/* fs_buf_len: length in bytes of the current mapping */
176 unsigned long fs_buf_len;
/* Per-device tracing state; entries live in the device_information array. */
179 struct device_information {
/* non-zero while a trace is running; accessed via dip_tracing()/dip_set_tracing() */
183 volatile int trace_started;
/* value returned by get_dropped_count(), stored by stop_all_traces() */
184 unsigned long drop_count;
/* this device's slice of the global thread_information array (ncpus entries) */
185 struct thread_information *threads;
/* Backing array for all reader threads; start_devices() slices it per device. */
189 static struct thread_information *thread_information;
/* Array of traced devices, grown one entry at a time by resize_devices(). */
191 static struct device_information *device_information;
193 /* command line option globals */
194 static char *relay_path;
195 static char *output_name;
196 static char *output_dir;
/* action mask sent to the kernel in start_trace(); defaults to all bits set */
197 static int act_mask = ~0U;
198 static int kill_running_trace;
/* sub-buffer size and count sent to the kernel in start_trace() */
199 static unsigned long buf_size = BUF_SIZE;
200 static unsigned long buf_nr = BUF_NR;
/* set from getpagesize() in main(); used to page-align mappings in mmap_subbuf() */
201 static unsigned int page_size;
/* Each flag below has an accessor macro that reads it through a volatile pointer. */
203 #define is_done() (*(volatile int *)(&done))
204 static volatile int done;
206 #define is_trace_stopped() (*(volatile int *)(&trace_stopped))
207 static volatile int trace_stopped;
209 #define is_stat_shown() (*(volatile int *)(&stat_shown))
210 static volatile int stat_shown;
212 static void exit_trace(int status);
/* Volatile read / plain write of a device's trace_started flag. */
214 #define dip_tracing(dip) (*(volatile int *)(&(dip)->trace_started))
215 #define dip_set_tracing(dip, v) ((dip)->trace_started = (v))
/* Iterate __d over the first __e entries of device_information, with index __i. */
217 #define __for_each_dip(__d, __i, __e) \
218 for (__i = 0, __d = device_information; __i < __e; __i++, __d++)
/* Iterate over all ndevs devices, and over the ncpus threads of one device. */
220 #define for_each_dip(__d, __i) __for_each_dip(__d, __i, ndevs)
221 #define for_each_tip(__d, __t, __j) \
222 for (__j = 0, __t = (__d)->threads; __j < ncpus; __j++, __t++)
/*
 * Read the "dropped" file for trace buts_name under relay_path.
 * tmp first holds the file path and is then reused as the read buffer.
 * NOTE(review): read() does not NUL-terminate tmp and may fill all of it;
 * confirm the code that parses the result terminates the buffer first.
 */
224 static int get_dropped_count(const char *buts_name)
227 char tmp[MAXPATHLEN + 64];
229 snprintf(tmp, sizeof(tmp), "%s/block/%s/dropped",
230 relay_path, buts_name);
232 fd = open(tmp, O_RDONLY);
235 * this may be ok, if the kernel doesn't support dropped counts
240 fprintf(stderr, "Couldn't open dropped file %s\n", tmp);
244 if (read(fd, tmp, sizeof(tmp)) < 0) {
/*
 * Configure and start tracing on one device.
 * Sends buf_size, buf_nr and act_mask with BLKTRACESETUP, then issues
 * BLKTRACESTART; either failure is reported with perror().
 */
255 static int start_trace(struct device_information *dip)
257 struct blk_user_trace_setup buts;
259 memset(&buts, 0, sizeof(buts));
260 buts.buf_size = buf_size;
261 buts.buf_nr = buf_nr;
262 buts.act_mask = act_mask;
264 if (ioctl(dip->fd, BLKTRACESETUP, &buts) < 0) {
265 perror("BLKTRACESETUP");
269 if (ioctl(dip->fd, BLKTRACESTART) < 0) {
270 perror("BLKTRACESTART");
/* buts was zeroed above, so name is presumably filled in by the setup ioctl */
274 memcpy(dip->buts_name, buts.name, sizeof(dip->buts_name));
275 dip_set_tracing(dip, 1);
/*
 * Stop and tear down the trace on one device. Runs when this process started
 * the trace or when kill_running_trace is set; ioctl failures are reported
 * with perror().
 */
279 static void stop_trace(struct device_information *dip)
281 if (dip_tracing(dip) || kill_running_trace) {
/* clear the flag first so a second call does not repeat the ioctls */
282 dip_set_tracing(dip, 0);
284 if (ioctl(dip->fd, BLKTRACESTOP) < 0)
285 perror("BLKTRACESTOP");
286 if (ioctl(dip->fd, BLKTRACETEARDOWN) < 0)
287 perror("BLKTRACETEARDOWN");
/*
 * Walk every device and record its dropped-event count.
 * NOTE(review): get_dropped_count() returns int while drop_count is
 * unsigned long; confirm a negative return cannot be stored here.
 */
294 static void stop_all_traces(void)
296 struct device_information *dip;
299 for_each_dip(dip, i) {
300 dip->drop_count = get_dropped_count(dip->buts_name);
/*
 * Wait on the thread's trace fd for readable data (POLLIN).
 * The loop repeats until is_done() becomes true.
 */
305 static void wait_for_data(struct thread_information *tip)
307 struct pollfd pfd = { .fd = tip->fd, .events = POLLIN };
311 if (pfd.revents & POLLIN)
313 if (tip->ofile_stdout)
315 } while (!is_done());
/*
 * Read up to len bytes from the thread's trace fd into buf.
 * A read error other than EAGAIN is reported on stderr; the loop repeats
 * until is_done() becomes true.
 */
318 static int read_data(struct thread_information *tip, void *buf, int len)
325 ret = read(tip->fd, buf, len);
331 if (errno != EAGAIN) {
333 fprintf(stderr,"Thread %d failed read of %s\n",
339 } while (!is_done());
/*
 * Take the oldest sub-buffer off the thread's fifo.
 * The fifo is empty when head equals tail; otherwise the entry at head is
 * taken and head advances by one slot, wrapping at FIFO_SIZE.
 * NOTE(review): head/tail are plain ints with no visible locking; presumably
 * one thread queues and another dequeues - confirm ordering guarantees.
 */
344 static inline struct tip_subbuf *subbuf_fifo_dequeue(struct thread_information *tip)
346 const int head = tip->fifo.head;
347 const int next = (head + 1) & (FIFO_SIZE - 1);
349 if (head != tip->fifo.tail) {
350 struct tip_subbuf *ts = tip->fifo.q[head];
353 tip->fifo.head = next;
/*
 * Append ts to the thread's fifo.
 * The fifo is full when advancing tail would make it equal head, so one slot
 * always stays unused; in that case "fifo too small!" is printed to stderr.
 */
360 static inline int subbuf_fifo_queue(struct thread_information *tip,
361 struct tip_subbuf *ts)
363 const int tail = tip->fifo.tail;
364 const int next = (tail + 1) & (FIFO_SIZE - 1);
366 if (next != tip->fifo.head) {
/* store the entry before publishing the new tail */
367 tip->fifo.q[tail] = ts;
369 tip->fifo.tail = next;
373 fprintf(stderr, "fifo too small!\n");
378 * For file output, truncate and mmap the file appropriately
/*
 * Reads one sub-buffer of trace data straight into a writable mapping of the
 * output file, remapping first when the current window has no room for
 * another buf_size bytes.
 */
380 static int mmap_subbuf(struct thread_information *tip)
382 int ofd = fileno(tip->ofile);
386 * extend file, if we have to. use chunks of 16 subbuffers.
388 if (tip->fs_off + buf_size > tip->fs_buf_len) {
/* drop the previous window before mapping the next one */
390 munlock(tip->fs_buf, tip->fs_buf_len);
391 munmap(tip->fs_buf, tip->fs_buf_len);
/* fs_off is fs_size's offset within its page, so the mmap offset below is page aligned */
395 tip->fs_off = tip->fs_size & (page_size - 1);
396 tip->fs_buf_len = (16 * buf_size) - tip->fs_off;
397 tip->fs_max_size += tip->fs_buf_len;
/* grow the file so the whole new window is backed by it */
399 if (ftruncate(ofd, tip->fs_max_size) < 0) {
404 tip->fs_buf = mmap(NULL, tip->fs_buf_len, PROT_WRITE,
405 MAP_SHARED, ofd, tip->fs_size - tip->fs_off);
406 if (tip->fs_buf == MAP_FAILED) {
/* the mlock() result is not checked on this line */
410 mlock(tip->fs_buf, tip->fs_buf_len);
/* read directly into the mapped file at the current window offset */
413 ret = read_data(tip, tip->fs_buf + tip->fs_off, buf_size);
415 tip->data_read += ret;
425 * Use the copy approach for pipes
/*
 * Allocates a tip_subbuf with a buf_size-byte buffer, reads trace data into
 * it and queues it on the thread's fifo.
 * NOTE(review): neither malloc() result is visibly checked before use.
 */
427 static int get_subbuf(struct thread_information *tip)
429 struct tip_subbuf *ts;
432 ts = malloc(sizeof(*ts));
433 ts->buf = malloc(buf_size);
434 ts->max_len = buf_size;
436 ret = read_data(tip, ts->buf, ts->max_len);
439 return subbuf_fifo_queue(tip, ts);
/*
 * Release a thread's resources. The visible part frees the stdio buffer
 * installed with setvbuf() and clears the pointer.
 */
447 static void close_thread(struct thread_information *tip)
453 if (tip->ofile_buffer)
454 free(tip->ofile_buffer);
460 tip->ofile_buffer = NULL;
/*
 * Entry point of a reader thread (started by pthread_create() in
 * start_threads()); arg is the thread's thread_information.
 * It sets CPU affinity to tip->cpu, opens the matching relay trace file and
 * reads sub-buffers from it. For file output it ends by unmapping the window
 * and truncating the file to fs_size.
 */
464 static void *thread_main(void *arg)
466 struct thread_information *tip = arg;
/*
 * NOTE(review): getpid() returns the process id, not this thread's id.
 * With NPTL, sched_setaffinity(getpid(), ...) targets the main thread;
 * pid 0 would mean "calling thread" - confirm the intent.
 */
467 pid_t pid = getpid();
471 CPU_SET((tip->cpu), &cpu_mask);
473 if (sched_setaffinity(pid, sizeof(cpu_mask), &cpu_mask) == -1) {
474 perror("sched_setaffinity");
478 snprintf(tip->fn, sizeof(tip->fn), "%s/block/%s/trace%d",
479 relay_path, tip->device->buts_name, tip->cpu);
480 tip->fd = open(tip->fn, O_RDONLY);
483 fprintf(stderr,"Thread %d failed open of %s\n", tip->cpu,
/* stdout output and file output take different read paths */
489 if (tip->ofile_stdout) {
493 if (mmap_subbuf(tip))
499 * truncate to right size and cleanup mmap
501 if (!tip->ofile_stdout) {
502 int ofd = fileno(tip->ofile);
505 munmap(tip->fs_buf, tip->fs_buf_len);
/* the ftruncate() result is not checked on this line */
507 ftruncate(ofd, tip->fs_size);
/*
 * Write buf_len bytes from buf to the thread's output stream.
 * The data is passed to fwrite() as a single item of buf_len bytes.
 */
514 static int write_data(struct thread_information *tip,
515 void *buf, unsigned int buf_len)
523 ret = fwrite(buf, buf_len, 1, tip->ofile);
533 if (tip->ofile_stdout)
/*
 * Write out the complete trace events contained in ts.
 * Bytes left over from the previous call are merged with ts first. The buffer
 * is then walked event by event (header plus pdu_len payload), the complete
 * events are written with write_data(), and any trailing partial event is
 * kept in tip->leftover_ts for the next call.
 */
539 static int flush_subbuf(struct thread_information *tip, struct tip_subbuf *ts)
541 unsigned int offset = 0;
542 struct blk_io_trace *t;
543 int pdu_len, events = 0;
546 * surplus from last run
548 if (tip->leftover_ts) {
549 struct tip_subbuf *prev_ts = tip->leftover_ts;
/* grow the leftover buffer when the new data does not fit behind it */
551 if (prev_ts->len + ts->len > prev_ts->max_len) {
552 prev_ts->max_len += ts->len;
/* NOTE(review): realloc() result overwrites buf; a failure would lose the old buffer */
553 prev_ts->buf = realloc(prev_ts->buf, prev_ts->max_len);
556 memcpy(prev_ts->buf + prev_ts->len, ts->buf, ts->len);
557 prev_ts->len += ts->len;
563 tip->leftover_ts = NULL;
/* stop once fewer bytes than one trace header remain */
566 while (offset + sizeof(*t) <= ts->len) {
567 t = ts->buf + offset;
/* on a verification failure, write the bytes that precede the bad trace */
569 if (verify_trace(t)) {
570 write_data(tip, ts->buf, offset);
574 pdu_len = t->pdu_len;
/* header is present but its payload is not complete yet */
576 if (offset + sizeof(*t) + pdu_len > ts->len)
579 offset += sizeof(*t) + pdu_len;
580 tip->events_processed++;
581 tip->data_read += sizeof(*t) + pdu_len;
585 if (write_data(tip, ts->buf, offset))
589 * leftover bytes, save them for next time
591 if (offset != ts->len) {
592 tip->leftover_ts = ts;
/* NOTE(review): this moves ts->len bytes from buf + offset; confirm len was reduced by offset first */
594 memmove(ts->buf, ts->buf + offset, ts->len);
/* Dequeue one sub-buffer from the thread's fifo and flush it with flush_subbuf(). */
603 static int write_tip_events(struct thread_information *tip)
605 struct tip_subbuf *ts = subbuf_fifo_dequeue(tip);
608 return flush_subbuf(tip, ts);
614 * scans the tips we know and writes out the subbuffers we accumulate
/*
 * Calls write_tip_events() for every thread of every device. The second pass
 * also counts threads that have not exited and repeats while events were
 * written or threads are still running.
 */
616 static void get_and_write_events(void)
618 struct device_information *dip;
619 struct thread_information *tip;
620 int i, j, events, ret, tips_running;
625 for_each_dip(dip, i) {
626 for_each_tip(dip, tip, j) {
627 ret = write_tip_events(tip);
643 for_each_dip(dip, i) {
644 for_each_tip(dip, tip, j) {
645 ret = write_tip_events(tip);
/* tip->exited is zero while the reader thread is still running */
648 tips_running += !tip->exited;
652 } while (events || tips_running);
/*
 * Block until the reader threads are finished. When output_name is "-" this
 * thread runs get_and_write_events(); otherwise it counts threads whose
 * exited flag is still clear and loops while any remain.
 */
655 static void wait_for_threads(void)
658 * for piped output, poll and fetch data for writeout. for files,
659 * we just wait around for trace threads to exit
661 if (output_name && !strcmp(output_name, "-"))
662 get_and_write_events();
664 struct device_information *dip;
665 struct thread_information *tip;
666 int i, j, tips_running;
673 for_each_tip(dip, tip, j)
674 tips_running += !tip->exited;
675 } while (tips_running);
/*
 * Prepare and launch the reader threads of one device.
 * For each thread: reset counters and fifo, pick the output stream (stdout
 * or a per-CPU "<name>.blktrace.<cpu>" file, optionally under output_dir),
 * install a stdio buffer with setvbuf() and create the thread.
 */
679 static int start_threads(struct device_information *dip)
681 struct thread_information *tip;
/* pipeline is true when output goes to stdout ("-o -") */
683 int j, pipeline = output_name && !strcmp(output_name, "-");
684 int len, mode, vbuf_size;
686 for_each_tip(dip, tip, j) {
689 tip->events_processed = 0;
690 memset(&tip->fifo, 0, sizeof(tip->fifo));
691 tip->leftover_ts = NULL;
694 tip->ofile = fdopen(STDOUT_FILENO, "w");
695 tip->ofile_stdout = 1;
/* NOTE(review): sprintf() into op is unbounded; op's size is not visible here - confirm it fits output_dir plus the file name */
702 len = sprintf(op, "%s/", output_dir);
705 sprintf(op + len, "%s.blktrace.%d", output_name,
708 sprintf(op + len, "%s.blktrace.%d",
709 dip->buts_name, tip->cpu);
711 tip->ofile = fopen(op, "w+");
712 tip->ofile_stdout = 0;
714 vbuf_size = OFILE_BUF;
717 if (tip->ofile == NULL) {
/* NOTE(review): the malloc() result is passed to setvbuf() without a visible NULL check */
722 tip->ofile_buffer = malloc(vbuf_size);
723 if (setvbuf(tip->ofile, tip->ofile_buffer, mode, vbuf_size)) {
729 if (pthread_create(&tip->thread, NULL, thread_main, tip)) {
730 perror("pthread_create");
/* Join every reader thread of one device; the pthread_join() result is discarded. */
739 static void stop_threads(struct device_information *dip)
741 struct thread_information *tip;
745 for_each_tip(dip, tip, i) {
746 (void) pthread_join(tip->thread, (void *) &ret);
/* NOTE(review): presumably calls stop_threads() for every device - confirm against the full body. */
751 static void stop_all_threads(void)
753 struct device_information *dip;
/*
 * Registered with atexit() in main().
 * NOTE(review): presumably calls stop_trace() for every device - confirm
 * against the full body.
 */
760 static void stop_all_tracing(void)
762 struct device_information *dip;
/* Exit path taking a status code; the guard below acts only while trace_stopped is still clear. */
769 static void exit_trace(int status)
771 if (!is_trace_stopped()) {
/*
 * Grow the device_information array by one entry and record path in the new
 * slot. The pointer is stored as-is, not copied, so the caller's string must
 * outlive the array (main() passes optarg / argv entries).
 */
780 static int resize_devices(char *path)
782 int size = (ndevs + 1) * sizeof(struct device_information);
/* NOTE(review): realloc() result overwrites the only pointer to the old array */
784 device_information = realloc(device_information, size);
785 if (!device_information) {
786 fprintf(stderr, "Out of memory, device %s (%d)\n", path, size);
789 device_information[ndevs].path = path;
/* Open every registered device path read-only and non-blocking, storing the fd in dip->fd. */
794 static int open_devices(void)
796 struct device_information *dip;
799 for_each_dip(dip, i) {
800 dip->fd = open(dip->path, O_RDONLY | O_NONBLOCK);
/*
 * Allocate the thread table (ncpus entries per device), start tracing on
 * every device, then start each device's reader threads.
 */
810 static int start_devices(void)
812 struct device_information *dip;
815 size = ncpus * sizeof(struct thread_information);
816 thread_information = malloc(size * ndevs);
817 if (!thread_information) {
818 fprintf(stderr, "Out of memory, threads (%d)\n", size * ndevs);
822 for_each_dip(dip, i) {
823 if (start_trace(dip)) {
825 fprintf(stderr, "Failed to start trace on %s\n",
/* walks only the first i devices, i.e. those handled before the loop above stopped */
832 __for_each_dip(dip, j, i)
838 for_each_dip(dip, i) {
/* device i owns the i-th run of ncpus entries in thread_information */
839 dip->threads = thread_information + (i * ncpus);
840 if (start_threads(dip)) {
841 fprintf(stderr, "Failed to start worker threads\n");
/* again limited to the first i devices */
847 __for_each_dip(dip, j, i)
/*
 * Print per-CPU and per-device totals of processed events, data read (KiB)
 * and dropped events. A hint about the buffer size goes to stderr; the
 * condition guarding it is not shown here.
 */
858 static void show_stats(void)
860 struct device_information *dip;
861 struct thread_information *tip;
862 unsigned long long events_processed, data_read;
863 unsigned long total_drops;
864 int i, j, no_stdout = 0;
/* output_name "-" means trace data itself is going to stdout */
869 if (output_name && !strcmp(output_name, "-"))
875 for_each_dip(dip, i) {
877 printf("Device: %s\n", dip->path);
/* NOTE(review): only events_processed is visibly reset per device; confirm data_read is too */
878 events_processed = 0;
880 for_each_tip(dip, tip, j) {
882 printf(" CPU%3d: %20lu events, %8llu KiB data\n",
883 tip->cpu, tip->events_processed,
/* shifting right by 10 converts bytes to KiB */
884 tip->data_read >> 10);
885 events_processed += tip->events_processed;
886 data_read += tip->data_read;
888 total_drops += dip->drop_count;
890 printf(" Total: %20llu events (dropped %lu), %8llu KiB data\n",
891 events_processed, dip->drop_count,
896 fprintf(stderr, "You have dropped events, consider using a larger buffer size (-b)\n");
/*
 * Help text printed by show_usage() after the program name and version.
 * NOTE(review): the text documents "-v" for version info, but S_OPTS only
 * lists "V"; confirm which spelling is intended and align the two.
 */
899 static char usage_str[] = \
900 "-d <dev> [ -r relay path ] [ -o <output> ] [-k ] [ -w time ]\n" \
901 "[ -a action ] [ -A action mask ] [ -v ]\n\n" \
902 "\t-d Use specified device. May also be given last after options\n" \
903 "\t-r Path to mounted relayfs, defaults to /relay\n" \
904 "\t-o File(s) to send output to\n" \
905 "\t-D Directory to prepend to output file names\n" \
906 "\t-k Kill a running trace\n" \
907 "\t-w Stop after defined time, in seconds\n" \
908 "\t-a Only trace specified actions. See documentation\n" \
909 "\t-A Give trace mask as a single value. See documentation\n" \
910 "\t-b Sub buffer size in KiB\n" \
911 "\t-n Number of sub buffers\n" \
912 "\t-v Print program version info\n\n";
/* Print "Usage: <program> <version> <usage_str>" to stderr. */
914 static void show_usage(char *program)
916 fprintf(stderr, "Usage: %s %s %s",program, blktrace_version, usage_str);
/* Signal handler that main() installs for SIGINT, SIGHUP, SIGTERM and SIGALRM; sig is unused. */
918 static void handle_sigint(__attribute__((__unused__)) int sig)
923 int main(int argc, char *argv[])
925 static char default_relay_path[] = "/relay";
929 int act_mask_tmp = 0;
931 while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) >= 0) {
934 i = find_mask_map(optarg);
936 fprintf(stderr,"Invalid action mask %s\n",
944 if ((sscanf(optarg, "%x", &i) != 1) ||
947 "Invalid set action mask %s/0x%x\n",
955 if (resize_devices(optarg) != 0)
964 output_name = optarg;
967 kill_running_trace = 1;
970 stop_watch = atoi(optarg);
971 if (stop_watch <= 0) {
973 "Invalid stopwatch value (%d secs)\n",
979 printf("%s version %s\n", argv[0], blktrace_version);
982 buf_size = strtoul(optarg, NULL, 10);
983 if (buf_size <= 0 || buf_size > 16*1024) {
985 "Invalid buffer size (%lu)\n",buf_size);
991 buf_nr = strtoul(optarg, NULL, 10);
994 "Invalid buffer nr (%lu)\n", buf_nr);
1002 show_usage(argv[0]);
1007 while (optind < argc) {
1008 if (resize_devices(argv[optind++]) != 0)
1013 show_usage(argv[0]);
1018 relay_path = default_relay_path;
1020 if (act_mask_tmp != 0)
1021 act_mask = act_mask_tmp;
1023 if (statfs(relay_path, &st) < 0) {
1025 fprintf(stderr,"%s does not appear to be a valid path\n",
1028 } else if (st.f_type != (long) RELAYFS_TYPE) {
1029 fprintf(stderr,"%s does not appear to be a relay filesystem\n",
1034 if (open_devices() != 0)
1037 if (kill_running_trace) {
1042 setlocale(LC_NUMERIC, "en_US");
1044 ncpus = sysconf(_SC_NPROCESSORS_ONLN);
1046 fprintf(stderr, "sysconf(_SC_NPROCESSORS_ONLN) failed\n");
1050 page_size = getpagesize();
1052 if (start_devices() != 0)
1055 signal(SIGINT, handle_sigint);
1056 signal(SIGHUP, handle_sigint);
1057 signal(SIGTERM, handle_sigint);
1058 signal(SIGALRM, handle_sigint);
1060 atexit(stop_all_tracing);
1067 if (!is_trace_stopped()) {