/*
 * block queue tracing application
 *
 * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
22 #include <sys/types.h>
29 #include <sys/ioctl.h>
30 #include <sys/param.h>
31 #include <sys/statfs.h>
/* program version string, printed by -V and in the usage banner */
static char blktrace_version[] = "0.99";

/*
 * Default relay sub-buffer size handed to BLKSTARTTRACE (presumably in
 * bytes -- confirm against the ioctl definition).
 * You may want to increase this even more, if you are logging at a high
 * rate and see skipped/missed events
 */
#define BUF_SIZE (512 * 1024)

/* stdio buffer size used for per-CPU output files (see start_threads) */
#define OFILE_BUF (128 * 1024)

/* statfs() f_type magic identifying a mounted relayfs */
#define RELAYFS_TYPE 0xF0B4A981

/*
 * The user-side ring starts at RING_INIT_NR times the kernel buffer size
 * and may grow up to RING_MAX_NR times it (see can_grow_ring()).
 */
#define RING_INIT_NR (2)
#define RING_MAX_NR (16UL)
/* getopt() short-option string; letters appear in the same order as l_opts */
#define S_OPTS "d:a:A:r:o:kw:Vb:n:D:"
/*
 * long-option table for getopt_long().
 * NOTE(review): the .name and .val members of most entries are not visible
 * in this chunk of the file -- only .has_arg lines and the last three
 * names.  Judging by S_OPTS ordering, the entries correspond to
 * -d -a -A -r -o -k -w -V -b -n -D; confirm against the full source.
 */
static struct option l_opts[] = {
		.has_arg = required_argument,	/* -d <dev>, presumably */
		.has_arg = required_argument,	/* -a <action> */
		.has_arg = required_argument,	/* -A <mask> */
		.has_arg = required_argument,	/* -r <relay path> */
		.has_arg = required_argument,	/* -o <output> */
		.has_arg = no_argument,		/* -k */
		.has_arg = required_argument,	/* -w <seconds> */
		.has_arg = no_argument,		/* -V */
		.name = "buffer-size",
		.has_arg = required_argument,	/* -b <KiB> */
		.name = "num-sub-buffers",
		.has_arg = required_argument,	/* -n <count> */
		.name = "output-dir",
		.has_arg = required_argument,	/* -D <dir> */
/*
 * Per-CPU extraction-thread state.
 * NOTE(review): several members referenced elsewhere in the file
 * (cpu, thread, fd, fd_buf, ofile, ofile_buffer) are not visible in
 * this chunk of the source.
 */
struct thread_information {
	unsigned long fd_off;		/* read offset (head) into the user-side ring */
	unsigned long fd_size;		/* bytes currently buffered in the ring */
	unsigned long fd_max_size;	/* ring capacity; doubled on growth, used as a power-of-2 mask */
	char fn[MAXPATHLEN + 64];	/* path of the per-CPU relay trace file */

	pthread_mutex_t *fd_lock;	/* set only when all threads share stdout (see start_threads) */

	unsigned long events_processed;	/* stats counter reported by show_stats() */
	struct device_information *device;	/* owning device */
/*
 * Per-traced-device state.
 * NOTE(review): members path, fd and buts_name are used elsewhere in the
 * file but their declarations are not visible in this chunk.
 */
struct device_information {
	volatile int trace_started;	/* toggled via dip_set_tracing()/dip_tracing() */
	unsigned long drop_count;	/* dropped events, read back from relayfs at stop time */
	struct thread_information *threads;	/* ncpus worth of per-CPU worker state */
/* per-device, per-CPU worker state; laid out device-major (see start_devices) */
static struct thread_information *thread_information;

/* growable array of traced devices, managed by resize_devices() */
static struct device_information *device_information;

/* command line option globals */
static char *relay_path;	/* -r; defaults to /relay */
static char *output_name;	/* -o; "-" means pipeline to stdout */
static char *output_dir;	/* -D; prepended to output file names */
static int act_mask = ~0U;	/* -a/-A action filter; default: everything */
static int kill_running_trace;	/* -k */
static unsigned long buf_size = BUF_SIZE;	/* -b */
static unsigned long buf_nr = BUF_NR;		/* -n */

/*
 * NOTE(review): these flags are shared between the signal handler and the
 * worker threads.  volatile forces re-reads but provides no atomicity or
 * ordering guarantees.
 */
#define is_done() (*(volatile int *)(&done))
static volatile int done;

#define is_trace_stopped() (*(volatile int *)(&trace_stopped))
static volatile int trace_stopped;

#define is_stat_shown() (*(volatile int *)(&stat_shown))
static volatile int stat_shown;

/* serializes output writes when every thread funnels into stdout */
static pthread_mutex_t stdout_mutex = PTHREAD_MUTEX_INITIALIZER;

static void exit_trace(int status);

#define dip_tracing(dip) (*(volatile int *)(&(dip)->trace_started))
#define dip_set_tracing(dip, v) ((dip)->trace_started = (v))

/* iterate the first __e devices / all devices / all CPUs of one device */
#define __for_each_dip(__d, __i, __e) \
	for (__i = 0, __d = device_information; __i < __e; __i++, __d++)

#define for_each_dip(__d, __i) __for_each_dip(__d, __i, ndevs)
#define for_each_tip(__d, __t, __i) \
	for (__i = 0, __t = (__d)->threads; __i < ncpus; __i++, __t++)
197 static int get_dropped_count(const char *buts_name)
200 char tmp[MAXPATHLEN + 64];
202 snprintf(tmp, sizeof(tmp), "%s/block/%s/dropped",
203 relay_path, buts_name);
205 fd = open(tmp, O_RDONLY);
208 * this may be ok, if the kernel doesn't support dropped counts
213 fprintf(stderr, "Couldn't open dropped file %s\n", tmp);
217 if (read(fd, tmp, sizeof(tmp)) < 0) {
228 static int start_trace(struct device_information *dip)
230 struct blk_user_trace_setup buts;
232 memset(&buts, 0, sizeof(buts));
233 buts.buf_size = buf_size;
234 buts.buf_nr = buf_nr;
235 buts.act_mask = act_mask;
237 if (ioctl(dip->fd, BLKSTARTTRACE, &buts) < 0) {
238 perror("BLKSTARTTRACE");
242 memcpy(dip->buts_name, buts.name, sizeof(dip->buts_name));
243 dip_set_tracing(dip, 1);
247 static void stop_trace(struct device_information *dip)
249 if (dip_tracing(dip) || kill_running_trace) {
250 dip_set_tracing(dip, 0);
252 if (ioctl(dip->fd, BLKSTOPTRACE) < 0)
253 perror("BLKSTOPTRACE");
260 static void stop_all_traces(void)
262 struct device_information *dip;
265 for_each_dip(dip, i) {
266 dip->drop_count = get_dropped_count(dip->buts_name);
271 static void wait_for_data(struct thread_information *tip)
273 struct pollfd pfd = { .fd = tip->fd, .events = POLLIN };
277 if (pfd.revents & POLLIN)
/*
 * Low-level read from the non-blocking per-CPU relay fd.
 * NOTE(review): large parts of this function -- the EAGAIN/waited
 * handling and the return logic -- are not visible in this chunk of the
 * file, as is the tail of the parameter list.
 */
static int __read_data(struct thread_information *tip, void *buf, int len,
	int ret = 0, waited = 0;

	while (!is_done() || waited) {
		ret = read(tip->fd, buf, len);
		/* NOTE(review): looks like leftover debug output -- confirm
		 * whether this is meant to be compiled in unconditionally */
		fprintf(stderr, "got %d, block %d\n", ret, block);
		/*
		 * the waited logic is needed, because the relayfs
		 * poll works on a sub-buffer granularity
		 */
		if (errno != EAGAIN) {
			fprintf(stderr,"Thread %d failed read of %s\n",
322 #define can_grow_ring(tip) ((tip)->fd_max_size < RING_MAX_NR * buf_size * buf_nr)
324 static int resize_ringbuffer(struct thread_information *tip)
326 if (!can_grow_ring(tip))
329 tip->fd_buf = realloc(tip->fd_buf, 2 * tip->fd_max_size);
332 * if the ring currently wraps, copy range over
334 if (tip->fd_off + tip->fd_size > tip->fd_max_size) {
335 unsigned long wrap_size = tip->fd_size - (tip->fd_max_size - tip->fd_off);
336 memmove(tip->fd_buf + tip->fd_off, tip->fd_buf, wrap_size);
339 tip->fd_max_size <<= 1;
/*
 * Pull up to len bytes from the relay fd into the ring at the current
 * write position, clamped so a single read never crosses the ring end.
 * NOTE(review): the tail of the parameter list, local declarations and
 * the post-read bookkeeping are not visible in this chunk of the file.
 */
static int __refill_ringbuffer(struct thread_information *tip, int len,
	/* write offset = (head + size) mod capacity; capacity is a power of 2 */
	off = (tip->fd_size + tip->fd_off) & (tip->fd_max_size - 1);
	if (off + len > tip->fd_max_size)
		len = tip->fd_max_size - off;

	ret = __read_data(tip, tip->fd_buf + off, len, block);
364 * keep filling ring until we get a short read
366 static void refill_ringbuffer(struct thread_information *tip, int block)
372 if (len + tip->fd_size > tip->fd_max_size)
373 resize_ringbuffer(tip);
375 ret = __refill_ringbuffer(tip, len, block);
376 } while ((ret = len) && !is_done());
/*
 * Copy len bytes out of the user-side ring into buf, topping the ring
 * up from the relay fd first.  Callers (see extract_data) treat a zero
 * return as success.
 * NOTE(review): the tail of the parameter list, the failure branch of
 * the size check, the non-wrapping copy path and the returns are not
 * visible in this chunk of the file.
 */
static int read_data(struct thread_information *tip, void *buf,
	unsigned int start_size, end_size;

	/* block for data only if the ring doesn't already hold enough */
	refill_ringbuffer(tip, len > tip->fd_size);

	if (len > tip->fd_size)

	/*
	 * see if we wrap the ring
	 */
	if (len > (tip->fd_max_size - tip->fd_off)) {
		start_size = tip->fd_max_size - tip->fd_off;
		end_size = len - start_size;

	/* two-part copy: tail of the ring, then the wrapped head */
	memcpy(buf, tip->fd_buf + tip->fd_off, start_size);
	memcpy(buf + start_size, tip->fd_buf, end_size);

	/* advance the head; capacity is a power of 2, so mask works */
	tip->fd_off = (tip->fd_off + len) & (tip->fd_max_size - 1);
/*
 * Write buf_len bytes from buf to the (possibly shared) output stream.
 *
 * Returns 0 on success, 1 on write error.  The previous retry loop
 * checked for "ret < 0", but fwrite() never returns a negative value --
 * on error it returns 0 and the loop could spin forever.  With a single
 * element of buf_len bytes, fwrite() either writes everything (returns
 * 1) or fails, so no retry loop is needed.
 */
static int write_data(FILE *file, void *buf, unsigned int buf_len)
{
	/* nothing to do for an empty payload */
	if (buf_len == 0)
		return 0;

	if (fwrite(buf, buf_len, 1, file) != 1) {
		perror("fwrite");
		return 1;
	}

	return 0;
}
/*
 * Allocate a buffer and pull nb bytes of trace payload out of the ring.
 *
 * Returns the malloc'ed buffer on success (ownership passes to the
 * caller, who frees it), or NULL on allocation or read failure.
 */
static void *extract_data(struct thread_information *tip, int nb)
{
	unsigned char *buf;

	buf = malloc(nb);
	if (!buf)	/* previously unchecked: read_data would scribble on NULL */
		return NULL;

	/* read_data() returns 0 on success */
	if (!read_data(tip, buf, nb))
		return buf;

	free(buf);
	return NULL;
}
/*
 * trace may start inside 'bit' or may need to be gotten further on
 * (an event can straddle a relay sub-buffer boundary).
 * NOTE(review): loop heads, offset bookkeeping and several returns of
 * this function are not visible in this chunk of the file.
 */
static int get_event_slow(struct thread_information *tip,
			  struct blk_io_trace *bit)
{
	/* scan one 32-bit word at a time, hunting for the magic marker */
	const int inc = sizeof(__u32);
	struct blk_io_trace foo;

	/*
	 * check if trace is inside
	 */
	while (offset < sizeof(*bit)) {
		memcpy(&foo, p, inc);

		if (CHECK_MAGIC(&foo))

	/*
	 * part trace found inside, read the rest
	 */
	if (offset < sizeof(*bit)) {
		int good_bytes = sizeof(*bit) - offset;

		/* slide the partial event to the front of *bit ... */
		memmove(bit, p, good_bytes);
		/* NOTE(review): arithmetic on void * is a GCC extension */
		p = (void *) bit + good_bytes;

		/* ... and read the missing remainder from the ring */
		return read_data(tip, p, offset);

	/*
	 * nothing found, keep looking for start of trace
	 */
		if (read_data(tip, bit, sizeof(bit->magic)))
	} while (!CHECK_MAGIC(bit));

	/*
	 * now get the rest of it
	 */
	if (read_data(tip, p, sizeof(*bit) - inc))
497 * Sometimes relayfs screws us a little, if an event crosses a sub buffer
498 * boundary. So keep looking forward in the trace data until an event
501 static int get_event(struct thread_information *tip, struct blk_io_trace *bit)
504 * optimize for the common fast case, a full trace read that
507 if (read_data(tip, bit, sizeof(*bit)))
510 if (CHECK_MAGIC(bit))
514 * ok that didn't work, the event may start somewhere inside the
517 return get_event_slow(tip, bit);
520 static inline void tip_fd_unlock(struct thread_information *tip)
523 pthread_mutex_unlock(tip->fd_lock);
526 static inline void tip_fd_lock(struct thread_information *tip)
529 pthread_mutex_lock(tip->fd_lock);
/*
 * Release a per-CPU worker's resources.
 * NOTE(review): the fd/ofile/fd_buf teardown lines of this function are
 * not visible in this chunk; only the stdio-buffer cleanup is.
 */
static void close_thread(struct thread_information *tip)
{
	if (tip->ofile_buffer)
		free(tip->ofile_buffer);

	/* clear so a repeated close_thread() call is harmless */
	tip->ofile_buffer = NULL;
/*
 * Per-CPU worker thread: pin to its CPU, open the per-CPU relay trace
 * file, grow a user-side ring, then loop pulling events (header plus
 * optional pdu payload) and writing them to the output stream.
 * NOTE(review): many lines -- error paths, the main loop head, the
 * lock/unlock calls around output -- are not visible in this chunk.
 */
static void *extract(void *arg)
{
	struct thread_information *tip = arg;
	struct blk_io_trace t;
	pid_t pid = getpid();

	CPU_SET((tip->cpu), &cpu_mask);
	/*
	 * NOTE(review): passing getpid() affects the whole process, not
	 * just this pthread -- each thread re-pins the process to its own
	 * CPU in turn.  Confirm this is the intended behaviour.
	 */
	if (sched_setaffinity(pid, sizeof(cpu_mask), &cpu_mask) == -1) {
		perror("sched_setaffinity");

	/* per-CPU relay file: <relay>/block/<buts_name>/trace<cpu> */
	snprintf(tip->fn, sizeof(tip->fn), "%s/block/%s/trace%d",
	relay_path, tip->device->buts_name, tip->cpu);
	tip->fd = open(tip->fn, O_RDONLY | O_NONBLOCK);
		fprintf(stderr,"Thread %d failed open of %s\n", tip->cpu,

	/*
	 * start with a ringbuffer that is twice the size of the kernel side
	 */
	tip->fd_max_size = buf_size * buf_nr * RING_INIT_NR;
	/* NOTE(review): malloc result does not appear to be checked here */
	tip->fd_buf = malloc(tip->fd_max_size);

	if (get_event(tip, &t))

	if (verify_trace(&t))

	pdu_data = extract_data(tip, pdu_len);

	/*
	 * now we have both trace and payload, get a lock on the
	 * output descriptor and send it off
	 */
	if (write_data(tip->ofile, &t, sizeof(t))) {

	if (pdu_data && write_data(tip->ofile, pdu_data, pdu_len)) {

	tip->events_processed++;
/*
 * Spawn one extraction thread per CPU for this device.  In pipeline
 * mode (-o -) every thread shares stdout and the stdout_mutex;
 * otherwise each thread gets its own <name>.blktrace.<cpu> file.
 * NOTE(review): several lines (loop braces, error cleanup, the mode
 * selection for setvbuf) are not visible in this chunk of the file.
 */
static int start_threads(struct device_information *dip)
{
	struct thread_information *tip;
	int j, pipeline = output_name && !strcmp(output_name, "-");

	for_each_tip(dip, tip, j) {
		tip->events_processed = 0;
		/* pipeline mode: all threads write to stdout, serialized */
		tip->ofile = fdopen(STDOUT_FILENO, "w");
		tip->fd_lock = &stdout_mutex;
		len = sprintf(op, "%s/", output_dir);
		sprintf(op + len, "%s.blktrace.%d", output_name,
		sprintf(op + len, "%s.blktrace.%d",
		dip->buts_name, tip->cpu);
		tip->ofile = fopen(op, "w");
		/*
		 * NOTE(review): this overwrites the GLOBAL buf_size, which
		 * extract() also uses to size the user-side ring -- a local
		 * stdio-buffer-size variable looks intended here.  Confirm.
		 */
		buf_size = OFILE_BUF;
		if (tip->ofile == NULL) {
		tip->ofile_buffer = malloc(buf_size);
		if (setvbuf(tip->ofile, tip->ofile_buffer, mode, buf_size)) {
		if (pthread_create(&tip->thread, NULL, extract, tip)) {
			perror("pthread_create");
690 static void stop_threads(struct device_information *dip)
692 struct thread_information *tip;
696 for_each_tip(dip, tip, i)
697 (void) pthread_join(tip->thread, (void *) &ret);
700 static void stop_all_threads(void)
702 struct device_information *dip;
/*
 * Tear tracing down on every device; registered with atexit() in main.
 * NOTE(review): the body beyond the declaration below is not visible in
 * this chunk -- presumably it walks the devices closing threads and
 * calling stop_trace().  Confirm against the full source.
 */
static void stop_all_tracing(void)
{
	struct device_information *dip;
/*
 * Common error-exit path: stop threads and traces exactly once (guarded
 * by the trace_stopped flag).  The remainder of the body -- presumably
 * stats display and exit(status) -- is not visible in this chunk.
 */
static void exit_trace(int status)
{
	if (!is_trace_stopped()) {
729 static int resize_devices(char *path)
731 int size = (ndevs + 1) * sizeof(struct device_information);
733 device_information = realloc(device_information, size);
734 if (!device_information) {
735 fprintf(stderr, "Out of memory, device %s (%d)\n", path, size);
738 device_information[ndevs].path = path;
743 static int open_devices(void)
745 struct device_information *dip;
748 for_each_dip(dip, i) {
749 dip->fd = open(dip->path, O_RDONLY | O_NONBLOCK);
/*
 * Start tracing and worker threads for every device.
 * First pass: BLKSTARTTRACE each device, rolling back the first i
 * devices (the __for_each_dip(dip, j, i) loops) on failure.  Second
 * pass: carve this device's slice out of the device-major
 * thread_information array and spawn its threads, again with rollback.
 * NOTE(review): error-path bodies, loop braces and the returns are not
 * visible in this chunk of the file.
 */
static int start_devices(void)
{
	struct device_information *dip;

	/* one thread_information per CPU per device, device-major layout */
	size = ncpus * sizeof(struct thread_information);
	/* NOTE(review): not zeroed -- confirm cleanup paths never inspect
	 * pointers in slots whose threads were not started */
	thread_information = malloc(size * ndevs);
	if (!thread_information) {
		fprintf(stderr, "Out of memory, threads (%d)\n", size * ndevs);

	for_each_dip(dip, i) {
		if (start_trace(dip)) {
			fprintf(stderr, "Failed to start trace on %s\n",

	/* roll back: stop the traces already started */
	__for_each_dip(dip, j, i)

	for_each_dip(dip, i) {
		dip->threads = thread_information + (i * ncpus);
		if (start_threads(dip)) {
			fprintf(stderr, "Failed to start worker threads\n");

	/* roll back: stop the threads already running */
	__for_each_dip(dip, j, i)
/*
 * Print per-CPU and per-device event counts plus dropped-event totals.
 * Suppressed (no_stdout) when trace data itself is going to stdout.
 * NOTE(review): the stat_shown guard, loop braces and the no_stdout
 * checks are not visible in this chunk of the file.
 */
static void show_stats(void)
{
	int i, j, no_stdout = 0;
	struct device_information *dip;
	struct thread_information *tip;
	unsigned long long events_processed;
	unsigned long total_drops;

	/* pipeline mode: stats would corrupt the binary stream on stdout */
	if (output_name && !strcmp(output_name, "-"))

	for_each_dip(dip, i) {
		printf("Device: %s\n", dip->path);
		events_processed = 0;
		for_each_tip(dip, tip, j) {
			/* NOTE(review): %20ld with an unsigned long argument
			 * -- %20lu would match the type exactly */
			printf(" CPU%3d: %20ld events\n",
			tip->cpu, tip->events_processed);
			events_processed += tip->events_processed;
		total_drops += dip->drop_count;
		printf(" Total: %20lld events (dropped %lu)\n",
		events_processed, dip->drop_count);

	fprintf(stderr, "You have dropped events, consider using a larger buffer size (-b)\n");
/*
 * Option summary printed by show_usage().  The version flag is -V (see
 * S_OPTS, which has no lowercase 'v'); the text previously advertised
 * -v, and the summary line omitted -b/-n/-D and mis-spaced "[-k ]".
 */
static char usage_str[] = \
	"-d <dev> [ -r relay path ] [ -o <output> ] [ -k ] [ -w time ]\n" \
	"[ -a action ] [ -A action mask ] [ -b size ] [ -n number ]\n" \
	"[ -D dir ] [ -V ]\n\n" \
	"\t-d Use specified device. May also be given last after options\n" \
	"\t-r Path to mounted relayfs, defaults to /relay\n" \
	"\t-o File(s) to send output to\n" \
	"\t-D Directory to prepend to output file names\n" \
	"\t-k Kill a running trace\n" \
	"\t-w Stop after defined time, in seconds\n" \
	"\t-a Only trace specified actions. See documentation\n" \
	"\t-A Give trace mask as a single value. See documentation\n" \
	"\t-b Sub buffer size in KiB\n" \
	"\t-n Number of sub buffers\n" \
	"\t-V Print program version info\n\n";
859 static void show_usage(char *program)
861 fprintf(stderr, "Usage: %s %s %s",program, blktrace_version, usage_str);
/*
 * Handler for SIGINT/SIGHUP/SIGTERM/SIGALRM: shut the trace down once.
 * NOTE(review): the body is largely not visible in this chunk; if it
 * calls the thread/trace teardown directly, note that those functions
 * are not async-signal-safe -- worth confirming in the full source.
 */
static void handle_sigint(__attribute__((__unused__)) int sig)
{
	if (!is_trace_stopped()) {
/*
 * Entry point: parse options, validate the relayfs mount, open devices,
 * start tracing and worker threads, install signal handlers, then wait.
 * NOTE(review): many lines (switch/case labels, braces, error exits and
 * the stop-watch/wait logic) are not visible in this chunk of the file.
 */
int main(int argc, char *argv[])
{
	static char default_relay_path[] = "/relay";
	int act_mask_tmp = 0;

	while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) >= 0) {
		/* -a: accumulate named actions into the temporary mask */
		i = find_mask_map(optarg);
			fprintf(stderr,"Invalid action mask %s\n",
		/* -A: take the whole mask as one hex value */
		if ((sscanf(optarg, "%x", &i) != 1) ||
			"Invalid set action mask %s/0x%x\n",
		/* -d: may be given multiple times */
		if (resize_devices(optarg) != 0)
		output_name = optarg;
		kill_running_trace = 1;
		/* NOTE(review): atoi() gives no error reporting; strtol with
		 * endptr/errno checking would reject garbage input */
		stop_watch = atoi(optarg);
		if (stop_watch <= 0) {
			"Invalid stopwatch value (%d secs)\n",
		printf("%s version %s\n", argv[0], blktrace_version);
		/* -b: sub-buffer size; NOTE(review): strtoul errors are not
		 * detected (no endptr/errno check) */
		buf_size = strtoul(optarg, NULL, 10);
		/* NOTE(review): buf_size is unsigned, so "<= 0" only
		 * catches zero -- intentional but reads oddly */
		if (buf_size <= 0 || buf_size > 16*1024) {
			"Invalid buffer size (%lu)\n",buf_size);
		buf_nr = strtoul(optarg, NULL, 10);
			"Invalid buffer nr (%lu)\n", buf_nr);

	/* any remaining arguments are device paths */
	while (optind < argc) {
		if (resize_devices(argv[optind++]) != 0)

	relay_path = default_relay_path;

	if (act_mask_tmp != 0)
		act_mask = act_mask_tmp;

	/* sanity-check that relay_path really is a mounted relayfs */
	if (statfs(relay_path, &st) < 0) {
		fprintf(stderr,"%s does not appear to be a valid path\n",
	/* NOTE(review): f_type width varies by platform; the (long) cast
	 * of the 0xF0B4A981 magic may sign-extend on some ABIs */
	} else if (st.f_type != (long) RELAYFS_TYPE) {
		fprintf(stderr,"%s does not appear to be a relay filesystem\n",

	if (open_devices() != 0)

	/* -k: stop previously running traces and exit */
	if (kill_running_trace) {

	setlocale(LC_NUMERIC, "en_US");

	ncpus = sysconf(_SC_NPROCESSORS_ONLN);
		fprintf(stderr, "sysconf(_SC_NPROCESSORS_ONLN) failed\n");

	if (start_devices() != 0)

	/* all termination signals funnel into the same shutdown path */
	signal(SIGINT, handle_sigint);
	signal(SIGHUP, handle_sigint);
	signal(SIGTERM, handle_sigint);
	signal(SIGALRM, handle_sigint);

	atexit(stop_all_tracing);

	if (!is_trace_stopped()) {