2 * block queue tracing parse application
4 * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 #include <sys/types.h>
36 #define SECONDS(x) ((unsigned long long)(x) / 1000000000)
37 #define NANO_SECONDS(x) ((unsigned long long)(x) % 1000000000)
40 static unsigned long long genesis_time, last_reported_time;
43 unsigned long qreads, qwrites, creads, cwrites, mreads, mwrites;
44 unsigned long ireads, iwrites;
45 unsigned long long qread_kb, qwrite_kb, cread_kb, cwrite_kb;
46 unsigned long long iread_kb, iwrite_kb;
56 struct io_stats io_stats;
59 struct per_process_info {
62 struct io_stats io_stats;
63 struct per_process_info *hash_next, *list_next;
66 #define PPI_HASH_SHIFT (8)
67 static struct per_process_info *ppi_hash[1 << PPI_HASH_SHIFT];
68 static struct per_process_info *ppi_list;
70 #define S_OPTS "i:o:b:st"
71 static struct option l_opts[] = {
91 .name = "per program stats",
110 static struct rb_root rb_sort_root;
111 static struct rb_root rb_track_root;
114 * for sorting the displayed output
117 struct blk_io_trace *bit;
118 struct rb_node rb_node;
122 * for tracking individual ios
125 struct rb_node rb_node;
129 unsigned long long queue_time;
130 unsigned long long dispatch_time;
131 unsigned long long completion_time;
135 static struct per_cpu_info *per_cpu_info;
137 static unsigned long long events;
139 static char *dev, *output_name;
142 static int per_process_stats;
143 static int track_ios;
145 #define RB_BATCH_DEFAULT (1024)
146 static int rb_batch = RB_BATCH_DEFAULT;
148 #define is_done() (*(volatile int *)(&done))
149 static volatile int done;
151 static inline unsigned long hash_long(unsigned long val)
155 #elif __WORDSIZE == 64
156 val *= 0x9e37fffffffc0001UL;
158 #error unknown word size
161 return val >> (__WORDSIZE - PPI_HASH_SHIFT);
164 static inline void add_process_to_hash(struct per_process_info *ppi)
166 const int hash_idx = hash_long(ppi->pid);
168 ppi->hash_next = ppi_hash[hash_idx];
169 ppi_hash[hash_idx] = ppi;
172 static inline void add_process_to_list(struct per_process_info *ppi)
174 ppi->list_next = ppi_list;
178 static struct per_process_info *find_process_by_pid(__u32 pid)
180 const int hash_idx = hash_long(pid);
181 struct per_process_info *ppi;
183 ppi = ppi_hash[hash_idx];
188 ppi = ppi->hash_next;
194 static inline int trace_rb_insert(struct trace *t)
196 struct rb_node **p = &rb_sort_root.rb_node;
197 struct rb_node *parent = NULL;
202 __t = rb_entry(parent, struct trace, rb_node);
204 if (t->bit->sequence < __t->bit->sequence)
206 else if (t->bit->sequence > __t->bit->sequence)
209 fprintf(stderr, "sequence alias!\n");
214 rb_link_node(&t->rb_node, parent, p);
215 rb_insert_color(&t->rb_node, &rb_sort_root);
219 static inline int track_rb_insert(struct io_track *iot)
221 struct rb_node **p = &rb_track_root.rb_node;
222 struct rb_node *parent = NULL;
223 struct io_track *__iot;
228 __iot = rb_entry(parent, struct io_track, rb_node);
230 if (iot->sector < __iot->sector)
232 else if (iot->sector > __iot->sector)
235 fprintf(stderr, "sequence alias!\n");
240 rb_link_node(&iot->rb_node, parent, p);
241 rb_insert_color(&iot->rb_node, &rb_track_root);
245 static struct io_track *__find_track(__u64 sector)
247 struct rb_node **p = &rb_track_root.rb_node;
248 struct rb_node *parent = NULL;
249 struct io_track *__iot;
254 __iot = rb_entry(parent, struct io_track, rb_node);
256 if (sector < __iot->sector)
258 else if (sector > __iot->sector)
267 static struct io_track *find_track(__u64 sector)
269 struct io_track *iot = __find_track(sector);
271 iot = __find_track(sector);
273 iot = malloc(sizeof(*iot));
274 iot->sector = sector;
275 track_rb_insert(iot);
281 static void log_track_merge(struct blk_io_trace *t)
283 struct io_track *iot;
287 if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
290 iot = __find_track(t->sector - (t->bytes >> 10));
292 fprintf(stderr, "Trying to merge on non-existing request\n");
296 rb_erase(&iot->rb_node, &rb_track_root);
297 iot->sector -= t->bytes >> 10;
298 track_rb_insert(iot);
301 static void log_track_queue(struct blk_io_trace *t)
303 struct io_track *iot;
308 iot = find_track(t->sector);
309 iot->queue_time = t->time;
313 * return time between queue and issue
315 static unsigned long long log_track_issue(struct blk_io_trace *t)
317 struct io_track *iot;
321 if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
324 iot = __find_track(t->sector);
326 fprintf(stderr, "Trying to issue on non-existing request\n");
330 iot->dispatch_time = t->time;
331 return iot->dispatch_time - iot->queue_time;
335 * return time between dispatch and complete
337 static unsigned long long log_track_complete(struct blk_io_trace *t)
339 unsigned long long elapsed;
340 struct io_track *iot;
344 if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
347 iot = __find_track(t->sector);
349 fprintf(stderr, "Trying to dispatch on non-existing request\n");
353 iot->completion_time = t->time;
354 elapsed = iot->completion_time - iot->dispatch_time;
357 * kill the trace, we don't need it after completion
359 rb_erase(&iot->rb_node, &rb_track_root);
366 static struct io_stats *find_process_io_stats(__u32 pid, char *name)
368 struct per_process_info *ppi = find_process_by_pid(pid);
371 ppi = malloc(sizeof(*ppi));
372 memset(ppi, 0, sizeof(*ppi));
373 strncpy(ppi->name, name, sizeof(ppi->name));
375 add_process_to_hash(ppi);
376 add_process_to_list(ppi);
379 return &ppi->io_stats;
382 static void resize_cpu_info(int cpuid)
384 int new_space, new_max = cpuid + 1;
387 per_cpu_info = realloc(per_cpu_info, new_max * sizeof(*per_cpu_info));
389 fprintf(stderr, "Cannot allocate CPU info -- %d x %d bytes\n",
390 new_max, (int) sizeof(*per_cpu_info));
394 new_start = (char *)per_cpu_info + (max_cpus * sizeof(*per_cpu_info));
395 new_space = (new_max - max_cpus) * sizeof(*per_cpu_info);
396 memset(new_start, 0, new_space);
400 static struct per_cpu_info *get_cpu_info(int cpu)
402 struct per_cpu_info *pci;
405 resize_cpu_info(cpu);
408 * ->cpu might already be set, but just set it unconditionally
410 pci = &per_cpu_info[cpu];
416 static inline void check_time(struct blk_io_trace *bit)
418 unsigned long long this = bit->time;
419 unsigned long long last = last_reported_time;
421 backwards = (this < last) ? 'B' : ' ';
422 last_reported_time = this;
425 static inline void __account_m(struct io_stats *ios, struct blk_io_trace *t,
430 ios->qwrite_kb += t->bytes >> 10;
433 ios->qread_kb += t->bytes >> 10;
437 static inline void account_m(struct blk_io_trace *t, struct per_cpu_info *pci,
440 __account_m(&pci->io_stats, t, rw);
442 if (per_process_stats) {
443 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
445 __account_m(ios, t, rw);
449 static inline void __account_q(struct io_stats *ios, struct blk_io_trace *t,
454 ios->qwrite_kb += t->bytes >> 10;
457 ios->qread_kb += t->bytes >> 10;
461 static inline void account_q(struct blk_io_trace *t, struct per_cpu_info *pci,
464 __account_q(&pci->io_stats, t, rw);
466 if (per_process_stats) {
467 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
469 __account_q(ios, t, rw);
473 static inline void __account_c(struct io_stats *ios, int rw, unsigned int bytes)
477 ios->cwrite_kb += bytes >> 10;
480 ios->cread_kb += bytes >> 10;
484 static inline void account_c(struct blk_io_trace *t, struct per_cpu_info *pci,
487 __account_c(&pci->io_stats, rw, bytes);
489 if (per_process_stats) {
490 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
492 __account_c(ios, rw, bytes);
496 static inline void __account_i(struct io_stats *ios, int rw, unsigned int bytes)
500 ios->iwrite_kb += bytes >> 10;
503 ios->iread_kb += bytes >> 10;
507 static inline void account_i(struct blk_io_trace *t, struct per_cpu_info *pci,
510 __account_i(&pci->io_stats, rw, t->bytes);
512 if (per_process_stats) {
513 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
515 __account_i(ios, rw, t->bytes);
519 static void output(struct per_cpu_info *pci, char *s)
521 fprintf(ofp, "%s", s);
524 static char hstring[256];
525 static char tstring[256];
527 static inline char *setup_header(struct per_cpu_info *pci,
528 struct blk_io_trace *t, char act)
530 int w = t->action & BLK_TC_ACT(BLK_TC_WRITE);
531 int b = t->action & BLK_TC_ACT(BLK_TC_BARRIER);
532 int s = t->action & BLK_TC_ACT(BLK_TC_SYNC);
547 sprintf(hstring, "%2d %15ld %5Lu.%09Lu %5u %c %3s",
549 (unsigned long)t->sequence, SECONDS(t->time),
550 NANO_SECONDS(t->time), t->pid, act, rwbs);
555 static void log_complete(struct per_cpu_info *pci, struct blk_io_trace *t,
558 unsigned long long elapsed = log_track_complete(t);
560 if (elapsed != -1ULL) {
561 double usec = (double) elapsed / 1000;
563 sprintf(tstring,"%s %Lu + %u (%4.2f) [%d]\n",
564 setup_header(pci, t, act),
565 (unsigned long long)t->sector, t->bytes >> 9,
568 sprintf(tstring,"%s %Lu + %u [%d]\n", setup_header(pci, t, act),
569 (unsigned long long)t->sector, t->bytes >> 9, t->error);
572 output(pci, tstring);
575 static void log_queue(struct per_cpu_info *pci, struct blk_io_trace *t,
580 sprintf(tstring,"%s %Lu + %u [%s]\n", setup_header(pci, t, act),
581 (unsigned long long)t->sector, t->bytes >> 9, t->comm);
582 output(pci, tstring);
585 static void log_issue(struct per_cpu_info *pci, struct blk_io_trace *t,
588 unsigned long long elapsed = log_track_issue(t);
590 if (elapsed != -1ULL) {
591 double usec = (double) elapsed / 1000;
593 sprintf(tstring,"%s %Lu + %u (%4.2f) [%s]\n",
594 setup_header(pci, t, act),
595 (unsigned long long)t->sector, t->bytes >> 9,
598 sprintf(tstring,"%s %Lu + %u [%s]\n", setup_header(pci, t, act),
599 (unsigned long long)t->sector, t->bytes >> 9, t->comm);
602 output(pci, tstring);
605 static void log_merge(struct per_cpu_info *pci, struct blk_io_trace *t,
610 sprintf(tstring,"%s %Lu + %u [%s]\n", setup_header(pci, t, act),
611 (unsigned long long)t->sector, t->bytes >> 9, t->comm);
612 output(pci, tstring);
615 static void log_generic(struct per_cpu_info *pci, struct blk_io_trace *t,
618 sprintf(tstring,"%s %Lu + %u [%s]\n", setup_header(pci, t, act),
619 (unsigned long long)t->sector, t->bytes >> 9, t->comm);
620 output(pci, tstring);
623 static int log_pc(struct per_cpu_info *pci, struct blk_io_trace *t, char act)
628 sprintf(tstring,"%s ", setup_header(pci, t, act));
629 output(pci, tstring);
631 buf = (unsigned char *) t + sizeof(*t);
632 for (i = 0; i < t->pdu_len; i++) {
633 sprintf(tstring,"%02x ", buf[i]);
634 output(pci, tstring);
638 sprintf(tstring,"[%d]\n", t->error);
639 output(pci, tstring);
641 sprintf(tstring,"[%s]\n", t->comm);
642 output(pci, tstring);
647 static int dump_trace_pc(struct blk_io_trace *t, struct per_cpu_info *pci)
651 switch (t->action & 0xffff) {
653 log_generic(pci, t, 'Q');
656 log_generic(pci, t, 'G');
658 case __BLK_TA_SLEEPRQ:
659 log_generic(pci, t, 'S');
661 case __BLK_TA_REQUEUE:
662 log_generic(pci, t, 'R');
665 ret = log_pc(pci, t, 'D');
667 case __BLK_TA_COMPLETE:
671 fprintf(stderr, "Bad pc action %x\n", t->action);
679 static void dump_trace_fs(struct blk_io_trace *t, struct per_cpu_info *pci)
681 int w = t->action & BLK_TC_ACT(BLK_TC_WRITE);
682 int act = t->action & 0xffff;
686 account_q(t, pci, w);
687 log_queue(pci, t, 'Q');
689 case __BLK_TA_BACKMERGE:
690 account_m(t, pci, w);
691 log_merge(pci, t, 'M');
693 case __BLK_TA_FRONTMERGE:
694 account_m(t, pci, w);
695 log_merge(pci, t, 'F');
698 log_generic(pci, t, 'G');
700 case __BLK_TA_SLEEPRQ:
701 log_generic(pci, t, 'S');
703 case __BLK_TA_REQUEUE:
704 account_c(t, pci, w, -t->bytes);
705 log_queue(pci, t, 'R');
708 account_i(t, pci, w);
709 log_issue(pci, t, 'D');
711 case __BLK_TA_COMPLETE:
712 account_c(t, pci, w, t->bytes);
713 log_complete(pci, t, 'C');
716 fprintf(stderr, "Bad fs action %x\n", t->action);
721 static int dump_trace(struct blk_io_trace *t, struct per_cpu_info *pci)
725 if (t->action & BLK_TC_ACT(BLK_TC_PC))
726 ret = dump_trace_pc(t, pci);
728 dump_trace_fs(t, pci);
734 static void dump_io_stats(struct io_stats *ios, char *msg)
736 fprintf(ofp, "%s\n", msg);
738 fprintf(ofp, " Reads Queued: %'8lu, %'8LuKiB\t", ios->qreads, ios->qread_kb);
739 fprintf(ofp, " Writes Queued: %'8lu, %'8LuKiB\n", ios->qwrites,ios->qwrite_kb);
741 fprintf(ofp, " Read Dispatches: %'8lu, %'8LuKiB\t", ios->ireads, ios->iread_kb);
742 fprintf(ofp, " Write Dispatches: %'8lu, %'8LuKiB\n", ios->iwrites,ios->iwrite_kb);
743 fprintf(ofp, " Reads Completed: %'8lu, %'8LuKiB\t", ios->creads, ios->cread_kb);
744 fprintf(ofp, " Writes Completed: %'8lu, %'8LuKiB\n", ios->cwrites,ios->cwrite_kb);
745 fprintf(ofp, " Read Merges: %'8lu%8c\t", ios->mreads, ' ');
747 fprintf(ofp, " Write Merges: %'8lu\n", ios->mwrites);
750 static void show_process_stats(void)
752 struct per_process_info *ppi;
756 dump_io_stats(&ppi->io_stats, ppi->name);
757 ppi = ppi->list_next;
763 static void show_cpu_stats(void)
765 struct per_cpu_info foo, *pci;
766 struct io_stats *ios;
767 int i, pci_events = 0;
769 memset(&foo, 0, sizeof(foo));
771 for (i = 0; i < max_cpus; i++) {
774 pci = &per_cpu_info[i];
775 ios = &pci->io_stats;
780 foo.io_stats.qreads += ios->qreads;
781 foo.io_stats.qwrites += ios->qwrites;
782 foo.io_stats.creads += ios->creads;
783 foo.io_stats.cwrites += ios->cwrites;
784 foo.io_stats.mreads += ios->mreads;
785 foo.io_stats.mwrites += ios->mwrites;
786 foo.io_stats.ireads += ios->ireads;
787 foo.io_stats.iwrites += ios->iwrites;
788 foo.io_stats.qread_kb += ios->qread_kb;
789 foo.io_stats.qwrite_kb += ios->qwrite_kb;
790 foo.io_stats.cread_kb += ios->cread_kb;
791 foo.io_stats.cwrite_kb += ios->cwrite_kb;
792 foo.io_stats.iread_kb += ios->iread_kb;
793 foo.io_stats.iwrite_kb += ios->iwrite_kb;
795 snprintf(cpu, sizeof(cpu) - 1, "CPU%d:", i);
796 dump_io_stats(ios, cpu);
800 if (pci_events > 1) {
802 dump_io_stats(&foo.io_stats, "Total:");
805 fprintf(ofp, "\nEvents: %'Lu\n", events);
808 #define min(a, b) ((a) < (b) ? (a) : (b))
810 static struct blk_io_trace *find_trace(void *p, unsigned long offset, int nr)
812 unsigned long max_offset = min(offset,nr * sizeof(struct blk_io_trace));
814 struct blk_io_trace *bit;
817 for (off = 0; off < max_offset; off++) {
820 magic = be32_to_cpu(bit->magic);
821 if ((magic & 0xffffff00) == BLK_IO_TRACE_MAGIC)
828 static int sort_entries(void *traces, unsigned long offset, int nr)
830 struct per_cpu_info *pci;
831 struct blk_io_trace *bit;
833 void *start = traces;
836 while (traces - start <= offset - sizeof(*bit)) {
840 bit = find_trace(traces, offset - (traces - start), nr);
844 t = malloc(sizeof(*t));
846 memset(&t->rb_node, 0, sizeof(t->rb_node));
850 if (verify_trace(bit))
853 pci = get_cpu_info(bit->cpu);
856 if (trace_rb_insert(t))
859 traces += sizeof(*bit) + bit->pdu_len;
867 static void free_entries_rb(void)
871 while ((n = rb_first(&rb_sort_root)) != NULL) {
872 struct trace *t = rb_entry(n, struct trace, rb_node);
874 rb_erase(&t->rb_node, &rb_sort_root);
879 static void show_entries_rb(void)
881 struct blk_io_trace *bit;
886 n = rb_first(&rb_sort_root);
891 t = rb_entry(n, struct trace, rb_node);
895 if (cpu > max_cpus) {
896 fprintf(stderr, "CPU number too large (%d)\n", cpu);
900 if (genesis_time == 0)
901 genesis_time = bit->time;
902 bit->time -= genesis_time;
906 if (dump_trace(bit, &per_cpu_info[cpu]))
909 } while ((n = rb_next(n)) != NULL);
912 static int read_data(int fd, void *buffer, int bytes, int block)
914 int ret, bytes_left, fl;
917 fl = fcntl(fd, F_GETFL);
920 fcntl(fd, F_SETFL, fl | O_NONBLOCK);
922 fcntl(fd, F_SETFL, fl & ~O_NONBLOCK);
926 while (bytes_left > 0) {
927 ret = read(fd, p, bytes_left);
943 static int do_file(void)
947 for (i = 0, nfiles = 0;; i++, nfiles++) {
948 struct per_cpu_info *pci;
952 pci = get_cpu_info(i);
954 snprintf(pci->fname, sizeof(pci->fname)-1,"%s_out.%d", dev, i);
955 if (stat(pci->fname, &st) < 0)
960 printf("Processing %s\n", pci->fname);
962 tb = malloc(st.st_size);
964 pci->fd = open(pci->fname, O_RDONLY);
970 if (read_data(pci->fd, tb, st.st_size, 1))
973 if (sort_entries(tb, st.st_size, ~0U) == -1)
977 printf("\t%2d %10s %15d\n", i, pci->fname, pci->nelems);
982 fprintf(stderr, "No files found\n");
990 static void resize_buffer(void **buffer, long *size, long offset)
992 long old_size = *size;
995 *buffer = realloc(*buffer, *size);
996 memset(*buffer + offset, 0, *size - old_size);
999 static int read_sort_events(int fd, void **buffer)
1001 long offset, max_offset;
1004 max_offset = 128 * sizeof(struct blk_io_trace);
1005 *buffer = malloc(max_offset);
1010 struct blk_io_trace *t;
1013 if (max_offset - offset < sizeof(*t))
1014 resize_buffer(buffer, &max_offset, offset);
1016 if (read_data(fd, *buffer + offset, sizeof(*t), !events)) {
1024 t = *buffer + offset;
1025 offset += sizeof(*t);
1027 pdu_len = be16_to_cpu(t->pdu_len);
1029 if (max_offset - offset < pdu_len)
1030 resize_buffer(buffer, &max_offset, offset);
1032 if (read_data(fd, *buffer + offset, pdu_len, 1))
1039 } while (!is_done() && events < rb_batch);
1044 static int do_stdin(void)
1049 fd = dup(STDIN_FILENO);
1053 events = read_sort_events(fd, &ptr);
1057 if (sort_entries(ptr, ~0UL, events) == -1)
1069 static void flush_output(void)
1074 static void handle_sigint(int sig)
1080 static void usage(char *prog)
1082 fprintf(stderr, "Usage: %s -i <name> [-o <output>][-s]\n", prog);
1085 int main(int argc, char *argv[])
1090 while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) {
1096 output_name = optarg;
1099 rb_batch = atoi(optarg);
1101 rb_batch = RB_BATCH_DEFAULT;
1104 per_process_stats = 1;
1120 memset(&rb_sort_root, 0, sizeof(rb_sort_root));
1121 memset(&rb_track_root, 0, sizeof(rb_track_root));
1123 signal(SIGINT, handle_sigint);
1124 signal(SIGHUP, handle_sigint);
1125 signal(SIGTERM, handle_sigint);
1127 setlocale(LC_NUMERIC, "en_US");
1130 ofp = fdopen(STDOUT_FILENO, "w");
1135 snprintf(ofname, sizeof(ofname) - 1, "%s.log", output_name);
1136 ofp = fopen(ofname, "w");
1145 ofp_buffer = malloc(4096);
1146 if (setvbuf(ofp, ofp_buffer, mode, 4096)) {
1151 if (!strcmp(dev, "-"))
1156 if (per_process_stats)
1157 show_process_stats();