2 * block queue tracing parse application
4 * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 #include <sys/types.h>
/* Tool version reported by -v and in usage output. */
38 static char blkparse_version[] = "0.90";
/*
 * NOTE(review): the following members appear to belong to a per-device
 * state struct (per_dev_info) whose opening declaration is not visible
 * here — confirm against the full file.
 */
45 	unsigned long long events;
46 	unsigned long long last_reported_time;
47 	unsigned long long last_read_time;
48 	struct io_stats io_stats;
49 	unsigned long last_sequence;
/* rbtree of recently-output traces, used to resolve sequence gaps */
52 	struct rb_root rb_last;
53 	unsigned long rb_last_entries;
/* dynamically grown array of per-CPU info for this device */
57 	struct per_cpu_info *cpus;
/*
 * Per-process accounting: kept both in a hash (by pid or name) for
 * lookup and in a singly linked list for ordered reporting.
 */
60 struct per_process_info {
63 	struct io_stats io_stats;
64 	struct per_process_info *hash_next, *list_next;
/* worst-case latencies, indexed [0]=read, [1]=write */
70 	unsigned long long longest_allocation_wait[2];
71 	unsigned long long longest_dispatch_wait[2];
72 	unsigned long long longest_completion_wait[2];
/* process hash: 256 buckets, mask derived from the shift */
75 #define PPI_HASH_SHIFT	(8)
76 #define PPI_HASH_SIZE	(1 << PPI_HASH_SHIFT)
77 #define PPI_HASH_MASK	(PPI_HASH_SIZE - 1)
78 static struct per_process_info *ppi_hash_table[PPI_HASH_SIZE];
79 static struct per_process_info *ppi_list;
80 static int ppi_list_entries;
/* Short-option string for getopt_long(); mirrors the long options below. */
82 #define S_OPTS	"i:o:b:stqw:f:F:vn"
/*
 * NOTE(review): only the .has_arg (and a few .name) initializers of this
 * option table are visible here; the .name/.val fields for most entries
 * are elided — verify against the complete table before editing.
 */
83 static struct option l_opts[] = {
86 		.has_arg = required_argument,
92 		.has_arg = required_argument,
98 		.has_arg = required_argument,
103 		.name = "per program stats",
104 		.has_arg = no_argument,
110 		.has_arg = no_argument,
116 		.has_arg = no_argument,
122 		.has_arg = required_argument,
128 		.has_arg = required_argument,
133 		.name = "format-spec",
134 		.has_arg = required_argument,
139 		.name = "hash by name",
140 		.has_arg = no_argument,
146 		.has_arg = no_argument,
153  * for sorting the displayed output
/* A parsed event wrapped with an rbtree node so it can be time-sorted. */
156 	struct blk_io_trace *bit;
157 	struct rb_node rb_node;
/* global sort tree of pending traces plus its entry count */
161 static struct rb_root rb_sort_root;
162 static unsigned long rb_sort_entries;
/* tree of in-flight ios, keyed by (device, sector) */
164 static struct rb_root rb_track_root;
/* raw traces read from input, not yet inserted into the sort tree */
166 static struct trace *trace_list;
/* free lists for recycling trace/blk_io_trace allocations */
171 static struct blk_io_trace *bit_alloc_list;
172 static struct trace *t_alloc_list;
175  * for tracking individual ios
/* io_track: timestamps of each lifecycle stage of a single request */
178 	struct rb_node rb_node;
184 	unsigned long long allocation_time;
185 	unsigned long long queue_time;
186 	unsigned long long dispatch_time;
187 	unsigned long long completion_time;
191 static struct per_dev_info *devices;
192 static char *get_dev_name(struct per_dev_info *, char *, int);
/* output file name from -o; NULL means stdout */
195 static char *output_name;
/* earliest timestamp seen; all displayed times are relative to this */
197 static unsigned long long genesis_time;
198 static unsigned long long last_allowed_time;
199 static unsigned long long stopwatch_start;	/* start from zero by default */
200 static unsigned long long stopwatch_end = ULONG_LONG_MAX;	/* "infinity" */
/* command line toggles: -s, -t, -n */
202 static int per_process_stats;
203 static int track_ios;
204 static int ppi_hash_by_pid = 1;
/* how many events to read per batch before sorting (-b) */
206 #define RB_BATCH_DEFAULT	(512)
207 static int rb_batch = RB_BATCH_DEFAULT;
/* volatile read so the main loop sees signal-handler updates */
211 #define is_done()	(*(volatile int *)(&done))
212 static volatile int done;
/* fixed salt for the Jenkins hash below */
214 #define JHASH_RANDOM	(0x3af5f2ee)
/* Hash a pid into a PPI bucket index. */
216 static inline int ppi_hash_pid(__u32 pid)
218 	return jhash_1word(pid, JHASH_RANDOM) & PPI_HASH_MASK;
/*
 * Hash a process name into a bucket index.
 * NOTE(review): hashes a fixed 16 bytes (the comm field width), which may
 * read past a shorter NUL-terminated name's logical end — relies on the
 * comm buffer always being 16 bytes.
 */
221 static inline int ppi_hash_name(const char *name)
223 	return jhash(name, 16, JHASH_RANDOM) & PPI_HASH_MASK;
/* Dispatch to pid- or name-based hashing depending on -n. */
226 static inline int ppi_hash(struct per_process_info *ppi)
229 		return ppi_hash_pid(ppi->pid);
231 	return ppi_hash_name(ppi->name);
/* Push ppi onto the head of its hash chain. */
234 static inline void add_process_to_hash(struct per_process_info *ppi)
236 	const int hash_idx = ppi_hash(ppi);
238 	ppi->hash_next = ppi_hash_table[hash_idx];
239 	ppi_hash_table[hash_idx] = ppi;
/* Push ppi onto the head of the global report list. */
242 static inline void add_process_to_list(struct per_process_info *ppi)
244 	ppi->list_next = ppi_list;
/* Walk the name-hash chain looking for an exact comm match. */
249 static struct per_process_info *find_process_by_name(char *name)
251 	const int hash_idx = ppi_hash_name(name);
252 	struct per_process_info *ppi;
254 	ppi = ppi_hash_table[hash_idx];
256 		if (!strcmp(ppi->name, name))
259 		ppi = ppi->hash_next;
/* Walk the pid-hash chain; the pid comparison line is elided here. */
265 static struct per_process_info *find_process_by_pid(__u32 pid)
267 	const int hash_idx = ppi_hash_pid(pid);
268 	struct per_process_info *ppi;
270 	ppi = ppi_hash_table[hash_idx];
275 		ppi = ppi->hash_next;
/*
 * Look up a process by pid or name per ppi_hash_by_pid; when hashing by
 * name, flag entries shared by multiple pids so reports can show "...".
 */
281 static struct per_process_info *find_process(__u32 pid, char *name)
283 	struct per_process_info *ppi;
286 		return find_process_by_pid(pid);
288 	ppi = find_process_by_name(name);
289 	if (ppi && ppi->pid != pid)
290 		ppi->more_than_one = 1;
/*
 * Insert t into an rbtree ordered by (time, device, sequence).
 * A full key collision is a sequence alias and is reported, not inserted.
 */
295 static inline int trace_rb_insert(struct trace *t, struct rb_root *root)
297 	struct rb_node **p = &root->rb_node;
298 	struct rb_node *parent = NULL;
304 		__t = rb_entry(parent, struct trace, rb_node);
306 		if (t->bit->time < __t->bit->time)
308 		else if (t->bit->time > __t->bit->time)
310 		else if (t->bit->device < __t->bit->device)
312 		else if (t->bit->device > __t->bit->device)
314 		else if (t->bit->sequence < __t->bit->sequence)
316 		else if (t->bit->sequence > __t->bit->sequence)
318 		else if (t->bit->device == __t->bit->device) {
320 				"sequence alias (%d) on device %d,%d!\n",
322 				MAJOR(t->bit->device), MINOR(t->bit->device));
327 	rb_link_node(&t->rb_node, parent, p);
328 	rb_insert_color(&t->rb_node, root);
/* Insert into the global sort tree, bumping the entry count on success. */
332 static inline int trace_rb_insert_sort(struct trace *t)
334 	if (!trace_rb_insert(t, &rb_sort_root)) {
/* Insert into a device's "recently output" tree for sequence back-checks. */
342 static inline int trace_rb_insert_last(struct per_dev_info *pdi,struct trace *t)
344 	if (!trace_rb_insert(t, &pdi->rb_last)) {
345 		pdi->rb_last_entries++;
/*
 * Find a trace by (device, sequence). The tree is keyed primarily by time,
 * so an exact descend can land near—but not on—the target; see the
 * compensation scan below.
 */
352 static struct trace *trace_rb_find(dev_t device, unsigned long sequence,
353 				   struct rb_root *root, int order)
355 	struct rb_node *n = root->rb_node;
356 	struct rb_node *prev = NULL;
360 		__t = rb_entry(n, struct trace, rb_node);
363 		if (device < __t->bit->device)
365 		else if (device > __t->bit->device)
367 		else if (sequence < __t->bit->sequence)
369 		else if (sequence > __t->bit->sequence)
376 	 * hack - the list may not be sequence ordered because some
377 	 * events don't have sequence and time matched. so we end up
378 	 * being a little off in the rb lookup here, because we don't
379 	 * know the time we are looking for. compensate by browsing
380 	 * a little ahead from the last entry to find the match
385 		while (((n = rb_next(prev)) != NULL) && max--) {
386 			__t = rb_entry(n, struct trace, rb_node);
388 			if (__t->bit->device == device &&
389 			    __t->bit->sequence == sequence)
/* Convenience wrappers over trace_rb_find for the two trees. */
399 static inline struct trace *trace_rb_find_sort(dev_t dev, unsigned long seq)
401 	return trace_rb_find(dev, seq, &rb_sort_root, 1);
404 static inline struct trace *trace_rb_find_last(struct per_dev_info *pdi,
407 	return trace_rb_find(pdi->id, seq, &pdi->rb_last, 0);
/*
 * Insert an in-flight io into the tracking tree, keyed (device, sector).
 * A duplicate key is a sector alias: reported to stderr, not inserted.
 */
410 static inline int track_rb_insert(struct io_track *iot)
412 	struct rb_node **p = &rb_track_root.rb_node;
413 	struct rb_node *parent = NULL;
414 	struct io_track *__iot;
418 		__iot = rb_entry(parent, struct io_track, rb_node);
420 		if (iot->device < __iot->device)
422 		else if (iot->device > __iot->device)
424 		else if (iot->sector < __iot->sector)
426 		else if (iot->sector > __iot->sector)
430 			"sector alias (%Lu) on device %d,%d!\n",
431 				(unsigned long long) iot->sector,
432 				MAJOR(iot->device), MINOR(iot->device));
437 	rb_link_node(&iot->rb_node, parent, p);
438 	rb_insert_color(&iot->rb_node, &rb_track_root);
/* Standard rbtree lookup of an io_track by (device, sector). */
442 static struct io_track *__find_track(dev_t device, __u64 sector)
444 	struct rb_node *n = rb_track_root.rb_node;
445 	struct io_track *__iot;
448 		__iot = rb_entry(n, struct io_track, rb_node);
450 		if (device < __iot->device)
452 		else if (device > __iot->device)
454 		else if (sector < __iot->sector)
456 		else if (sector > __iot->sector)
/*
 * Find-or-create an io_track entry.
 * NOTE(review): the visible malloc result is used without a NULL check
 * on the elided lines — confirm OOM handling in the full source.
 */
465 static struct io_track *find_track(__u32 pid, char *comm, dev_t device,
468 	struct io_track *iot;
470 	iot = __find_track(device, sector);
472 		iot = malloc(sizeof(*iot));
474 		memcpy(iot->comm, comm, sizeof(iot->comm));
475 		iot->device = device;
476 		iot->sector = sector;
477 		track_rb_insert(iot);
/*
 * On a front merge the tracked io's starting sector moves backwards by the
 * merged size; re-key the tree entry (erase, adjust sector, re-insert).
 * t->bytes >> 9 converts byte count to 512-byte sectors.
 */
483 static void log_track_frontmerge(struct blk_io_trace *t)
485 	struct io_track *iot;
490 	iot = __find_track(t->device, t->sector + (t->bytes >> 9));
492 			fprintf(stderr, "merge not found for (%d,%d): %llu\n",
493 				MAJOR(t->device), MINOR(t->device),
494 				t->sector + (t->bytes >> 9));
498 	rb_erase(&iot->rb_node, &rb_track_root);
499 	iot->sector -= t->bytes >> 9;
500 	track_rb_insert(iot);
/* Record request-allocation time for this io. */
503 static void log_track_getrq(struct blk_io_trace *t)
505 	struct io_track *iot;
510 	iot = find_track(t->pid, t->comm, t->device, t->sector);
511 	iot->allocation_time = t->time;
515  * return time between rq allocation and insertion
517 static unsigned long long log_track_insert(struct blk_io_trace *t)
519 	unsigned long long elapsed;
520 	struct io_track *iot;
525 	iot = find_track(t->pid, t->comm, t->device, t->sector);
526 	iot->queue_time = t->time;
/* no preceding getrq seen for this io — no allocation wait to report */
528 	if (!iot->allocation_time)
531 	elapsed = iot->queue_time - iot->allocation_time;
533 	if (per_process_stats) {
534 		struct per_process_info *ppi = find_process(iot->pid,iot->comm);
/* w: 1 for writes, 0 for reads — indexes the wait arrays */
535 		int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
537 		if (ppi && elapsed > ppi->longest_allocation_wait[w])
538 			ppi->longest_allocation_wait[w] = elapsed;
545  * return time between queue and issue
547 static unsigned long long log_track_issue(struct blk_io_trace *t)
549 	unsigned long long elapsed;
550 	struct io_track *iot;
/* only fs requests carry meaningful sector info for tracking */
554 	if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
557 	iot = __find_track(t->device, t->sector);
559 		fprintf(stderr, "issue not found for (%d,%d): %llu\n",
560 			MAJOR(t->device), MINOR(t->device), t->sector);
564 	iot->dispatch_time = t->time;
565 	elapsed = iot->dispatch_time - iot->queue_time;
567 	if (per_process_stats) {
568 		struct per_process_info *ppi = find_process(iot->pid,iot->comm);
569 		int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
571 		if (ppi && elapsed > ppi->longest_dispatch_wait[w])
572 			ppi->longest_dispatch_wait[w] = elapsed;
579  * return time between dispatch and complete
581 static unsigned long long log_track_complete(struct blk_io_trace *t)
583 	unsigned long long elapsed;
584 	struct io_track *iot;
588 	if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
591 	iot = __find_track(t->device, t->sector);
593 		fprintf(stderr, "complete not found for (%d,%d): %llu\n",
594 			MAJOR(t->device), MINOR(t->device), t->sector);
598 	iot->completion_time = t->time;
599 	elapsed = iot->completion_time - iot->dispatch_time;
601 	if (per_process_stats) {
602 		struct per_process_info *ppi = find_process(iot->pid,iot->comm);
603 		int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
605 		if (ppi && elapsed > ppi->longest_completion_wait[w])
606 			ppi->longest_completion_wait[w] = elapsed;
610 	 * kill the trace, we don't need it after completion
612 	rb_erase(&iot->rb_node, &rb_track_root);
/*
 * Return the io_stats for (pid, name), creating and registering a new
 * per_process_info on first sight.
 * NOTE(review): malloc result is memset/used without a visible NULL
 * check — confirm OOM handling in the full source.
 */
619 static struct io_stats *find_process_io_stats(__u32 pid, char *name)
621 	struct per_process_info *ppi = find_process(pid, name);
624 		ppi = malloc(sizeof(*ppi));
625 		memset(ppi, 0, sizeof(*ppi));
/* comm names are a fixed 16 bytes, matching the kernel's comm field */
626 		memcpy(ppi->name, name, 16);
628 		add_process_to_hash(ppi);
629 		add_process_to_list(ppi);
632 	return &ppi->io_stats;
/*
 * Grow pdi->cpus so index `cpu` is valid; newly added slots are zeroed.
 * NOTE(review): realloc assigned over the source pointer pattern — the
 * elided lines appear to error out on failure, so no leak in practice.
 */
635 static void resize_cpu_info(struct per_dev_info *pdi, int cpu)
637 	struct per_cpu_info *cpus = pdi->cpus;
638 	int ncpus = pdi->ncpus;
639 	int new_count = cpu + 1;
643 	size = new_count * sizeof(struct per_cpu_info);
644 	cpus = realloc(cpus, size);
647 		fprintf(stderr, "Out of memory, CPU info for device %s (%d)\n",
648 			get_dev_name(pdi, name, sizeof(name)), size);
/* zero only the newly appended region */
652 	new_start = (char *)cpus + (ncpus * sizeof(struct per_cpu_info));
653 	new_space = (new_count - ncpus) * sizeof(struct per_cpu_info);
654 	memset(new_start, 0, new_space);
656 	pdi->ncpus = new_count;
/* Fetch (and lazily allocate) the per-CPU info slot for this device. */
660 static struct per_cpu_info *get_cpu_info(struct per_dev_info *pdi, int cpu)
662 	struct per_cpu_info *pci;
664 	if (cpu >= pdi->ncpus)
665 		resize_cpu_info(pdi, cpu);
667 	pci = &pdi->cpus[cpu];
/* Append one zeroed slot to the global devices array; name may be NULL. */
673 static int resize_devices(char *name)
675 	int size = (ndevices + 1) * sizeof(struct per_dev_info);
677 	devices = realloc(devices, size);
679 		fprintf(stderr, "Out of memory, device %s (%d)\n", name, size);
682 	memset(&devices[ndevices], 0, sizeof(struct per_dev_info));
683 	devices[ndevices].name = name;
/* Find a device by dev_t id, creating a fresh entry when unseen. */
688 static struct per_dev_info *get_dev_info(dev_t id)
690 	struct per_dev_info *pdi;
693 	for (i = 0; i < ndevices; i++) {
696 		if (devices[i].id == id)
700 	if (resize_devices(NULL))
703 	pdi = &devices[ndevices - 1];
705 	pdi->last_sequence = 0;
706 	pdi->last_read_time = 0;
707 	memset(&pdi->rb_last, 0, sizeof(pdi->rb_last));
708 	pdi->rb_last_entries = 0;
/* Human-readable device label: the -i name if set, else "major,minor". */
712 static char *get_dev_name(struct per_dev_info *pdi, char *buffer, int size)
715 		snprintf(buffer, size, "%s", pdi->name);
717 		snprintf(buffer, size, "%d,%d", MAJOR(pdi->id), MINOR(pdi->id));
/* Flag time going backwards ('B') between consecutive reported events. */
721 static void check_time(struct per_dev_info *pdi, struct blk_io_trace *bit)
723 	unsigned long long this = bit->time;
724 	unsigned long long last = pdi->last_reported_time;
726 	pdi->backwards = (this < last) ? 'B' : ' ';
727 	pdi->last_reported_time = this;
/*
 * Accounting helpers. Each account_*() updates the per-CPU stats and,
 * when -s is active, the owning process's stats too. Byte counts are
 * folded into KiB via >> 10. The read/write branch conditions are on
 * lines elided from this view.
 */
730 static inline void __account_m(struct io_stats *ios, struct blk_io_trace *t,
735 		ios->qwrite_kb += t->bytes >> 10;
738 		ios->qread_kb += t->bytes >> 10;
742 static inline void account_m(struct blk_io_trace *t, struct per_cpu_info *pci,
745 	__account_m(&pci->io_stats, t, rw);
747 	if (per_process_stats) {
748 		struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
750 		__account_m(ios, t, rw);
/* queued-request accounting (Q events) */
754 static inline void __account_queue(struct io_stats *ios, struct blk_io_trace *t,
759 		ios->qwrite_kb += t->bytes >> 10;
762 		ios->qread_kb += t->bytes >> 10;
766 static inline void account_queue(struct blk_io_trace *t,
767 				 struct per_cpu_info *pci, int rw)
769 	__account_queue(&pci->io_stats, t, rw);
771 	if (per_process_stats) {
772 		struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
774 		__account_queue(ios, t, rw);
/* completion accounting; bytes may be negative for requeues */
778 static inline void __account_c(struct io_stats *ios, int rw, unsigned int bytes)
782 		ios->cwrite_kb += bytes >> 10;
785 		ios->cread_kb += bytes >> 10;
789 static inline void account_c(struct blk_io_trace *t, struct per_cpu_info *pci,
792 	__account_c(&pci->io_stats, rw, bytes);
794 	if (per_process_stats) {
795 		struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
797 		__account_c(ios, rw, bytes);
/* dispatch/issue accounting (D events) */
801 static inline void __account_issue(struct io_stats *ios, int rw,
806 		ios->iwrite_kb += bytes >> 10;
809 		ios->iread_kb += bytes >> 10;
813 static inline void account_issue(struct blk_io_trace *t,
814 				 struct per_cpu_info *pci, int rw)
816 	__account_issue(&pci->io_stats, rw, t->bytes);
818 	if (per_process_stats) {
819 		struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
821 		__account_issue(ios, rw, t->bytes);
/* unplug accounting: timer-driven vs explicit io unplugs */
825 static inline void __account_unplug(struct io_stats *ios, int timer)
828 		ios->timer_unplugs++;
833 static inline void account_unplug(struct blk_io_trace *t,
834 				  struct per_cpu_info *pci, int timer)
836 	__account_unplug(&pci->io_stats, timer);
838 	if (per_process_stats) {
839 		struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
841 		__account_unplug(ios, timer);
/*
 * Thin per-action formatting wrappers. Each passes its action string and
 * an elapsed time (or -1 / -1ULL when no latency applies) to process_fmt.
 */
845 static void log_complete(struct per_cpu_info *pci, struct blk_io_trace *t,
848 	process_fmt(act, pci, t, log_track_complete(t), 0, NULL);
851 static void log_insert(struct per_cpu_info *pci, struct blk_io_trace *t,
854 	process_fmt(act, pci, t, log_track_insert(t), 0, NULL);
857 static void log_queue(struct per_cpu_info *pci, struct blk_io_trace *t,
860 	process_fmt(act, pci, t, -1, 0, NULL);
863 static void log_issue(struct per_cpu_info *pci, struct blk_io_trace *t,
866 	process_fmt(act, pci, t, log_track_issue(t), 0, NULL);
/* front merges additionally re-key the tracked io (elided condition) */
869 static void log_merge(struct per_cpu_info *pci, struct blk_io_trace *t,
873 		log_track_frontmerge(t);
875 	process_fmt(act, pci, t, -1ULL, 0, NULL);
878 static void log_action(struct per_cpu_info *pci, struct blk_io_trace *t,
881 	process_fmt(act, pci, t, -1ULL, 0, NULL);
884 static void log_generic(struct per_cpu_info *pci, struct blk_io_trace *t,
887 	process_fmt(act, pci, t, -1ULL, 0, NULL);
890 static void log_unplug(struct per_cpu_info *pci, struct blk_io_trace *t,
893 	process_fmt(act, pci, t, -1ULL, 0, NULL);
896 static void log_split(struct per_cpu_info *pci, struct blk_io_trace *t,
899 	process_fmt(act, pci, t, -1ULL, 0, NULL);
/* pc (packet command) events carry a PDU payload after the header */
902 static void log_pc(struct per_cpu_info *pci, struct blk_io_trace *t, char *act)
904 	unsigned char *buf = (unsigned char *) t + sizeof(*t);
906 	process_fmt(act, pci, t, -1ULL, t->pdu_len, buf);
/*
 * Dispatch a pc (SCSI packet command) trace by its action code.
 * NOTE(review): several case labels and break statements are on lines
 * elided from this view.
 */
909 static void dump_trace_pc(struct blk_io_trace *t, struct per_cpu_info *pci)
911 	int act = t->action & 0xffff;
915 			log_generic(pci, t, "Q");
918 			log_generic(pci, t, "G");
920 		case __BLK_TA_SLEEPRQ:
921 			log_generic(pci, t, "S");
923 		case __BLK_TA_REQUEUE:
924 			log_generic(pci, t, "R");
929 		case __BLK_TA_COMPLETE:
932 		case __BLK_TA_INSERT:
936 			fprintf(stderr, "Bad pc action %x\n", act);
/*
 * Dispatch a filesystem request trace: account it, then emit its
 * one-letter action code (Q/I/M/F/G/S/R/D/C/P/U/UT/X/B).
 */
941 static void dump_trace_fs(struct blk_io_trace *t, struct per_cpu_info *pci)
943 	int w = t->action & BLK_TC_ACT(BLK_TC_WRITE);
944 	int act = t->action & 0xffff;
948 			account_queue(t, pci, w);
949 			log_queue(pci, t, "Q");
951 		case __BLK_TA_INSERT:
952 			log_insert(pci, t, "I");
954 		case __BLK_TA_BACKMERGE:
955 			account_m(t, pci, w);
956 			log_merge(pci, t, "M");
958 		case __BLK_TA_FRONTMERGE:
959 			account_m(t, pci, w);
960 			log_merge(pci, t, "F");
964 			log_generic(pci, t, "G");
966 		case __BLK_TA_SLEEPRQ:
967 			log_generic(pci, t, "S");
/* a requeue backs out the prior completion: negative byte count */
969 		case __BLK_TA_REQUEUE:
970 			account_c(t, pci, w, -t->bytes);
971 			log_queue(pci, t, "R");
974 			account_issue(t, pci, w);
975 			log_issue(pci, t, "D");
977 		case __BLK_TA_COMPLETE:
978 			account_c(t, pci, w, t->bytes);
979 			log_complete(pci, t, "C");
982 			log_action(pci, t, "P");
984 		case __BLK_TA_UNPLUG_IO:
985 			account_unplug(t, pci, 0);
986 			log_unplug(pci, t, "U");
988 		case __BLK_TA_UNPLUG_TIMER:
989 			account_unplug(t, pci, 1);
990 			log_unplug(pci, t, "UT");
993 			log_split(pci, t, "X");
995 		case __BLK_TA_BOUNCE:
996 			log_generic(pci, t, "B");
999 			fprintf(stderr, "Bad fs action %x\n", t->action);
/* Route a trace to the pc or fs handler based on its action class. */
1004 static void dump_trace(struct blk_io_trace *t, struct per_cpu_info *pci,
1005 		       struct per_dev_info *pdi)
1007 	if (t->action & BLK_TC_ACT(BLK_TC_PC))
1008 		dump_trace_pc(t, pci);
1010 		dump_trace_fs(t, pci);
/*
 * Print one io_stats block under the given heading. The %' flag groups
 * thousands per the LC_NUMERIC locale set in main().
 */
1015 static void dump_io_stats(struct io_stats *ios, char *msg)
1017 	fprintf(ofp, "%s\n", msg);
1019 	fprintf(ofp, " Reads Queued:    %'8lu, %'8LuKiB\t", ios->qreads, ios->qread_kb);
1020 	fprintf(ofp, " Writes Queued:    %'8lu, %'8LuKiB\n", ios->qwrites,ios->qwrite_kb);
1022 	fprintf(ofp, " Read Dispatches: %'8lu, %'8LuKiB\t", ios->ireads, ios->iread_kb);
1023 	fprintf(ofp, " Write Dispatches: %'8lu, %'8LuKiB\n", ios->iwrites,ios->iwrite_kb);
1024 	fprintf(ofp, " Reads Completed: %'8lu, %'8LuKiB\t", ios->creads, ios->cread_kb);
1025 	fprintf(ofp, " Writes Completed: %'8lu, %'8LuKiB\n", ios->cwrites,ios->cwrite_kb);
1026 	fprintf(ofp, " Read Merges:     %'8lu%8c\t", ios->mreads, ' ');
1027 	fprintf(ofp, " Write Merges:     %'8lu\n", ios->mwrites);
1028 	fprintf(ofp, " IO unplugs:      %'8lu%8c\t", ios->io_unplugs, ' ');
1029 	fprintf(ofp, " Timer unplugs:    %'8lu\n", ios->timer_unplugs);
/*
 * Print a process's worst-case waits, read column then write column,
 * converted from nanoseconds to microseconds (/ 1000).
 */
1032 static void dump_wait_stats(struct per_process_info *ppi)
1034 	unsigned long rawait = ppi->longest_allocation_wait[0] / 1000;
1035 	unsigned long rdwait = ppi->longest_dispatch_wait[0] / 1000;
1036 	unsigned long rcwait = ppi->longest_completion_wait[0] / 1000;
1037 	unsigned long wawait = ppi->longest_allocation_wait[1] / 1000;
1038 	unsigned long wdwait = ppi->longest_dispatch_wait[1] / 1000;
1039 	unsigned long wcwait = ppi->longest_completion_wait[1] / 1000;
1041 	fprintf(ofp, " Allocation wait: %'8lu%8c\t", rawait, ' ');
1042 	fprintf(ofp, " Allocation wait:  %'8lu\n", wawait);
1043 	fprintf(ofp, " Dispatch wait:   %'8lu%8c\t", rdwait, ' ');
1044 	fprintf(ofp, " Dispatch wait:    %'8lu\n", wdwait);
1045 	fprintf(ofp, " Completion wait: %'8lu%8c\t", rcwait, ' ');
1046 	fprintf(ofp, " Completion wait:  %'8lu\n", wcwait);
/*
 * qsort comparator for per_process_info pointers: version-aware name
 * compare first, pid as the tiebreaker.
 */
1049 static int ppi_name_compare(const void *p1, const void *p2)
1051 	struct per_process_info *ppi1 = *((struct per_process_info **) p1);
1052 	struct per_process_info *ppi2 = *((struct per_process_info **) p2);
1055 	res = strverscmp(ppi1->name, ppi2->name);
1057 		res = ppi1->pid > ppi2->pid;
/*
 * Sort the global ppi list by name: copy list -> array, qsort, then
 * rebuild the linked list in reverse index order so traversal is sorted.
 */
1062 static void sort_process_list(void)
1064 	struct per_process_info **ppis;
1065 	struct per_process_info *ppi;
1068 	ppis = malloc(ppi_list_entries * sizeof(struct per_process_info *));
1073 		ppi = ppi->list_next;
1076 	qsort(ppis, ppi_list_entries, sizeof(ppi), ppi_name_compare);
1078 	i = ppi_list_entries - 1;
1083 		ppi->list_next = ppi_list;
/*
 * Emit per-process io stats (-s). Processes whose name maps to multiple
 * pids are labelled "name (pid, ...)".
 */
1091 static void show_process_stats(void)
1093 	struct per_process_info *ppi;
1095 	sort_process_list();
1101 		if (ppi->more_than_one)
1102 			sprintf(name, "%s (%u, ...)", ppi->name, ppi->pid);
1104 			sprintf(name, "%s (%u)", ppi->name, ppi->pid);
1106 		dump_io_stats(&ppi->io_stats, name);
1107 		dump_wait_stats(ppi);
1108 		ppi = ppi->list_next;
/*
 * For every device: print per-CPU stats for each CPU that saw events,
 * accumulate them into a device total, print the total when more than
 * one CPU contributed, then the event/skip summary line.
 */
1114 static void show_device_and_cpu_stats(void)
1116 	struct per_dev_info *pdi;
1117 	struct per_cpu_info *pci;
1118 	struct io_stats total, *ios;
1119 	int i, j, pci_events;
/* line buffer sized for "CPUnnn (devname):" style headings */
1120 	char line[3 + 8/*cpu*/ + 2 + 32/*dev*/ + 3];
1123 	for (pdi = devices, i = 0; i < ndevices; i++, pdi++) {
1125 		memset(&total, 0, sizeof(total));
1131 		for (pci = pdi->cpus, j = 0; j < pdi->ncpus; j++, pci++) {
1135 			ios = &pci->io_stats;
/* fold this CPU's counters into the device total */
1136 			total.qreads += ios->qreads;
1137 			total.qwrites += ios->qwrites;
1138 			total.creads += ios->creads;
1139 			total.cwrites += ios->cwrites;
1140 			total.mreads += ios->mreads;
1141 			total.mwrites += ios->mwrites;
1142 			total.ireads += ios->ireads;
1143 			total.iwrites += ios->iwrites;
1144 			total.qread_kb += ios->qread_kb;
1145 			total.qwrite_kb += ios->qwrite_kb;
1146 			total.cread_kb += ios->cread_kb;
1147 			total.cwrite_kb += ios->cwrite_kb;
1148 			total.iread_kb += ios->iread_kb;
1149 			total.iwrite_kb += ios->iwrite_kb;
1150 			total.timer_unplugs += ios->timer_unplugs;
1151 			total.io_unplugs += ios->io_unplugs;
1153 			snprintf(line, sizeof(line) - 1, "CPU%d (%s):",
1154 				 j, get_dev_name(pdi, name, sizeof(name)));
1155 			dump_io_stats(ios, line);
/* the total is redundant when only one CPU had events */
1159 		if (pci_events > 1) {
1161 			snprintf(line, sizeof(line) - 1, "Total (%s):",
1162 				 get_dev_name(pdi, name, sizeof(name)));
1163 			dump_io_stats(&total, line);
1166 		fprintf(ofp, "\nEvents (%s): %'Lu entries, %'lu skips\n",
1167 			get_dev_name(pdi, line, sizeof(line)), pdi->events,
1173  * struct trace and blktrace allocation cache, we do potentially
1174  * millions of mallocs for these structures while only using at most
1175  * a few thousand at the time
/* return a trace to the free list instead of free()ing it */
1177 static inline void t_free(struct trace *t)
1179 	t->next = t_alloc_list;
/* pop a recycled trace, or fall back to malloc on an empty list */
1183 static inline struct trace *t_alloc(void)
1185 	struct trace *t = t_alloc_list;
1188 		t_alloc_list = t->next;
1192 	return malloc(sizeof(*t));
1195 static inline void bit_free(struct blk_io_trace *bit)
1198 	 * abuse a 64-bit field for a next pointer for the free item
1200 	bit->time = (__u64) (unsigned long) bit_alloc_list;
1201 	bit_alloc_list = (struct blk_io_trace *) bit;
/* pop a recycled blk_io_trace; the next pointer lives in bit->time */
1204 static inline struct blk_io_trace *bit_alloc(void)
1206 	struct blk_io_trace *bit = bit_alloc_list;
1209 		bit_alloc_list = (struct blk_io_trace *) (unsigned long) \
1214 	return malloc(sizeof(*bit));
/* Scan the pending trace list for the earliest timestamp seen. */
1217 static void find_genesis(void)
1219 	struct trace *t = trace_list;
1221 	genesis_time = -1ULL;
1223 		if (t->bit->time < genesis_time)
1224 			genesis_time = t->bit->time;
/* true when the event falls inside the -w interval (elided branches). */
1230 static inline int check_stopwatch(struct blk_io_trace *bit)
1232 	if (bit->time < stopwatch_end &&
1233 	    bit->time >= stopwatch_start)
1240  * return youngest entry read
/*
 * Move traces from trace_list into the time-sorted rbtree, normalising
 * times to genesis and discarding events outside the stopwatch window.
 */
1242 static int sort_entries(unsigned long long *youngest)
1250 	while ((t = trace_list) != NULL) {
1251 		struct blk_io_trace *bit = t->bit;
1253 		trace_list = t->next;
1255 		if (verify_trace(bit))
/* all displayed times are relative to the first event */
1258 		bit->time -= genesis_time;
1260 		if (check_stopwatch(bit)) {
1266 		if (trace_rb_insert_sort(t))
1269 		if (bit->time < *youngest)
1270 			*youngest = bit->time;
/*
 * Retire a displayed trace: drop it from the sort tree, keep it in the
 * device's rb_last tree for sequence back-references, and cap that tree
 * at 1024 entries by evicting the oldest.
 */
1276 static inline void put_trace(struct per_dev_info *pdi, struct trace *t)
1278 	rb_erase(&t->rb_node, &rb_sort_root);
1281 	trace_rb_insert_last(pdi, t);
1283 	if (pdi->rb_last_entries > 1024) {
1284 		struct rb_node *n = rb_first(&pdi->rb_last);
1286 		t = rb_entry(n, struct trace, rb_node);
1287 		rb_erase(n, &pdi->rb_last);
1288 		pdi->rb_last_entries--;
/*
 * Decide whether bit may be displayed now, given the device's last
 * displayed sequence number. Looks for the expected sequence in the
 * pending sort tree and in the already-displayed rb_last tree; otherwise
 * waits for more input (unless `force`), reporting a skip when giving up.
 */
1295 static int check_sequence(struct per_dev_info *pdi, struct blk_io_trace *bit,
1298 	unsigned long expected_sequence = pdi->last_sequence + 1;
1301 	if (bit->sequence == expected_sequence)
/* sequence went backwards but time forwards: already-handled event */
1304 	if (bit->sequence < expected_sequence &&
1305 	    bit->time > pdi->last_reported_time)
1309 	 * the wanted sequence is really there, continue
1310 	 * because this means that the log time is earlier
1311 	 * on the trace we have now */
1312 	t = trace_rb_find_sort(pdi->id, expected_sequence);
1316 		t = trace_rb_find_last(pdi, expected_sequence);
1321 	 * unless this is the last run, break and wait for more entries
1326 	fprintf(stderr, "(%d,%d): skipping %lu -> %u\n", MAJOR(pdi->id),
1327 		MINOR(pdi->id), pdi->last_sequence, bit->sequence);
/*
 * Drain the sort tree in time order, validating device/CPU ids and
 * sequence continuity, and dump each trace. With !force, stop at
 * last_allowed_time so out-of-order late arrivals can still be merged.
 */
1332 static void show_entries_rb(int force)
1334 	struct per_dev_info *pdi = NULL;
1335 	struct per_cpu_info *pci = NULL;
1336 	struct blk_io_trace *bit;
1340 	while ((n = rb_first(&rb_sort_root)) != NULL) {
1344 		t = rb_entry(n, struct trace, rb_node);
/* cache the device lookup across consecutive same-device events */
1347 		if (!pdi || pdi->id != bit->device)
1348 			pdi = get_dev_info(bit->device);
1351 			fprintf(stderr, "Unknown device ID? (%d,%d)\n",
1352 				MAJOR(bit->device), MINOR(bit->device));
1356 		if (bit->cpu > pdi->ncpus) {
1357 			fprintf(stderr, "Unknown CPU ID? (%d, device %d,%d)\n",
1358 				bit->cpu, MAJOR(bit->device),
1359 				MINOR(bit->device));
1363 		if (check_sequence(pdi, bit, force))
1366 		if (!force && bit->time > last_allowed_time)
1369 		pdi->last_sequence = bit->sequence;
1371 		check_time(pdi, bit);
1373 		if (!pci || pci->cpu != bit->cpu)
1374 			pci = get_cpu_info(pdi, bit->cpu);
1376 		dump_trace(bit, pci, pdi);
/*
 * Read exactly `bytes` from fd into buffer. `block` selects blocking vs
 * non-blocking mode via fcntl(O_NONBLOCK); in non-blocking mode EAGAIN
 * is tolerated (partial-read handling is on elided lines).
 */
1382 static int read_data(int fd, void *buffer, int bytes, int block)
1384 	int ret, bytes_left, fl;
1387 	fl = fcntl(fd, F_GETFL);
1390 		fcntl(fd, F_SETFL, fl | O_NONBLOCK);
1392 		fcntl(fd, F_SETFL, fl & ~O_NONBLOCK);
1396 	while (bytes_left > 0) {
1397 		ret = read(fd, p, bytes_left);
1401 			if (errno != EAGAIN)
/*
 * Read up to rb_batch events from fd onto trace_list. Each record is a
 * big-endian blk_io_trace header (magic verified), optionally followed
 * by a pdu payload; tracks the newest timestamp per device.
 */
1414 static int read_events(int fd, int always_block)
1416 	struct per_dev_info *pdi = NULL;
1419 	while (!is_done() && events < rb_batch) {
1420 		struct blk_io_trace *bit;
/* first read of a batch may block; later reads are opportunistic */
1427 		if (read_data(fd, bit, sizeof(*bit), !events || always_block))
1430 		magic = be32_to_cpu(bit->magic);
/* low byte of the magic carries the trace format version */
1431 		if ((magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
1432 			fprintf(stderr, "Bad magic %x\n", magic);
1436 		pdu_len = be16_to_cpu(bit->pdu_len);
1438 			void *ptr = realloc(bit, sizeof(*bit) + pdu_len);
1440 			if (read_data(fd, ptr + sizeof(*bit), pdu_len, 1))
1447 		memset(t, 0, sizeof(*t));
1452 		t->next = trace_list;
1455 		if (!pdi || pdi->id != bit->device)
1456 			pdi = get_dev_info(bit->device);
1458 		if (bit->time > pdi->last_read_time)
1459 			pdi->last_read_time = bit->time;
/*
 * File-input mode driver: open every <dev>.blktrace.<cpu> file found,
 * then repeatedly read/sort/show batches across all files until no file
 * yields more events. last_allowed_time is the minimum of the devices'
 * newest timestamps, bounding what may safely be displayed.
 */
1467 static int do_file(void)
1469 	struct per_cpu_info *pci;
1470 	struct per_dev_info *pdi;
1471 	int i, j, events, events_added;
1474 	 * first prepare all files for reading
1476 	for (i = 0; i < ndevices; i++) {
1479 		pdi->last_sequence = 0;
1484 			pci = get_cpu_info(pdi, j);
1488 			snprintf(pci->fname, sizeof(pci->fname)-1,
1489 				 "%s.blktrace.%d", pdi->name, pci->cpu);
/* a missing per-CPU file just means that CPU logged nothing */
1490 			if (stat(pci->fname, &st) < 0)
1493 			pci->fd = open(pci->fname, O_RDONLY);
1500 			printf("Input file %s added\n", pci->fname);
1506 	 * now loop over the files reading in the data
1509 		unsigned long long youngest;
1512 		last_allowed_time = -1ULL;
1514 		for (i = 0; i < ndevices; i++) {
1517 			for (j = 0; j < pdi->nfiles; j++) {
1519 				pci = get_cpu_info(pdi, j);
1524 				events = read_events(pci->fd, 1);
1531 				if (pdi->last_read_time < last_allowed_time)
1532 					last_allowed_time = pdi->last_read_time;
1534 				events_added += events;
1538 		if (sort_entries(&youngest))
/* past the -w window: nothing further can qualify for display */
1541 		if (youngest > stopwatch_end)
1546 	} while (events_added);
/* flush whatever remains in the sort tree */
1548 	if (rb_sort_entries)
/*
 * Stdin-input mode driver (-i -): same read/sort/show loop over a
 * single dup'ed stdin descriptor.
 */
1554 static int do_stdin(void)
1556 	unsigned long long youngest;
1559 	last_allowed_time = -1ULL;
1560 	fd = dup(STDIN_FILENO);
1564 		events = read_events(fd, 0);
1568 		if (sort_entries(&youngest))
1571 		if (youngest > stopwatch_end)
1577 	if (rb_sort_entries)
/* Flush buffered output (body elided from this view). */
1584 static void flush_output(void)
/* SIGINT/SIGHUP/SIGTERM: request shutdown of the main loops. */
1589 static void handle_sigint(int sig)
1596  * Extract start and duration times from a string, allowing
1597  * us to specify a time interval of interest within a trace.
1598  * Format: "duration" (start is zero) or "start:duration".
1600 static int find_stopwatch_interval(char *string)
1605 	value = strtod(string, &sp);
1607 		fprintf(stderr,"Invalid stopwatch timer: %s\n", string);
/* first number parsed; a ':' means it was the start, parse duration next */
1611 	stopwatch_start = DOUBLE_TO_NANO_ULL(value);
1613 		value = strtod(string, &sp);
1614 		if (sp == string || *sp != '\0') {
1615 			fprintf(stderr,"Invalid stopwatch duration time: %s\n",
1619 	} else if (*sp != '\0') {
1620 		fprintf(stderr,"Invalid stopwatch start timer: %s\n", string);
1623 	stopwatch_end = DOUBLE_TO_NANO_ULL(value);
1624 	if (stopwatch_end <= stopwatch_start) {
1625 		fprintf(stderr, "Invalid stopwatch interval: %Lu -> %Lu\n",
1626 			stopwatch_start, stopwatch_end);
/* Usage text shown on bad options; option list mirrors S_OPTS/l_opts. */
1633 static char usage_str[] = \
1634 	"[ -i <input name> ] [-o <output name> [ -s ] [ -t ] [ -q ]\n" \
1635 	"[ -w start:stop ] [ -f output format ] [ -F format spec ] [ -v] \n\n" \
1636 	"\t-i Input file containing trace data, or '-' for stdin\n" \
1637 	"\t-o Output file. If not given, output is stdout\n" \
1638 	"\t-b stdin read batching\n" \
1639 	"\t-s Show per-program io statistics\n" \
1640 	"\t-n Hash processes by name, not pid\n" \
1641 	"\t-t Track individual ios. Will tell you the time a request took\n" \
1642 	"\t   to get queued, to get dispatched, and to get completed\n" \
1643 	"\t-q Quiet. Don't display any stats at the end of the trace\n" \
1644 	"\t-w Only parse data between the given time interval in seconds.\n" \
1645 	"\t   If 'start' isn't given, blkparse defaults the start time to 0\n" \
1646 	"\t -f Output format. Customize the output format. The format field\n" \
1647 	"\t    identifies can be found in the documentation\n" \
1648 	"\t-F Format specification. Can be found in the documentation\n" \
1649 	"\t-v Print program version info\n\n";
/* Print usage with program name and version to stderr. */
1651 static void usage(char *prog)
1653 	fprintf(stderr, "Usage: %s %s %s", prog, blkparse_version, usage_str);
1656 int main(int argc, char *argv[])
1660 int per_device_and_cpu_stats = 1;
1662 while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) {
1665 if (!strcmp(optarg, "-") && !pipeline)
1667 else if (resize_devices(optarg) != 0)
1671 output_name = optarg;
1674 rb_batch = atoi(optarg);
1676 rb_batch = RB_BATCH_DEFAULT;
1679 per_process_stats = 1;
1685 per_device_and_cpu_stats = 0;
1688 if (find_stopwatch_interval(optarg) != 0)
1692 set_all_format_specs(optarg);
1695 if (add_format_spec(optarg) != 0)
1699 ppi_hash_by_pid = 0;
1702 printf("%s version %s\n", argv[0], blkparse_version);
1710 while (optind < argc) {
1711 if (!strcmp(argv[optind], "-") && !pipeline)
1713 else if (resize_devices(argv[optind]) != 0)
1718 if (!pipeline && !ndevices) {
1723 memset(&rb_sort_root, 0, sizeof(rb_sort_root));
1724 memset(&rb_track_root, 0, sizeof(rb_track_root));
1726 signal(SIGINT, handle_sigint);
1727 signal(SIGHUP, handle_sigint);
1728 signal(SIGTERM, handle_sigint);
1730 setlocale(LC_NUMERIC, "en_US");
1733 ofp = fdopen(STDOUT_FILENO, "w");
1738 snprintf(ofname, sizeof(ofname) - 1, "%s", output_name);
1739 ofp = fopen(ofname, "w");
1748 ofp_buffer = malloc(4096);
1749 if (setvbuf(ofp, ofp_buffer, mode, 4096)) {
1759 if (per_process_stats)
1760 show_process_stats();
1762 if (per_device_and_cpu_stats)
1763 show_device_and_cpu_stats();