2 * block queue tracing parse application
4 * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 #include <sys/types.h>
/*
 * NOTE(review): this listing is a sampled fragment of blkparse.c -- many
 * interior lines (including enclosing struct/brace lines) are missing.
 * Comments below describe apparent intent only; confirm against the full file.
 */
38 static char blkparse_version[] = "0.90";
/* per-device bookkeeping fields (the enclosing struct definition is not visible here) */
45 unsigned long long events;
46 unsigned long long last_reported_time;
47 unsigned long long last_read_time;
48 struct io_stats io_stats;
49 unsigned long last_sequence;
54 struct per_cpu_info *cpus;
/* per-process accounting: aggregated io stats plus worst-case wait times */
57 struct per_process_info {
60 struct io_stats io_stats;
61 struct per_process_info *hash_next, *list_next;
/* [2] arrays: index 0/1 presumably read/write -- see the 'w' flag in log_track_* below */
67 unsigned long long longest_allocation_wait[2];
68 unsigned long long longest_dispatch_wait[2];
69 unsigned long long longest_completion_wait[2];
/* 256-bucket chained hash of per_process_info, plus a linear list used for sorting */
72 #define PPI_HASH_SHIFT (8)
73 #define PPI_HASH_SIZE (1 << PPI_HASH_SHIFT)
74 #define PPI_HASH_MASK (PPI_HASH_SIZE - 1)
75 static struct per_process_info *ppi_hash_table[PPI_HASH_SIZE];
76 static struct per_process_info *ppi_list;
77 static int ppi_list_entries;
/* short-option string for getopt_long; ':' marks options taking an argument */
79 #define S_OPTS "i:o:b:stqw:f:F:vn"
/* long-option table (most .name/.val fields fall in the sampled-out lines) */
80 static struct option l_opts[] = {
83 .has_arg = required_argument,
89 .has_arg = required_argument,
95 .has_arg = required_argument,
/* NOTE(review): spaces in long-option names are unusual for getopt_long -- verify */
100 .name = "per program stats",
101 .has_arg = no_argument,
107 .has_arg = no_argument,
113 .has_arg = no_argument,
119 .has_arg = required_argument,
125 .has_arg = required_argument,
130 .name = "format-spec",
131 .has_arg = required_argument,
136 .name = "hash by name",
137 .has_arg = no_argument,
143 .has_arg = no_argument,
/* struct trace: rb-tree node wrapping one blk_io_trace event, */
150 * for sorting the displayed output
153 struct blk_io_trace *bit;
154 struct rb_node rb_node;
/* rb_sort_root orders pending events for display; rb_track_root tracks in-flight ios */
158 static struct rb_root rb_sort_root;
159 static unsigned long rb_sort_entries;
161 static struct rb_root rb_track_root;
/* singly-linked list of freshly read, not-yet-sorted traces */
163 static struct trace *trace_list;
/* free-lists backing the t_alloc()/bit_alloc() caches further below */
168 static struct blk_io_trace *bit_alloc_list;
169 static struct trace *t_alloc_list;
/* struct io_track: per-(device,sector) lifetime timestamps */
172 * for tracking individual ios
175 struct rb_node rb_node;
181 unsigned long long allocation_time;
182 unsigned long long queue_time;
183 unsigned long long dispatch_time;
184 unsigned long long completion_time;
188 static struct per_dev_info *devices;
189 static char *get_dev_name(struct per_dev_info *, char *, int);
192 static char *output_name;
/* earliest timestamp seen; all event times are made relative to it */
194 static unsigned long long genesis_time;
195 static unsigned long long last_allowed_time;
196 static unsigned long long stopwatch_start; /* start from zero by default */
197 static unsigned long long stopwatch_end = ULONG_LONG_MAX; /* "infinity" */
/* behavior flags set from the command line in main() */
199 static int per_process_stats;
200 static int track_ios;
201 static int ppi_hash_by_pid = 1;
/* how many events read_events() ingests per batch before sorting */
203 #define RB_BATCH_DEFAULT (512)
204 static int rb_batch = RB_BATCH_DEFAULT;
/* volatile read of the signal-set 'done' flag (see handle_sigint) */
208 #define is_done() (*(volatile int *)(&done))
209 static volatile int done;
/* fixed salt mixed into the jhash calls below */
211 #define JHASH_RANDOM (0x3af5f2ee)
/* hash a pid into a PPI bucket index */
213 static inline int ppi_hash_pid(__u32 pid)
215 return jhash_1word(pid, JHASH_RANDOM) & PPI_HASH_MASK;
/* hash a command name (first 16 bytes) into a bucket index */
218 static inline int ppi_hash_name(const char *name)
220 return jhash(name, 16, JHASH_RANDOM) & PPI_HASH_MASK;
/* dispatch on ppi_hash_by_pid (the selecting 'if' is in a sampled-out line) */
223 static inline int ppi_hash(struct per_process_info *ppi)
226 return ppi_hash_pid(ppi->pid);
228 return ppi_hash_name(ppi->name);
/* push ppi onto the head of its hash chain */
231 static inline void add_process_to_hash(struct per_process_info *ppi)
233 const int hash_idx = ppi_hash(ppi);
235 ppi->hash_next = ppi_hash_table[hash_idx];
236 ppi_hash_table[hash_idx] = ppi;
/* push ppi onto the global list used later for sorting/reporting */
239 static inline void add_process_to_list(struct per_process_info *ppi)
241 ppi->list_next = ppi_list;
/* walk the name-hash chain comparing full names; returns NULL if absent */
246 static struct per_process_info *find_process_by_name(char *name)
248 const int hash_idx = ppi_hash_name(name);
249 struct per_process_info *ppi;
251 ppi = ppi_hash_table[hash_idx];
253 if (!strcmp(ppi->name, name))
256 ppi = ppi->hash_next;
/* walk the pid-hash chain (the pid comparison line is sampled out) */
262 static struct per_process_info *find_process_by_pid(__u32 pid)
264 const int hash_idx = ppi_hash_pid(pid);
265 struct per_process_info *ppi;
267 ppi = ppi_hash_table[hash_idx];
272 ppi = ppi->hash_next;
/* lookup by pid or name per ppi_hash_by_pid; flags name collisions across pids */
278 static struct per_process_info *find_process(__u32 pid, char *name)
280 struct per_process_info *ppi;
283 return find_process_by_pid(pid);
285 ppi = find_process_by_name(name);
286 if (ppi && ppi->pid != pid)
287 ppi->more_than_one = 1;
/*
 * Insert a trace into the sort tree ordered by (time, device, sequence).
 * An exact duplicate key is a "sequence alias" and is reported, presumably
 * rejected (the return statements fall in sampled-out lines).
 */
292 static inline int trace_rb_insert(struct trace *t)
294 struct rb_node **p = &rb_sort_root.rb_node;
295 struct rb_node *parent = NULL;
300 __t = rb_entry(parent, struct trace, rb_node);
302 if (t->bit->time < __t->bit->time)
304 else if (t->bit->time > __t->bit->time)
306 else if (t->bit->device < __t->bit->device)
308 else if (t->bit->device > __t->bit->device)
310 else if (t->bit->sequence < __t->bit->sequence)
312 else if (t->bit->sequence > __t->bit->sequence)
314 else if (t->bit->device == __t->bit->device) {
316 "sequence alias (%d) on device %d,%d!\n",
318 MAJOR(t->bit->device), MINOR(t->bit->device));
324 rb_link_node(&t->rb_node, parent, p);
325 rb_insert_color(&t->rb_node, &rb_sort_root);
/* find a trace in the sort tree by (device, sequence); compare order mirrors insert */
329 static struct trace *trace_rb_find(dev_t device, unsigned long sequence)
331 struct rb_node **p = &rb_sort_root.rb_node;
332 struct rb_node *parent = NULL;
337 __t = rb_entry(parent, struct trace, rb_node);
339 if (device < __t->bit->device)
341 else if (device > __t->bit->device)
343 else if (sequence < __t->bit->sequence)
345 else if (sequence > __t->bit->sequence)
/*
 * Insert an io_track into the tracking tree keyed by (device, sector).
 * A duplicate key is a "sector alias" and is reported to stderr.
 */
354 static inline int track_rb_insert(struct io_track *iot)
356 struct rb_node **p = &rb_track_root.rb_node;
357 struct rb_node *parent = NULL;
358 struct io_track *__iot;
363 __iot = rb_entry(parent, struct io_track, rb_node);
365 if (iot->device < __iot->device)
367 else if (iot->device > __iot->device)
369 else if (iot->sector < __iot->sector)
371 else if (iot->sector > __iot->sector)
375 "sector alias (%Lu) on device %d,%d!\n",
376 (unsigned long long) iot->sector,
377 MAJOR(iot->device), MINOR(iot->device));
382 rb_link_node(&iot->rb_node, parent, p);
383 rb_insert_color(&iot->rb_node, &rb_track_root);
/* look up an in-flight io by (device, sector); NULL if not tracked */
387 static struct io_track *__find_track(dev_t device, __u64 sector)
389 struct rb_node **p = &rb_track_root.rb_node;
390 struct rb_node *parent = NULL;
391 struct io_track *__iot;
396 __iot = rb_entry(parent, struct io_track, rb_node);
398 if (device < __iot->device)
400 else if (device > __iot->device)
402 else if (sector < __iot->sector)
404 else if (sector > __iot->sector)
/*
 * Find-or-create an io_track for (device, sector); new entries record the
 * owning pid/comm and are inserted into the tracking tree.
 * NOTE(review): malloc's result appears to be used unchecked -- confirm in full file.
 */
413 static struct io_track *find_track(__u32 pid, char *comm, dev_t device,
416 struct io_track *iot;
418 iot = __find_track(device, sector);
420 iot = malloc(sizeof(*iot));
422 memcpy(iot->comm, comm, sizeof(iot->comm));
423 iot->device = device;
424 iot->sector = sector;
425 track_rb_insert(iot);
/*
 * Front merge: the tracked io now starts t->bytes>>9 sectors earlier.
 * The track is keyed by sector, so it must be re-inserted after rekeying.
 */
431 static void log_track_frontmerge(struct blk_io_trace *t)
433 struct io_track *iot;
438 iot = __find_track(t->device, t->sector + (t->bytes >> 9));
440 fprintf(stderr, "merge not found for (%d,%d): %llu\n",
441 MAJOR(t->device), MINOR(t->device),
442 t->sector + (t->bytes >> 9));
446 rb_erase(&iot->rb_node, &rb_track_root);
447 iot->sector -= t->bytes >> 9;
448 track_rb_insert(iot);
/* record request-allocation timestamp (G event) on the tracked io */
451 static void log_track_getrq(struct blk_io_trace *t)
453 struct io_track *iot;
458 iot = find_track(t->pid, t->comm, t->device, t->sector);
459 iot->allocation_time = t->time;
/*
 * Record queue (insert) time on the tracked io and,
 */
463 * return time between rq allocation and insertion
465 static unsigned long long log_track_insert(struct blk_io_trace *t)
467 unsigned long long elapsed;
468 struct io_track *iot;
473 iot = find_track(t->pid, t->comm, t->device, t->sector);
474 iot->queue_time = t->time;
/* no G event seen for this io: allocation wait is unknowable */
476 if (!iot->allocation_time)
479 elapsed = iot->queue_time - iot->allocation_time;
/* update the per-process worst-case allocation wait; w selects read/write slot */
481 if (per_process_stats) {
482 struct per_process_info *ppi = find_process(iot->pid,iot->comm);
483 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
485 if (ppi && elapsed > ppi->longest_allocation_wait[w])
486 ppi->longest_allocation_wait[w] = elapsed;
/*
 * Record dispatch time and
 */
493 * return time between queue and issue
495 static unsigned long long log_track_issue(struct blk_io_trace *t)
497 unsigned long long elapsed;
498 struct io_track *iot;
/* only fs requests are tracked here; pc (passthrough) traces are skipped */
502 if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
505 iot = __find_track(t->device, t->sector);
507 fprintf(stderr, "issue not found for (%d,%d): %llu\n",
508 MAJOR(t->device), MINOR(t->device), t->sector);
512 iot->dispatch_time = t->time;
513 elapsed = iot->dispatch_time - iot->queue_time;
515 if (per_process_stats) {
516 struct per_process_info *ppi = find_process(iot->pid,iot->comm);
517 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
519 if (ppi && elapsed > ppi->longest_dispatch_wait[w])
520 ppi->longest_dispatch_wait[w] = elapsed;
/*
 * Record completion time,
 */
527 * return time between dispatch and complete
529 static unsigned long long log_track_complete(struct blk_io_trace *t)
531 unsigned long long elapsed;
532 struct io_track *iot;
536 if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
539 iot = __find_track(t->device, t->sector);
541 fprintf(stderr, "complete not found for (%d,%d): %llu\n",
542 MAJOR(t->device), MINOR(t->device), t->sector);
546 iot->completion_time = t->time;
547 elapsed = iot->completion_time - iot->dispatch_time;
549 if (per_process_stats) {
550 struct per_process_info *ppi = find_process(iot->pid,iot->comm);
551 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
553 if (ppi && elapsed > ppi->longest_completion_wait[w])
554 ppi->longest_completion_wait[w] = elapsed;
558 * kill the trace, we don't need it after completion
560 rb_erase(&iot->rb_node, &rb_track_root);
/*
 * Return the io_stats slot for (pid, name), creating and registering a
 * zeroed per_process_info on first sight.
 * NOTE(review): malloc appears unchecked before memset -- confirm in full file.
 */
567 static struct io_stats *find_process_io_stats(__u32 pid, char *name)
569 struct per_process_info *ppi = find_process(pid, name);
572 ppi = malloc(sizeof(*ppi));
573 memset(ppi, 0, sizeof(*ppi));
/* fixed 16-byte comm copy, matching the jhash(name, 16, ...) above */
574 memcpy(ppi->name, name, 16);
576 add_process_to_hash(ppi);
577 add_process_to_list(ppi);
580 return &ppi->io_stats;
/*
 * Grow pdi->cpus so index 'cpu' is valid; newly added slots are zeroed.
 * NOTE(review): realloc result is assigned over the 'cpus' local -- the
 * original pointer would leak on failure, though the error path exits via
 * the fprintf below (follow-up lines are sampled out).
 */
583 static void resize_cpu_info(struct per_dev_info *pdi, int cpu)
585 struct per_cpu_info *cpus = pdi->cpus;
586 int ncpus = pdi->ncpus;
587 int new_count = cpu + 1;
591 size = new_count * sizeof(struct per_cpu_info);
592 cpus = realloc(cpus, size);
595 fprintf(stderr, "Out of memory, CPU info for device %s (%d)\n",
596 get_dev_name(pdi, name, sizeof(name)), size);
/* zero only the freshly appended region */
600 new_start = (char *)cpus + (ncpus * sizeof(struct per_cpu_info));
601 new_space = (new_count - ncpus) * sizeof(struct per_cpu_info);
602 memset(new_start, 0, new_space);
604 pdi->ncpus = new_count;
/* return per-cpu info for 'cpu', growing the array on demand */
608 static struct per_cpu_info *get_cpu_info(struct per_dev_info *pdi, int cpu)
610 struct per_cpu_info *pci;
612 if (cpu >= pdi->ncpus)
613 resize_cpu_info(pdi, cpu);
615 pci = &pdi->cpus[cpu];
/* append one zeroed slot to the global devices array; name may be NULL */
621 static int resize_devices(char *name)
623 int size = (ndevices + 1) * sizeof(struct per_dev_info);
625 devices = realloc(devices, size);
627 fprintf(stderr, "Out of memory, device %s (%d)\n", name, size);
630 memset(&devices[ndevices], 0, sizeof(struct per_dev_info));
631 devices[ndevices].name = name;
/*
 * Find the per_dev_info for device id, creating one (via resize_devices)
 * if it has not been seen before.
 */
636 static struct per_dev_info *get_dev_info(dev_t id)
638 struct per_dev_info *pdi;
641 for (i = 0; i < ndevices; i++) {
644 if (devices[i].id == id)
648 if (resize_devices(NULL) != 0)
/* fresh slot appended by resize_devices; reset its streaming state */
651 pdi = &devices[ndevices - 1];
653 pdi->last_sequence = 0;
654 pdi->last_read_time = 0;
/* format a display name: the given name if present, else "major,minor" */
658 static char *get_dev_name(struct per_dev_info *pdi, char *buffer, int size)
661 snprintf(buffer, size, "%s", pdi->name);
663 snprintf(buffer, size, "%d,%d", MAJOR(pdi->id), MINOR(pdi->id));
/* flag time going backwards ('B') relative to the last reported event */
667 static void check_time(struct per_dev_info *pdi, struct blk_io_trace *bit)
669 unsigned long long this = bit->time;
670 unsigned long long last = pdi->last_reported_time;
672 pdi->backwards = (this < last) ? 'B' : ' ';
673 pdi->last_reported_time = this;
/*
 * Accounting helpers. Pattern throughout: __account_X updates one io_stats
 * struct; account_X applies it to the per-cpu stats and, when enabled,
 * to the owning process's stats as well. The read/write branch conditions
 * fall in sampled-out lines.
 */
676 static inline void __account_m(struct io_stats *ios, struct blk_io_trace *t,
681 ios->qwrite_kb += t->bytes >> 10;
684 ios->qread_kb += t->bytes >> 10;
688 static inline void account_m(struct blk_io_trace *t, struct per_cpu_info *pci,
691 __account_m(&pci->io_stats, t, rw);
693 if (per_process_stats) {
694 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
696 __account_m(ios, t, rw);
/* queued-request accounting (Q events) */
700 static inline void __account_queue(struct io_stats *ios, struct blk_io_trace *t,
705 ios->qwrite_kb += t->bytes >> 10;
708 ios->qread_kb += t->bytes >> 10;
712 static inline void account_queue(struct blk_io_trace *t,
713 struct per_cpu_info *pci, int rw)
715 __account_queue(&pci->io_stats, t, rw);
717 if (per_process_stats) {
718 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
720 __account_queue(ios, t, rw);
/* completion accounting; bytes may be negative for requeues (see dump_trace_fs) */
724 static inline void __account_c(struct io_stats *ios, int rw, unsigned int bytes)
728 ios->cwrite_kb += bytes >> 10;
731 ios->cread_kb += bytes >> 10;
735 static inline void account_c(struct blk_io_trace *t, struct per_cpu_info *pci,
738 __account_c(&pci->io_stats, rw, bytes);
740 if (per_process_stats) {
741 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
743 __account_c(ios, rw, bytes);
/* dispatch (issue) accounting (D events) */
747 static inline void __account_issue(struct io_stats *ios, int rw,
752 ios->iwrite_kb += bytes >> 10;
755 ios->iread_kb += bytes >> 10;
759 static inline void account_issue(struct blk_io_trace *t,
760 struct per_cpu_info *pci, int rw)
762 __account_issue(&pci->io_stats, rw, t->bytes);
764 if (per_process_stats) {
765 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
767 __account_issue(ios, rw, t->bytes);
/* unplug accounting; 'timer' distinguishes timer-driven from io-driven unplugs */
771 static inline void __account_unplug(struct io_stats *ios, int timer)
774 ios->timer_unplugs++;
779 static inline void account_unplug(struct blk_io_trace *t,
780 struct per_cpu_info *pci, int timer)
782 __account_unplug(&pci->io_stats, timer);
784 if (per_process_stats) {
785 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
787 __account_unplug(ios, timer);
/*
 * Thin wrappers around process_fmt(): each passes the action string 'act',
 * an elapsed-time value where the event has one (from the log_track_*
 * helpers), -1/-1ULL where it does not, and a pdu buffer only for pc events.
 */
791 static void log_complete(struct per_cpu_info *pci, struct blk_io_trace *t,
794 process_fmt(act, pci, t, log_track_complete(t), 0, NULL);
797 static void log_insert(struct per_cpu_info *pci, struct blk_io_trace *t,
800 process_fmt(act, pci, t, log_track_insert(t), 0, NULL);
/* NOTE(review): passes plain -1 where siblings pass -1ULL -- likely equivalent, verify */
803 static void log_queue(struct per_cpu_info *pci, struct blk_io_trace *t,
806 process_fmt(act, pci, t, -1, 0, NULL);
809 static void log_issue(struct per_cpu_info *pci, struct blk_io_trace *t,
812 process_fmt(act, pci, t, log_track_issue(t), 0, NULL);
/* merge logging also rekeys the tracked io on front merges */
815 static void log_merge(struct per_cpu_info *pci, struct blk_io_trace *t,
819 log_track_frontmerge(t);
821 process_fmt(act, pci, t, -1ULL, 0, NULL);
824 static void log_action(struct per_cpu_info *pci, struct blk_io_trace *t,
827 process_fmt(act, pci, t, -1ULL, 0, NULL);
830 static void log_generic(struct per_cpu_info *pci, struct blk_io_trace *t,
833 process_fmt(act, pci, t, -1ULL, 0, NULL);
836 static void log_unplug(struct per_cpu_info *pci, struct blk_io_trace *t,
839 process_fmt(act, pci, t, -1ULL, 0, NULL);
842 static void log_split(struct per_cpu_info *pci, struct blk_io_trace *t,
845 process_fmt(act, pci, t, -1ULL, 0, NULL);
/* pc (passthrough) events carry a payload immediately after the trace header */
848 static void log_pc(struct per_cpu_info *pci, struct blk_io_trace *t, char *act)
850 unsigned char *buf = (unsigned char *) t + sizeof(*t);
852 process_fmt(act, pci, t, -1ULL, t->pdu_len, buf);
/* dispatch a pc (passthrough) trace to its logger by action code */
855 static void dump_trace_pc(struct blk_io_trace *t, struct per_cpu_info *pci)
857 int act = t->action & 0xffff;
861 log_generic(pci, t, "Q");
864 log_generic(pci, t, "G");
866 case __BLK_TA_SLEEPRQ:
867 log_generic(pci, t, "S");
869 case __BLK_TA_REQUEUE:
870 log_generic(pci, t, "R");
875 case __BLK_TA_COMPLETE:
878 case __BLK_TA_INSERT:
882 fprintf(stderr, "Bad pc action %x\n", act);
/* dispatch an fs trace: account it, then log with its single-letter action code */
887 static void dump_trace_fs(struct blk_io_trace *t, struct per_cpu_info *pci)
889 int w = t->action & BLK_TC_ACT(BLK_TC_WRITE);
890 int act = t->action & 0xffff;
894 account_queue(t, pci, w);
895 log_queue(pci, t, "Q");
897 case __BLK_TA_INSERT:
898 log_insert(pci, t, "I");
900 case __BLK_TA_BACKMERGE:
901 account_m(t, pci, w);
902 log_merge(pci, t, "M");
904 case __BLK_TA_FRONTMERGE:
905 account_m(t, pci, w);
906 log_merge(pci, t, "F");
910 log_generic(pci, t, "G");
912 case __BLK_TA_SLEEPRQ:
913 log_generic(pci, t, "S");
/* requeue: subtract the bytes previously counted as completed */
915 case __BLK_TA_REQUEUE:
916 account_c(t, pci, w, -t->bytes);
917 log_queue(pci, t, "R");
920 account_issue(t, pci, w);
921 log_issue(pci, t, "D");
923 case __BLK_TA_COMPLETE:
924 account_c(t, pci, w, t->bytes);
925 log_complete(pci, t, "C");
928 log_action(pci, t, "P");
930 case __BLK_TA_UNPLUG_IO:
931 account_unplug(t, pci, 0);
932 log_unplug(pci, t, "U");
934 case __BLK_TA_UNPLUG_TIMER:
935 account_unplug(t, pci, 1);
936 log_unplug(pci, t, "UT");
939 log_split(pci, t, "X");
941 case __BLK_TA_BOUNCE:
942 log_generic(pci, t, "B");
945 fprintf(stderr, "Bad fs action %x\n", t->action);
/* route a trace to the pc or fs dumper based on its category bits */
950 static void dump_trace(struct blk_io_trace *t, struct per_cpu_info *pci,
951 struct per_dev_info *pdi)
953 if (t->action & BLK_TC_ACT(BLK_TC_PC))
954 dump_trace_pc(t, pci);
956 dump_trace_fs(t, pci);
/* print one io_stats block under the heading 'msg' (%' needs the locale set in main) */
961 static void dump_io_stats(struct io_stats *ios, char *msg)
963 fprintf(ofp, "%s\n", msg);
965 fprintf(ofp, " Reads Queued: %'8lu, %'8LuKiB\t", ios->qreads, ios->qread_kb);
966 fprintf(ofp, " Writes Queued: %'8lu, %'8LuKiB\n", ios->qwrites,ios->qwrite_kb);
968 fprintf(ofp, " Read Dispatches: %'8lu, %'8LuKiB\t", ios->ireads, ios->iread_kb);
969 fprintf(ofp, " Write Dispatches: %'8lu, %'8LuKiB\n", ios->iwrites,ios->iwrite_kb);
970 fprintf(ofp, " Reads Completed: %'8lu, %'8LuKiB\t", ios->creads, ios->cread_kb);
971 fprintf(ofp, " Writes Completed: %'8lu, %'8LuKiB\n", ios->cwrites,ios->cwrite_kb);
972 fprintf(ofp, " Read Merges: %'8lu%8c\t", ios->mreads, ' ');
973 fprintf(ofp, " Write Merges: %'8lu\n", ios->mwrites);
974 fprintf(ofp, " IO unplugs: %'8lu%8c\t", ios->io_unplugs, ' ');
975 fprintf(ofp, " Timer unplugs: %'8lu\n", ios->timer_unplugs);
/*
 * Print a process's longest waits in microseconds (ns / 1000);
 * left column is [0] (reads), right column is [1] (writes).
 */
978 static void dump_wait_stats(struct per_process_info *ppi)
980 unsigned long rawait = ppi->longest_allocation_wait[0] / 1000;
981 unsigned long rdwait = ppi->longest_dispatch_wait[0] / 1000;
982 unsigned long rcwait = ppi->longest_completion_wait[0] / 1000;
983 unsigned long wawait = ppi->longest_allocation_wait[1] / 1000;
984 unsigned long wdwait = ppi->longest_dispatch_wait[1] / 1000;
985 unsigned long wcwait = ppi->longest_completion_wait[1] / 1000;
987 fprintf(ofp, " Allocation wait: %'8lu%8c\t", rawait, ' ');
988 fprintf(ofp, " Allocation wait: %'8lu\n", wawait);
989 fprintf(ofp, " Dispatch wait: %'8lu%8c\t", rdwait, ' ');
990 fprintf(ofp, " Dispatch wait: %'8lu\n", wdwait);
991 fprintf(ofp, " Completion wait: %'8lu%8c\t", rcwait, ' ');
992 fprintf(ofp, " Completion wait: %'8lu\n", wcwait);
/* qsort comparator: version-aware name compare, ties broken by pid */
995 static int ppi_name_compare(const void *p1, const void *p2)
997 struct per_process_info *ppi1 = *((struct per_process_info **) p1);
998 struct per_process_info *ppi2 = *((struct per_process_info **) p2);
1001 res = strverscmp(ppi1->name, ppi2->name);
/* NOTE(review): '>' yields 0/1, never negative -- tie ordering may be unstable; verify */
1003 res = ppi1->pid > ppi2->pid;
/*
 * Sort the global ppi list by copying it into a temporary pointer array,
 * qsorting, then relinking the list in sorted order (rebuilt back-to-front).
 */
1008 static void sort_process_list(void)
1010 struct per_process_info **ppis;
1011 struct per_process_info *ppi;
1014 ppis = malloc(ppi_list_entries * sizeof(struct per_process_info *));
1019 ppi = ppi->list_next;
1022 qsort(ppis, ppi_list_entries, sizeof(ppi), ppi_name_compare);
1024 i = ppi_list_entries - 1;
1029 ppi->list_next = ppi_list;
/* print io and wait stats for every process, sorted by name */
1037 static void show_process_stats(void)
1039 struct per_process_info *ppi;
1041 sort_process_list();
/* same name seen under multiple pids: mark the heading with ", ..." */
1047 if (ppi->more_than_one)
1048 sprintf(name, "%s (%u, ...)", ppi->name, ppi->pid);
1050 sprintf(name, "%s (%u)", ppi->name, ppi->pid);
1052 dump_io_stats(&ppi->io_stats, name);
1053 dump_wait_stats(ppi);
1054 ppi = ppi->list_next;
/*
 * For every device: print per-CPU io stats, accumulate them into 'total',
 * and print the device-wide total when more than one CPU contributed.
 */
1060 static void show_device_and_cpu_stats(void)
1062 struct per_dev_info *pdi;
1063 struct per_cpu_info *pci;
1064 struct io_stats total, *ios;
1065 int i, j, pci_events;
1066 char line[3 + 8/*cpu*/ + 2 + 32/*dev*/ + 3];
1069 for (pdi = devices, i = 0; i < ndevices; i++, pdi++) {
1071 memset(&total, 0, sizeof(total));
1077 for (pci = pdi->cpus, j = 0; j < pdi->ncpus; j++, pci++) {
/* fold this CPU's counters into the device total, field by field */
1081 ios = &pci->io_stats;
1082 total.qreads += ios->qreads;
1083 total.qwrites += ios->qwrites;
1084 total.creads += ios->creads;
1085 total.cwrites += ios->cwrites;
1086 total.mreads += ios->mreads;
1087 total.mwrites += ios->mwrites;
1088 total.ireads += ios->ireads;
1089 total.iwrites += ios->iwrites;
1090 total.qread_kb += ios->qread_kb;
1091 total.qwrite_kb += ios->qwrite_kb;
1092 total.cread_kb += ios->cread_kb;
1093 total.cwrite_kb += ios->cwrite_kb;
1094 total.iread_kb += ios->iread_kb;
1095 total.iwrite_kb += ios->iwrite_kb;
1096 total.timer_unplugs += ios->timer_unplugs;
1097 total.io_unplugs += ios->io_unplugs;
1099 snprintf(line, sizeof(line) - 1, "CPU%d (%s):",
1100 j, get_dev_name(pdi, name, sizeof(name)));
1101 dump_io_stats(ios, line);
/* only print a "Total" section when >1 CPU actually had events */
1105 if (pci_events > 1) {
1107 snprintf(line, sizeof(line) - 1, "Total (%s):",
1108 get_dev_name(pdi, name, sizeof(name)));
1109 dump_io_stats(&total, line);
1112 fprintf(ofp, "\nEvents (%s): %'Lu entries, %'lu skips\n",
1113 get_dev_name(pdi, line, sizeof(line)), pdi->events,
/*
 */
1119 * struct trace and blktrace allocation cache, we do potentially
1120 * millions of mallocs for these structures while only using at most
1121 * a few thousand at the time
/* return a trace to the free list instead of free()ing it */
1123 static inline void t_free(struct trace *t)
1125 t->next = t_alloc_list;
/* pop from the free list; fall back to malloc when empty */
1129 static inline struct trace *t_alloc(void)
1131 struct trace *t = t_alloc_list;
1134 t_alloc_list = t->next;
1138 return malloc(sizeof(*t));
/* same caching scheme for blk_io_trace, but it has no 'next' member: */
1141 static inline void bit_free(struct blk_io_trace *bit)
1144 * abuse a 64-bit field for a next pointer for the free item
1146 bit->time = (__u64) (unsigned long) bit_alloc_list;
1147 bit_alloc_list = (struct blk_io_trace *) bit;
1150 static inline struct blk_io_trace *bit_alloc(void)
1152 struct blk_io_trace *bit = bit_alloc_list;
/* recover the next-pointer previously stashed in bit->time */
1155 bit_alloc_list = (struct blk_io_trace *) (unsigned long) \
1160 return malloc(sizeof(*bit));
/* find the minimum timestamp over the pending trace_list */
1163 static void find_genesis(void)
1165 struct trace *t = trace_list;
1167 genesis_time = -1ULL;
1169 if (t->bit->time < genesis_time)
1170 genesis_time = t->bit->time;
/* is this (genesis-relative) event inside the [start, end) stopwatch window? */
1176 static inline int check_stopwatch(struct blk_io_trace *bit)
1178 if (bit->time < stopwatch_end &&
1179 bit->time >= stopwatch_start)
/*
 * Move events from trace_list into the sort tree, rebasing times to genesis,
 * dropping invalid or out-of-window events, and
 */
1186 * return youngest entry read
1188 static int sort_entries(unsigned long long *youngest)
1196 while ((t = trace_list) != NULL) {
1197 struct blk_io_trace *bit = t->bit;
1199 trace_list = t->next;
1201 if (verify_trace(bit))
1204 bit->time -= genesis_time;
1206 if (check_stopwatch(bit)) {
1212 if (trace_rb_insert(t))
1215 if (bit->time < *youngest)
1216 *youngest = bit->time;
/*
 * Drain the sort tree in time order, dumping each event. Unless 'force',
 * stop at last_allowed_time so out-of-order events from other CPUs can
 * still be merged in on a later pass.
 */
1222 static void show_entries_rb(int force)
1224 struct per_dev_info *pdi = NULL;
1225 struct per_cpu_info *pci = NULL;
1226 struct blk_io_trace *bit;
1230 while ((n = rb_first(&rb_sort_root)) != NULL) {
1235 t = rb_entry(n, struct trace, rb_node);
/* cache the device lookup across consecutive events on the same device */
1238 if (!pdi || pdi->id != bit->device)
1239 pdi = get_dev_info(bit->device);
1242 fprintf(stderr, "Unknown device ID? (%d,%d)\n",
1243 MAJOR(bit->device), MINOR(bit->device));
/* NOTE(review): '>' (not '>=') against ncpus -- confirm off-by-one in full file */
1247 if (bit->cpu > pdi->ncpus) {
1248 fprintf(stderr, "Unknown CPU ID? (%d, device %d,%d)\n",
1249 bit->cpu, MAJOR(bit->device),
1250 MINOR(bit->device));
/*
 */
1255 * back off displaying more info if we are out of sync
1256 * on SMP systems. to prevent stalling on lost events,
1257 * only allow an event to skip us a few times
1259 if (bit->sequence != (pdi->last_sequence + 1)) {
/*
 * check if
 */
1264 * the wanted sequence is really there,
1265 * continue because this means that the
1266 * log time is earlier on the trace we have
1269 __t = trace_rb_find(pdi->id,
1270 pdi->last_sequence + 1);
1274 fprintf(stderr, "(%d,%d): skipping %lu -> %u\n",
1275 MAJOR(pdi->id), MINOR(pdi->id),
1276 pdi->last_sequence, bit->sequence);
/* defer events newer than the current safe horizon (unless forced) */
1281 if (!force && bit->time > last_allowed_time)
1284 pdi->last_sequence = bit->sequence;
1286 check_time(pdi, bit);
1288 if (!pci || pci->cpu != bit->cpu)
1289 pci = get_cpu_info(pdi, bit->cpu);
1291 dump_trace(bit, pci, pdi);
1293 rb_erase(&t->rb_node, &rb_sort_root);
/*
 * Read exactly 'bytes' bytes from fd into buffer; 'block' toggles
 * O_NONBLOCK via fcntl for the duration of the read loop. EAGAIN is
 * retried, other errors abort (error paths fall in sampled-out lines).
 */
1300 static int read_data(int fd, void *buffer, int bytes, int block)
1302 int ret, bytes_left, fl;
1305 fl = fcntl(fd, F_GETFL);
1308 fcntl(fd, F_SETFL, fl | O_NONBLOCK);
1310 fcntl(fd, F_SETFL, fl & ~O_NONBLOCK);
1314 while (bytes_left > 0) {
1315 ret = read(fd, p, bytes_left);
1319 if (errno != EAGAIN)
/*
 * Ingest up to rb_batch events from fd onto trace_list: read the fixed
 * header, validate the magic, then read any trailing pdu payload.
 */
1332 static int read_events(int fd, int always_block)
1334 struct per_dev_info *pdi = NULL;
1337 while (!is_done() && events < rb_batch) {
1338 struct blk_io_trace *bit;
/* block only for the first event of a batch unless always_block is set */
1345 if (read_data(fd, bit, sizeof(*bit), !events || always_block))
/* trace data is big-endian on the wire; low byte of magic is a version */
1348 magic = be32_to_cpu(bit->magic);
1349 if ((magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
1350 fprintf(stderr, "Bad magic %x\n", magic);
1354 pdu_len = be16_to_cpu(bit->pdu_len);
/* grow the buffer so the variable-length pdu fits right after the header */
1356 void *ptr = realloc(bit, sizeof(*bit) + pdu_len);
1358 if (read_data(fd, ptr + sizeof(*bit), pdu_len, 1))
1365 memset(t, 0, sizeof(*t));
1370 t->next = trace_list;
1373 if (!pdi || pdi->id != bit->device)
1374 pdi = get_dev_info(bit->device);
/* track the newest timestamp read per device (sets the safe sort horizon) */
1376 if (bit->time > pdi->last_read_time)
1377 pdi->last_read_time = bit->time;
1385 static int do_file(void)
1387 struct per_cpu_info *pci;
1388 struct per_dev_info *pdi;
1389 int i, j, events, events_added;
1392 * first prepare all files for reading
1394 for (i = 0; i < ndevices; i++) {
1397 pdi->last_sequence = 0;
1402 pci = get_cpu_info(pdi, j);
1406 snprintf(pci->fname, sizeof(pci->fname)-1,
1407 "%s.blktrace.%d", pdi->name, pci->cpu);
1408 if (stat(pci->fname, &st) < 0)
1411 pci->fd = open(pci->fname, O_RDONLY);
1418 printf("Input file %s added\n", pci->fname);
1424 * now loop over the files reading in the data
1427 unsigned long long youngest;
1430 last_allowed_time = -1ULL;
1432 for (i = 0; i < ndevices; i++) {
1435 for (j = 0; j < pdi->nfiles; j++) {
1437 pci = get_cpu_info(pdi, j);
1442 events = read_events(pci->fd, 1);
1449 if (pdi->last_read_time < last_allowed_time)
1450 last_allowed_time = pdi->last_read_time;
1452 events_added += events;
1456 if (sort_entries(&youngest))
1459 if (youngest > stopwatch_end)
1464 } while (events_added);
1466 if (rb_sort_entries)
1472 static int do_stdin(void)
1474 unsigned long long youngest;
1477 last_allowed_time = -1ULL;
1478 fd = dup(STDIN_FILENO);
1482 events = read_events(fd, 0);
1486 if (sort_entries(&youngest))
1489 if (youngest > stopwatch_end)
1495 if (rb_sort_entries)
/* flush buffered output (body falls in sampled-out lines) */
1502 static void flush_output(void)
/* SIGINT/SIGHUP/SIGTERM handler; presumably sets 'done' -- body not visible */
1507 static void handle_sigint(int sig)
/*
 */
1514 * Extract start and duration times from a string, allowing
1515 * us to specify a time interval of interest within a trace.
1516 * Format: "duration" (start is zero) or "start:duration".
1518 static int find_stopwatch_interval(char *string)
1523 value = strtod(string, &sp)
1525 fprintf(stderr,"Invalid stopwatch timer: %s\n", string);
/* seconds (double) -> nanoseconds for the global window bounds */
1529 stopwatch_start = DOUBLE_TO_NANO_ULL(value);
/* second strtod parses the duration after the ':' */
1531 value = strtod(string, &sp);
1532 if (sp == string || *sp != '\0') {
1533 fprintf(stderr,"Invalid stopwatch duration time: %s\n",
1537 } else if (*sp != '\0') {
1538 fprintf(stderr,"Invalid stopwatch start timer: %s\n", string);
1541 stopwatch_end = DOUBLE_TO_NANO_ULL(value);
/* reject empty or inverted intervals */
1542 if (stopwatch_end <= stopwatch_start) {
1543 fprintf(stderr, "Invalid stopwatch interval: %Lu -> %Lu\n",
1544 stopwatch_start, stopwatch_end);
/* usage text printed by usage(); option letters must match S_OPTS/l_opts above */
1551 static char usage_str[] = \
1552 "[ -i <input name> ] [-o <output name> [ -s ] [ -t ] [ -q ]\n" \
1553 "[ -w start:stop ] [ -f output format ] [ -F format spec ] [ -v] \n\n" \
1554 "\t-i Input file containing trace data, or '-' for stdin\n" \
1555 "\t-o Output file. If not given, output is stdout\n" \
1556 "\t-b stdin read batching\n" \
1557 "\t-s Show per-program io statistics\n" \
1558 "\t-n Hash processes by name, not pid\n" \
1559 "\t-t Track individual ios. Will tell you the time a request took\n" \
1560 "\t to get queued, to get dispatched, and to get completed\n" \
1561 "\t-q Quiet. Don't display any stats at the end of the trace\n" \
1562 "\t-w Only parse data between the given time interval in seconds.\n" \
1563 "\t If 'start' isn't given, blkparse defaults the start time to 0\n" \
1564 "\t -f Output format. Customize the output format. The format field\n" \
1565 "\t identifies can be found in the documentation\n" \
1566 "\t-F Format specification. Can be found in the documentation\n" \
1567 "\t-v Print program version info\n\n";
/* print program name, version, and the option summary to stderr */
1569 static void usage(char *prog)
1571 fprintf(stderr, "Usage: %s %s %s", prog, blkparse_version, usage_str);
1574 int main(int argc, char *argv[])
1578 int per_device_and_cpu_stats = 1;
1580 while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) {
1583 if (!strcmp(optarg, "-") && !pipeline)
1585 else if (resize_devices(optarg) != 0)
1589 output_name = optarg;
1592 rb_batch = atoi(optarg);
1594 rb_batch = RB_BATCH_DEFAULT;
1597 per_process_stats = 1;
1603 per_device_and_cpu_stats = 0;
1606 if (find_stopwatch_interval(optarg) != 0)
1610 set_all_format_specs(optarg);
1613 if (add_format_spec(optarg) != 0)
1617 ppi_hash_by_pid = 0;
1620 printf("%s version %s\n", argv[0], blkparse_version);
1628 while (optind < argc) {
1629 if (!strcmp(argv[optind], "-") && !pipeline)
1631 else if (resize_devices(argv[optind]) != 0)
1636 if (!pipeline && !ndevices) {
1641 memset(&rb_sort_root, 0, sizeof(rb_sort_root));
1642 memset(&rb_track_root, 0, sizeof(rb_track_root));
1644 signal(SIGINT, handle_sigint);
1645 signal(SIGHUP, handle_sigint);
1646 signal(SIGTERM, handle_sigint);
1648 setlocale(LC_NUMERIC, "en_US");
1651 ofp = fdopen(STDOUT_FILENO, "w");
1656 snprintf(ofname, sizeof(ofname) - 1, "%s", output_name);
1657 ofp = fopen(ofname, "w");
1666 ofp_buffer = malloc(4096);
1667 if (setvbuf(ofp, ofp_buffer, mode, 4096)) {
1677 if (per_process_stats)
1678 show_process_stats();
1680 if (per_device_and_cpu_stats)
1681 show_device_and_cpu_stats();