[PATCH] blkparse: actually include blkparse diff
[blktrace.git] / blkparse.c
... / ...
CommitLineData
1/*
2 * block queue tracing parse application
3 *
4 * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *
20 */
21#include <sys/types.h>
22#include <sys/stat.h>
23#include <unistd.h>
24#include <stdio.h>
25#include <fcntl.h>
26#include <stdlib.h>
27#include <string.h>
28#include <getopt.h>
29#include <errno.h>
30#include <signal.h>
31#include <locale.h>
32#include <limits.h>
33
34#include "blktrace.h"
35#include "rbtree.h"
36
/*
 * Trace timestamps are in nanoseconds: split into whole seconds plus
 * leftover nanoseconds for display, and convert fractional seconds
 * (from the -w stopwatch option) into nanoseconds.
 */
#define SECONDS(x) ((unsigned long long)(x) / 1000000000)
#define NANO_SECONDS(x) ((unsigned long long)(x) % 1000000000)
#define DOUBLE_TO_NANO_ULL(d) ((unsigned long long)((d) * 1000000000))

/* dev_t decoding, mirroring the kernel's 20-bit minor layout */
#define MINORBITS 20
#define MINORMASK ((1U << MINORBITS) - 1)
#define MAJOR(dev) ((unsigned int) ((dev) >> MINORBITS))
#define MINOR(dev) ((unsigned int) ((dev) & MINORMASK))

/* NOTE: evaluates its arguments twice -- don't call with side effects */
#define min(a, b) ((a) < (b) ? (a) : (b))
/*
 * Aggregated event counters: q = queued, c = completed, m = merged,
 * i = issued (dispatched). The *_kb fields hold KiB moved by the
 * corresponding events.
 */
struct io_stats {
	unsigned long qreads, qwrites, creads, cwrites, mreads, mwrites;
	unsigned long ireads, iwrites;
	unsigned long long qread_kb, qwrite_kb, cread_kb, cwrite_kb;
	unsigned long long iread_kb, iwrite_kb;
	unsigned long io_unplugs, timer_unplugs;
};
55
/*
 * Per-CPU state for one traced device: the per-cpu trace file being
 * read (fd/fname), the number of events collected from it (nelems),
 * and its own statistics.
 */
struct per_cpu_info {
	int cpu;
	int nelems;

	int fd;
	char fname[128];

	struct io_stats io_stats;
};
65
/*
 * Per-device state. 'backwards' is set to 'B' when an event's time is
 * lower than the previously reported one (see check_time()). 'cpus' is
 * an array of 'ncpus' entries, grown on demand by resize_cpu_info().
 */
struct per_dev_info {
	dev_t id;
	char *name;

	int backwards;
	unsigned long long events;
	unsigned long long last_reported_time;
	struct io_stats io_stats;
	unsigned long last_sequence;

	int ncpus;
	struct per_cpu_info *cpus;
};
79
/*
 * Per-process statistics (-s option). Entries live both in a pid hash
 * (for lookup) and on a flat list (for dumping at exit).
 */
struct per_process_info {
	char name[16];
	__u32 pid;
	struct io_stats io_stats;
	struct per_process_info *hash_next, *list_next;

	/*
	 * individual io stats: longest waits seen so far, indexed
	 * [0] = read, [1] = write (filled in by the log_track_* helpers)
	 */
	unsigned long long longest_allocation_wait[2];
	unsigned long long longest_dispatch_wait[2];
	unsigned long long longest_completion_wait[2];
};
93
/* pid -> per_process_info hash (256 buckets), plus a list of all entries */
#define PPI_HASH_SHIFT (8)
static struct per_process_info *ppi_hash[1 << PPI_HASH_SHIFT];
static struct per_process_info *ppi_list;
97
#define S_OPTS "i:o:b:stqw:"

/*
 * Long-option table matching S_OPTS. The stats/track option names used
 * to contain spaces ("per program stats", "track ios"), which makes
 * them impossible to use as --long-options from a shell; they are now
 * hyphenated. has_arg uses the standard getopt constants.
 */
static struct option l_opts[] = {
	{
		.name = "input",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'i'
	},
	{
		.name = "output",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'o'
	},
	{
		.name = "batch",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'b'
	},
	{
		.name = "per-program-stats",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 's'
	},
	{
		.name = "track-ios",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 't'
	},
	{
		.name = "quiet",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 'q'
	},
	{
		.name = "stopwatch",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'w'
	},
	{
		.name = NULL,
		.has_arg = 0,
		.flag = NULL,
		.val = 0
	}
};
149
/*
 * for sorting the displayed output: raw events are collected on
 * trace_list, then inserted into rb_sort_root ordered by
 * (time, device, sequence) for display
 */
struct trace {
	struct blk_io_trace *bit;
	struct rb_node rb_node;
	struct trace *next;
	int skipped;	/* event was allowed to jump a sequence gap once */
};
159
static struct rb_root rb_sort_root;	/* pending events, sorted for output */
static struct rb_root rb_track_root;	/* in-flight ios, keyed (dev, sector) */

static struct trace *trace_list;	/* raw events not yet sorted */

/*
 * for tracking individual ios (-t option) through the
 * allocation -> queue -> dispatch -> completion lifecycle
 */
struct io_track {
	struct rb_node rb_node;

	dev_t device;
	__u64 sector;
	__u32 pid;
	unsigned long long allocation_time;
	unsigned long long queue_time;
	unsigned long long dispatch_time;
	unsigned long long completion_time;
};
179
static int ndevices;
static struct per_dev_info *devices;
static char *get_dev_name(struct per_dev_info *, char *, int);

static FILE *ofp;		/* output stream, stdout unless -o is given */
static char *output_name;

static unsigned long long genesis_time;	/* earliest timestamp seen */
static unsigned long long stopwatch_start;	/* start from zero by default */
/*
 * "infinity"; ULLONG_MAX is the portable C99 <limits.h> name
 * (ULONG_LONG_MAX is a glibc extension)
 */
static unsigned long long stopwatch_end = ULLONG_MAX;

static int per_process_stats;	/* -s: collect per-process stats */
static int track_ios;		/* -t: track individual io latency */

#define RB_BATCH_DEFAULT	(1024)
static int rb_batch = RB_BATCH_DEFAULT;	/* pipe mode sort batch size */

static int pipeline;

/* force a fresh read of 'done', which is set from the signal handler */
#define is_done()	(*(volatile int *)(&done))
static volatile int done;
201
/*
 * Multiplicative hash of a pid into PPI_HASH_SHIFT bits, using the
 * kernel's golden-ratio-prime constants. Relies on the glibc
 * __WORDSIZE macro to pick the 32- or 64-bit constant.
 */
static inline unsigned long hash_long(unsigned long val)
{
#if __WORDSIZE == 32
	val *= 0x9e370001UL;
#elif __WORDSIZE == 64
	val *= 0x9e37fffffffc0001UL;
#else
#error unknown word size
#endif

	return val >> (__WORDSIZE - PPI_HASH_SHIFT);
}
214
/* push ppi onto the head of its pid hash chain; no duplicate check */
static inline void add_process_to_hash(struct per_process_info *ppi)
{
	const int hash_idx = hash_long(ppi->pid);

	ppi->hash_next = ppi_hash[hash_idx];
	ppi_hash[hash_idx] = ppi;
}
222
/* prepend ppi to the flat list used when dumping the per-process stats */
static inline void add_process_to_list(struct per_process_info *ppi)
{
	ppi->list_next = ppi_list;
	ppi_list = ppi;
}
228
229static struct per_process_info *find_process_by_pid(__u32 pid)
230{
231 const int hash_idx = hash_long(pid);
232 struct per_process_info *ppi;
233
234 ppi = ppi_hash[hash_idx];
235 while (ppi) {
236 if (ppi->pid == pid)
237 return ppi;
238
239 ppi = ppi->hash_next;
240 }
241
242 return NULL;
243}
244
/*
 * Insert a trace into the sort tree, ordered by time, then device, then
 * sequence number. Also tracks the earliest timestamp seen, which
 * becomes genesis_time (time zero for display). Returns 1 without
 * inserting when an exact duplicate key is found, 0 on success.
 */
static inline int trace_rb_insert(struct trace *t)
{
	struct rb_node **p = &rb_sort_root.rb_node;
	struct rb_node *parent = NULL;
	struct trace *__t;

	if (genesis_time == 0 || t->bit->time < genesis_time)
		genesis_time = t->bit->time;

	while (*p) {
		parent = *p;
		__t = rb_entry(parent, struct trace, rb_node);

		if (t->bit->time < __t->bit->time)
			p = &(*p)->rb_left;
		else if (t->bit->time > __t->bit->time)
			p = &(*p)->rb_right;
		else if (t->bit->device < __t->bit->device)
			p = &(*p)->rb_left;
		else if (t->bit->device > __t->bit->device)
			p = &(*p)->rb_right;
		else if (t->bit->sequence < __t->bit->sequence)
			p = &(*p)->rb_left;
		else if (t->bit->sequence > __t->bit->sequence)
			p = &(*p)->rb_right;
		else if (t->bit->device == __t->bit->device) {
			/*
			 * time, device and sequence all equal: the device
			 * equality test here is always true at this point,
			 * so duplicates are reported and dropped
			 */
			fprintf(stderr,
				"sequence alias (%d) on device %d,%d!\n",
				t->bit->sequence,
				MAJOR(t->bit->device), MINOR(t->bit->device));
			return 1;
		}
	}

	rb_link_node(&t->rb_node, parent, p);
	rb_insert_color(&t->rb_node, &rb_sort_root);
	return 0;
}
283
/*
 * Insert an in-flight io into the tracking tree, keyed by
 * (device, sector). Returns 1 without inserting when that key is
 * already present (sector alias), 0 on success.
 */
static inline int track_rb_insert(struct io_track *iot)
{
	struct rb_node **p = &rb_track_root.rb_node;
	struct rb_node *parent = NULL;
	struct io_track *__iot;

	while (*p) {
		parent = *p;

		__iot = rb_entry(parent, struct io_track, rb_node);

		if (iot->device < __iot->device)
			p = &(*p)->rb_left;
		else if (iot->device > __iot->device)
			p = &(*p)->rb_right;
		else if (iot->sector < __iot->sector)
			p = &(*p)->rb_left;
		else if (iot->sector > __iot->sector)
			p = &(*p)->rb_right;
		else {
			fprintf(stderr,
				"sector alias (%llu) on device %d,%d!\n",
				iot->sector,
				MAJOR(iot->device), MINOR(iot->device));
			return 1;
		}
	}

	rb_link_node(&iot->rb_node, parent, p);
	rb_insert_color(&iot->rb_node, &rb_track_root);
	return 0;
}
316
317static struct io_track *__find_track(dev_t device, __u64 sector)
318{
319 struct rb_node **p = &rb_track_root.rb_node;
320 struct rb_node *parent = NULL;
321 struct io_track *__iot;
322
323 while (*p) {
324 parent = *p;
325
326 __iot = rb_entry(parent, struct io_track, rb_node);
327
328 if (device < __iot->device)
329 p = &(*p)->rb_left;
330 else if (device > __iot->device)
331 p = &(*p)->rb_right;
332 else if (sector < __iot->sector)
333 p = &(*p)->rb_left;
334 else if (sector > __iot->sector)
335 p = &(*p)->rb_right;
336 else
337 return __iot;
338 }
339
340 return NULL;
341}
342
343static struct io_track *find_track(__u32 pid, dev_t device, __u64 sector)
344{
345 struct io_track *iot;
346
347 iot = __find_track(device, sector);
348 if (!iot) {
349 iot = malloc(sizeof(*iot));
350 iot->pid = pid;
351 iot->device = device;
352 iot->sector = sector;
353 track_rb_insert(iot);
354 }
355
356 return iot;
357}
358
/*
 * A front merge moves the start of an existing request down to this
 * event's sector: re-key the tracked io from (sector + len) to sector.
 */
static void log_track_frontmerge(struct blk_io_trace *t)
{
	struct io_track *iot;

	if (!track_ios)
		return;

	/* the existing request begins right after this bio (bytes/512 sectors) */
	iot = __find_track(t->device, t->sector + (t->bytes >> 9));
	if (!iot) {
		fprintf(stderr, "failed to find mergeable event\n");
		return;
	}

	/* the sector is part of the tree key, so erase and re-insert */
	rb_erase(&iot->rb_node, &rb_track_root);
	iot->sector -= t->bytes >> 9;
	track_rb_insert(iot);
}
376
/* record the request-allocation timestamp for this io (starts tracking it) */
static void log_track_getrq(struct blk_io_trace *t)
{
	struct io_track *iot;

	if (!track_ios)
		return;

	iot = find_track(t->pid, t->device, t->sector);
	iot->allocation_time = t->time;
}
387
388
/*
 * return time between rq allocation and queue, in nanoseconds; -1 when
 * io tracking is disabled. With -s, also records the per-process
 * longest allocation wait for the read/write side.
 */
static unsigned long long log_track_queue(struct blk_io_trace *t)
{
	unsigned long long elapsed;
	struct io_track *iot;

	if (!track_ios)
		return -1;

	iot = find_track(t->pid, t->device, t->sector);
	iot->queue_time = t->time;
	elapsed = iot->queue_time - iot->allocation_time;

	if (per_process_stats) {
		struct per_process_info *ppi = find_process_by_pid(iot->pid);
		int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;

		if (ppi && elapsed > ppi->longest_allocation_wait[w])
			ppi->longest_allocation_wait[w] = elapsed;
	}

	return elapsed;
}
414
/*
 * return time between queue and issue (dispatch), in nanoseconds; -1
 * when tracking is off or the event is not an FS request. With -s,
 * also records the per-process longest dispatch wait.
 */
static unsigned long long log_track_issue(struct blk_io_trace *t)
{
	unsigned long long elapsed;
	struct io_track *iot;

	if (!track_ios)
		return -1;
	if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
		return -1;

	iot = __find_track(t->device, t->sector);
	if (!iot) {
		fprintf(stderr, "failed to find issue event\n");
		return -1;
	}

	iot->dispatch_time = t->time;
	elapsed = iot->dispatch_time - iot->queue_time;

	if (per_process_stats) {
		struct per_process_info *ppi = find_process_by_pid(iot->pid);
		int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;

		if (ppi && elapsed > ppi->longest_dispatch_wait[w])
			ppi->longest_dispatch_wait[w] = elapsed;
	}

	return elapsed;
}
447
/*
 * return time between dispatch and complete, in nanoseconds; -1 when
 * tracking is off or the event is not an FS request. Frees the io's
 * tracking entry, since its lifecycle ends here.
 */
static unsigned long long log_track_complete(struct blk_io_trace *t)
{
	unsigned long long elapsed;
	struct io_track *iot;

	if (!track_ios)
		return -1;
	if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
		return -1;

	iot = __find_track(t->device, t->sector);
	if (!iot) {
		fprintf(stderr, "failed to find complete event\n");
		return -1;
	}

	iot->completion_time = t->time;
	elapsed = iot->completion_time - iot->dispatch_time;

	if (per_process_stats) {
		struct per_process_info *ppi = find_process_by_pid(iot->pid);
		int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;

		if (ppi && elapsed > ppi->longest_completion_wait[w])
			ppi->longest_completion_wait[w] = elapsed;
	}

	/*
	 * kill the trace, we don't need it after completion
	 */
	rb_erase(&iot->rb_node, &rb_track_root);
	free(iot);

	return elapsed;
}
486
487
488static struct io_stats *find_process_io_stats(__u32 pid, char *name)
489{
490 struct per_process_info *ppi = find_process_by_pid(pid);
491
492 if (!ppi) {
493 ppi = malloc(sizeof(*ppi));
494 memset(ppi, 0, sizeof(*ppi));
495 strncpy(ppi->name, name, sizeof(ppi->name));
496 ppi->pid = pid;
497 add_process_to_hash(ppi);
498 add_process_to_list(ppi);
499 }
500
501 return &ppi->io_stats;
502}
503
504
/*
 * Grow pdi->cpus so that index 'cpu' is valid, zero-filling the newly
 * added tail entries. Exits the program on allocation failure (the old
 * array need not be preserved at that point).
 */
static void resize_cpu_info(struct per_dev_info *pdi, int cpu)
{
	struct per_cpu_info *cpus = pdi->cpus;
	int ncpus = pdi->ncpus;
	int new_count = cpu + 1;
	int new_space, size;
	char *new_start;

	size = new_count * sizeof(struct per_cpu_info);
	cpus = realloc(cpus, size);
	if (!cpus) {
		char name[20];
		fprintf(stderr, "Out of memory, CPU info for device %s (%d)\n",
			get_dev_name(pdi, name, sizeof(name)), size);
		exit(1);
	}

	/* zero only the entries beyond the old count */
	new_start = (char *)cpus + (ncpus * sizeof(struct per_cpu_info));
	new_space = (new_count - ncpus) * sizeof(struct per_cpu_info);
	memset(new_start, 0, new_space);

	pdi->ncpus = new_count;
	pdi->cpus = cpus;
}
529
530static struct per_cpu_info *get_cpu_info(struct per_dev_info *pdi, int cpu)
531{
532 struct per_cpu_info *pci;
533
534 if (cpu >= pdi->ncpus)
535 resize_cpu_info(pdi, cpu);
536
537 pci = &pdi->cpus[cpu];
538 pci->cpu = cpu;
539 return pci;
540}
541
542
543static int resize_devices(char *name)
544{
545 int size = (ndevices + 1) * sizeof(struct per_dev_info);
546
547 devices = realloc(devices, size);
548 if (!devices) {
549 fprintf(stderr, "Out of memory, device %s (%d)\n", name, size);
550 return 1;
551 }
552 memset(&devices[ndevices], 0, sizeof(struct per_dev_info));
553 devices[ndevices].name = name;
554 ndevices++;
555 return 0;
556}
557
558static struct per_dev_info *get_dev_info(dev_t id)
559{
560 struct per_dev_info *pdi;
561 int i;
562
563 for (i = 0; i < ndevices; i++)
564 if (devices[i].id == id)
565 return &devices[i];
566
567 if (resize_devices(NULL) != 0)
568 return NULL;
569
570 pdi = &devices[ndevices - 1];
571 pdi->id = id;
572 return pdi;
573}
574
575static char *get_dev_name(struct per_dev_info *pdi, char *buffer, int size)
576{
577 if (pdi->name)
578 snprintf(buffer, size, "%s", pdi->name);
579 else
580 snprintf(buffer, size, "%d,%d", MAJOR(pdi->id), MINOR(pdi->id));
581 return buffer;
582}
583
584static void check_time(struct per_dev_info *pdi, struct blk_io_trace *bit)
585{
586 unsigned long long this = bit->time;
587 unsigned long long last = pdi->last_reported_time;
588
589 pdi->backwards = (this < last) ? 'B' : ' ';
590 pdi->last_reported_time = this;
591}
592
/*
 * Account a merge event on 'ios'.
 * NOTE(review): the KiB totals are added to qread_kb/qwrite_kb (the
 * *queued* counters) -- struct io_stats has no merge-specific KiB
 * fields. Confirm this double count with the queue event is intended.
 */
static inline void __account_m(struct io_stats *ios, struct blk_io_trace *t,
			       int rw)
{
	if (rw) {
		ios->mwrites++;
		ios->qwrite_kb += t->bytes >> 10;
	} else {
		ios->mreads++;
		ios->qread_kb += t->bytes >> 10;
	}
}
604
605static inline void account_m(struct blk_io_trace *t, struct per_cpu_info *pci,
606 int rw)
607{
608 __account_m(&pci->io_stats, t, rw);
609
610 if (per_process_stats) {
611 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
612
613 __account_m(ios, t, rw);
614 }
615}
616
617static inline void __account_q(struct io_stats *ios, struct blk_io_trace *t,
618 int rw)
619{
620 if (rw) {
621 ios->qwrites++;
622 ios->qwrite_kb += t->bytes >> 10;
623 } else {
624 ios->qreads++;
625 ios->qread_kb += t->bytes >> 10;
626 }
627}
628
629static inline void account_q(struct blk_io_trace *t, struct per_cpu_info *pci,
630 int rw)
631{
632 __account_q(&pci->io_stats, t, rw);
633
634 if (per_process_stats) {
635 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
636
637 __account_q(ios, t, rw);
638 }
639}
640
641static inline void __account_c(struct io_stats *ios, int rw, unsigned int bytes)
642{
643 if (rw) {
644 ios->cwrites++;
645 ios->cwrite_kb += bytes >> 10;
646 } else {
647 ios->creads++;
648 ios->cread_kb += bytes >> 10;
649 }
650}
651
/*
 * Account a completion on the cpu stats and, with -s, the process
 * stats. 'bytes' may be negative: requeue events pass -t->bytes to
 * back the completion accounting out (see dump_trace_fs()).
 */
static inline void account_c(struct blk_io_trace *t, struct per_cpu_info *pci,
			     int rw, int bytes)
{
	__account_c(&pci->io_stats, rw, bytes);

	if (per_process_stats) {
		struct io_stats *ios = find_process_io_stats(t->pid, t->comm);

		__account_c(ios, rw, bytes);
	}
}
663
664static inline void __account_i(struct io_stats *ios, int rw, unsigned int bytes)
665{
666 if (rw) {
667 ios->iwrites++;
668 ios->iwrite_kb += bytes >> 10;
669 } else {
670 ios->ireads++;
671 ios->iread_kb += bytes >> 10;
672 }
673}
674
675static inline void account_i(struct blk_io_trace *t, struct per_cpu_info *pci,
676 int rw)
677{
678 __account_i(&pci->io_stats, rw, t->bytes);
679
680 if (per_process_stats) {
681 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
682
683 __account_i(ios, rw, t->bytes);
684 }
685}
686
687static inline void __account_unplug(struct io_stats *ios, int timer)
688{
689 if (timer)
690 ios->timer_unplugs++;
691 else
692 ios->io_unplugs++;
693}
694
695static inline void account_unplug(struct blk_io_trace *t,
696 struct per_cpu_info *pci, int timer)
697{
698 __account_unplug(&pci->io_stats, timer);
699
700 if (per_process_stats) {
701 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
702
703 __account_unplug(ios, timer);
704 }
705}
706
707static void output(struct per_cpu_info *pci, char *s)
708{
709 fprintf(ofp, "%s", s);
710}
711
/* scratch buffers for the event header and the full formatted line */
static char hstring[256];
static char tstring[256];
714
/*
 * Format the common "maj,min cpu sequence secs.nsecs pid act rwbs"
 * prefix into the static hstring buffer and return it. rwbs is R or W
 * plus optional B(arrier) and S(ync) flags.
 */
static inline char *setup_header(struct per_cpu_info *pci,
				 struct blk_io_trace *t, char *act)
{
	int w = t->action & BLK_TC_ACT(BLK_TC_WRITE);
	int b = t->action & BLK_TC_ACT(BLK_TC_BARRIER);
	int s = t->action & BLK_TC_ACT(BLK_TC_SYNC);
	char rwbs[4];	/* worst case "WBS" + NUL exactly fits */
	int i = 0;

	if (w)
		rwbs[i++] = 'W';
	else
		rwbs[i++] = 'R';
	if (b)
		rwbs[i++] = 'B';
	if (s)
		rwbs[i++] = 'S';

	rwbs[i] = '\0';

	sprintf(hstring, "%3d,%-3d %2d %8ld %5Lu.%09Lu %5u %2s %3s",
		MAJOR(t->device), MINOR(t->device), pci->cpu,
		(unsigned long)t->sequence, SECONDS(t->time),
		NANO_SECONDS(t->time), t->pid, act, rwbs);

	return hstring;
}
742
/*
 * Print a completion event. When io tracking is active, include the
 * dispatch-to-completion latency in microseconds.
 */
static void log_complete(struct per_cpu_info *pci, struct blk_io_trace *t,
			 char *act)
{
	unsigned long long elapsed = log_track_complete(t);

	if (elapsed != -1ULL) {
		unsigned long usec = elapsed / 1000;

		sprintf(tstring,"%s %Lu + %u (%8lu) [%d]\n",
			setup_header(pci, t, act),
			(unsigned long long)t->sector, t->bytes >> 9,
			usec, t->error);
	} else {
		sprintf(tstring,"%s %Lu + %u [%d]\n", setup_header(pci, t, act),
			(unsigned long long)t->sector, t->bytes >> 9, t->error);
	}

	output(pci, tstring);
}
762
/*
 * Print a queue (or requeue) event. When io tracking is active,
 * include the allocation-to-queue latency in microseconds.
 */
static void log_queue(struct per_cpu_info *pci, struct blk_io_trace *t,
		      char *act)
{
	unsigned long long elapsed = log_track_queue(t);

	if (elapsed != -1ULL) {
		unsigned long usec = elapsed / 1000;

		sprintf(tstring,"%s %Lu + %u (%8lu) [%s]\n",
			setup_header(pci, t, act),
			(unsigned long long)t->sector, t->bytes >> 9,
			usec, t->comm);
	} else {
		sprintf(tstring,"%s %Lu + %u [%s]\n", setup_header(pci, t, act),
			(unsigned long long)t->sector, t->bytes >> 9, t->comm);
	}
	output(pci, tstring);
}
781
/*
 * Print an issue (dispatch) event. When io tracking is active, include
 * the queue-to-dispatch latency in microseconds (fractional here,
 * unlike log_queue/log_complete which truncate).
 */
static void log_issue(struct per_cpu_info *pci, struct blk_io_trace *t,
		      char *act)
{
	unsigned long long elapsed = log_track_issue(t);

	if (elapsed != -1ULL) {
		double usec = (double) elapsed / 1000;

		sprintf(tstring,"%s %Lu + %u (%8.2f) [%s]\n",
			setup_header(pci, t, act),
			(unsigned long long)t->sector, t->bytes >> 9,
			usec, t->comm);
	} else {
		sprintf(tstring,"%s %Lu + %u [%s]\n", setup_header(pci, t, act),
			(unsigned long long)t->sector, t->bytes >> 9, t->comm);
	}

	output(pci, tstring);
}
801
/*
 * Print a merge event; front merges ('F') also re-key the tracked io
 * to its new starting sector.
 */
static void log_merge(struct per_cpu_info *pci, struct blk_io_trace *t,
		      char *act)
{
	if (act[0] == 'F')
		log_track_frontmerge(t);

	sprintf(tstring,"%s %Lu + %u [%s]\n", setup_header(pci, t, act),
		(unsigned long long)t->sector, t->bytes >> 9, t->comm);
	output(pci, tstring);
}
812
813static void log_action(struct per_cpu_info *pci, struct blk_io_trace *t,
814 char *act)
815{
816 sprintf(tstring,"%s [%s]\n", setup_header(pci, t, act), t->comm);
817 output(pci, tstring);
818}
819
820static void log_generic(struct per_cpu_info *pci, struct blk_io_trace *t,
821 char *act)
822{
823 sprintf(tstring,"%s %Lu + %u [%s]\n", setup_header(pci, t, act),
824 (unsigned long long)t->sector, t->bytes >> 9, t->comm);
825 output(pci, tstring);
826}
827
/*
 * Print an unplug event; the payload following the trace header is a
 * big-endian u64 queue depth.
 * NOTE(review): *depth dereferences the payload in place -- assumes
 * the buffer keeps the u64 suitably aligned; confirm for file input.
 */
static int log_unplug(struct per_cpu_info *pci, struct blk_io_trace *t,
		      char *act)
{
	__u64 *depth;
	int len;

	len = sprintf(tstring,"%s [%s] ", setup_header(pci, t, act), t->comm);
	depth = (__u64 *) ((char *) t + sizeof(*t));
	sprintf(tstring + len, "%u\n", (unsigned int) be64_to_cpu(*depth));
	output(pci, tstring);

	return 0;
}
841
/*
 * Print a pc (packet command / passthrough) event: header, a hex dump
 * of the pdu payload, then the error code (completions) or the command
 * name (all other actions). Always returns 0.
 */
static int log_pc(struct per_cpu_info *pci, struct blk_io_trace *t, char *act)
{
	unsigned char *buf;
	int i;

	sprintf(tstring,"%s ", setup_header(pci, t, act));
	output(pci, tstring);

	/* pdu bytes follow the fixed trace header */
	buf = (unsigned char *) t + sizeof(*t);
	for (i = 0; i < t->pdu_len; i++) {
		sprintf(tstring,"%02x ", buf[i]);
		output(pci, tstring);
	}

	if (act[0] == 'C') {
		sprintf(tstring,"[%d]\n", t->error);
		output(pci, tstring);
	} else {
		sprintf(tstring,"[%s]\n", t->comm);
		output(pci, tstring);
	}
	return 0;
}
865
866static int dump_trace_pc(struct blk_io_trace *t, struct per_cpu_info *pci)
867{
868 int ret = 0;
869
870 switch (t->action & 0xffff) {
871 case __BLK_TA_QUEUE:
872 log_generic(pci, t, "Q");
873 break;
874 case __BLK_TA_GETRQ:
875 log_generic(pci, t, "G");
876 break;
877 case __BLK_TA_SLEEPRQ:
878 log_generic(pci, t, "S");
879 break;
880 case __BLK_TA_REQUEUE:
881 log_generic(pci, t, "R");
882 break;
883 case __BLK_TA_ISSUE:
884 ret = log_pc(pci, t, "D");
885 break;
886 case __BLK_TA_COMPLETE:
887 log_pc(pci, t, "C");
888 break;
889 default:
890 fprintf(stderr, "Bad pc action %x\n", t->action);
891 ret = 1;
892 break;
893 }
894
895 return ret;
896}
897
/*
 * Decode and display a filesystem (non-pc) trace event, updating the
 * relevant counters. 'w' selects the read/write side of the stats.
 */
static void dump_trace_fs(struct blk_io_trace *t, struct per_cpu_info *pci)
{
	int w = t->action & BLK_TC_ACT(BLK_TC_WRITE);
	int act = t->action & 0xffff;

	switch (act) {
	case __BLK_TA_QUEUE:
		account_q(t, pci, w);
		log_queue(pci, t, "Q");
		break;
	case __BLK_TA_BACKMERGE:
		account_m(t, pci, w);
		log_merge(pci, t, "M");
		break;
	case __BLK_TA_FRONTMERGE:
		account_m(t, pci, w);
		log_merge(pci, t, "F");
		break;
	case __BLK_TA_GETRQ:
		log_track_getrq(t);
		log_generic(pci, t, "G");
		break;
	case __BLK_TA_SLEEPRQ:
		log_generic(pci, t, "S");
		break;
	case __BLK_TA_REQUEUE:
		/* negative byte count backs out the completion accounting */
		account_c(t, pci, w, -t->bytes);
		log_queue(pci, t, "R");
		break;
	case __BLK_TA_ISSUE:
		account_i(t, pci, w);
		log_issue(pci, t, "D");
		break;
	case __BLK_TA_COMPLETE:
		account_c(t, pci, w, t->bytes);
		log_complete(pci, t, "C");
		break;
	case __BLK_TA_PLUG:
		log_action(pci, t, "P");
		break;
	case __BLK_TA_UNPLUG_IO:
		account_unplug(t, pci, 0);
		log_unplug(pci, t, "U");
		break;
	case __BLK_TA_UNPLUG_TIMER:
		account_unplug(t, pci, 1);
		log_unplug(pci, t, "UT");
		break;
	default:
		fprintf(stderr, "Bad fs action %x\n", t->action);
		break;
	}
}
951
952static int dump_trace(struct blk_io_trace *t, struct per_cpu_info *pci,
953 struct per_dev_info *pdi)
954{
955 int ret = 0;
956
957 if (t->action & BLK_TC_ACT(BLK_TC_PC))
958 ret = dump_trace_pc(t, pci);
959 else
960 dump_trace_fs(t, pci);
961
962 pdi->events++;
963 return ret;
964}
965
/*
 * Print one io_stats block under the heading 'msg'. The %' flag groups
 * thousands according to the current locale (set up via setlocale()).
 */
static void dump_io_stats(struct io_stats *ios, char *msg)
{
	fprintf(ofp, "%s\n", msg);

	fprintf(ofp, " Reads Queued: %'8lu, %'8LuKiB\t", ios->qreads, ios->qread_kb);
	fprintf(ofp, " Writes Queued: %'8lu, %'8LuKiB\n", ios->qwrites,ios->qwrite_kb);

	fprintf(ofp, " Read Dispatches: %'8lu, %'8LuKiB\t", ios->ireads, ios->iread_kb);
	fprintf(ofp, " Write Dispatches: %'8lu, %'8LuKiB\n", ios->iwrites,ios->iwrite_kb);
	fprintf(ofp, " Reads Completed: %'8lu, %'8LuKiB\t", ios->creads, ios->cread_kb);
	fprintf(ofp, " Writes Completed: %'8lu, %'8LuKiB\n", ios->cwrites,ios->cwrite_kb);
	fprintf(ofp, " Read Merges: %'8lu%8c\t", ios->mreads, ' ');
	fprintf(ofp, " Write Merges: %'8lu\n", ios->mwrites);
	fprintf(ofp, " IO unplugs: %'8lu%8c\t", ios->io_unplugs, ' ');
	fprintf(ofp, " Timer unplugs: %'8lu\n", ios->timer_unplugs);
}
982
983static void dump_wait_stats(struct per_process_info *ppi)
984{
985 unsigned long rawait = ppi->longest_allocation_wait[0] / 1000;
986 unsigned long rdwait = ppi->longest_dispatch_wait[0] / 1000;
987 unsigned long rcwait = ppi->longest_completion_wait[0] / 1000;
988 unsigned long wawait = ppi->longest_allocation_wait[1] / 1000;
989 unsigned long wdwait = ppi->longest_dispatch_wait[1] / 1000;
990 unsigned long wcwait = ppi->longest_completion_wait[1] / 1000;
991
992 fprintf(ofp, " Allocation wait: %'8lu%8c\t", rawait, ' ');
993 fprintf(ofp, " Allocation wait: %'8lu\n", wawait);
994 fprintf(ofp, " Dispatch wait: %'8lu%8c\t", rdwait, ' ');
995 fprintf(ofp, " Dispatch wait: %'8lu\n", wdwait);
996 fprintf(ofp, " Completion wait: %'8lu%8c\t", rcwait, ' ');
997 fprintf(ofp, " Completion wait: %'8lu\n", wcwait);
998}
999
1000static void show_process_stats(void)
1001{
1002 struct per_process_info *ppi;
1003
1004 ppi = ppi_list;
1005 while (ppi) {
1006 dump_io_stats(&ppi->io_stats, ppi->name);
1007 dump_wait_stats(ppi);
1008 ppi = ppi->list_next;
1009 }
1010
1011 fprintf(ofp, "\n");
1012}
1013
/*
 * For every device: print each active CPU's statistics, a per-device
 * total when more than one CPU contributed, and the event count.
 */
static void show_device_and_cpu_stats(void)
{
	struct per_dev_info *pdi;
	struct per_cpu_info *pci;
	struct io_stats total, *ios;
	int i, j, pci_events;
	char line[3 + 8/*cpu*/ + 2 + 32/*dev*/ + 3];
	char name[32];

	for (pdi = devices, i = 0; i < ndevices; i++, pdi++) {

		memset(&total, 0, sizeof(total));
		pci_events = 0;

		if (i > 0)
			fprintf(ofp, "\n");

		for (pci = pdi->cpus, j = 0; j < pdi->ncpus; j++, pci++) {
			/* skip CPUs that contributed no events */
			if (!pci->nelems)
				continue;

			/* fold this CPU into the device-wide totals */
			ios = &pci->io_stats;
			total.qreads += ios->qreads;
			total.qwrites += ios->qwrites;
			total.creads += ios->creads;
			total.cwrites += ios->cwrites;
			total.mreads += ios->mreads;
			total.mwrites += ios->mwrites;
			total.ireads += ios->ireads;
			total.iwrites += ios->iwrites;
			total.qread_kb += ios->qread_kb;
			total.qwrite_kb += ios->qwrite_kb;
			total.cread_kb += ios->cread_kb;
			total.cwrite_kb += ios->cwrite_kb;
			total.iread_kb += ios->iread_kb;
			total.iwrite_kb += ios->iwrite_kb;
			total.timer_unplugs += ios->timer_unplugs;
			total.io_unplugs += ios->io_unplugs;

			snprintf(line, sizeof(line) - 1, "CPU%d (%s):",
				 j, get_dev_name(pdi, name, sizeof(name)));
			dump_io_stats(ios, line);
			pci_events++;
		}

		/* a total is only interesting with more than one CPU */
		if (pci_events > 1) {
			fprintf(ofp, "\n");
			snprintf(line, sizeof(line) - 1, "Total (%s):",
				 get_dev_name(pdi, name, sizeof(name)));
			dump_io_stats(&total, line);
		}

		fprintf(ofp, "Events (%s): %'Lu\n",
			get_dev_name(pdi, line, sizeof(line)), pdi->events);
	}
}
1070
1071static struct blk_io_trace *find_trace(void *p, unsigned long offset)
1072{
1073 unsigned long max_offset = offset;
1074 unsigned long off;
1075 struct blk_io_trace *bit;
1076 __u32 magic;
1077
1078 for (off = 0; off < max_offset; off++) {
1079 bit = p + off;
1080
1081 magic = be32_to_cpu(bit->magic);
1082 if ((magic & 0xffffff00) == BLK_IO_TRACE_MAGIC)
1083 return bit;
1084 }
1085
1086 return NULL;
1087}
1088
/*
 * Move everything collected on trace_list into the time-sorted rb tree,
 * converting each event to host endianness on the way. Returns the
 * number of entries sorted, or -1 on a duplicate key (sequence alias).
 * NOTE(review): the entry failing verify_trace() -- and anything still
 * on trace_list -- is leaked on the early exits.
 */
static int sort_entries(void)
{
	struct per_dev_info *pdi;
	struct per_cpu_info *pci;
	struct blk_io_trace *bit;
	struct trace *t;
	int nr = 0;

	while ((t = trace_list) != NULL) {

		trace_list = t->next;
		bit = t->bit;

		memset(&t->rb_node, 0, sizeof(t->rb_node));

		trace_to_cpu(bit);

		if (verify_trace(bit))
			break;
		if (trace_rb_insert(t))
			return -1;

		/* register device/cpu so the stats arrays exist later */
		pdi = get_dev_info(bit->device);
		pci = get_cpu_info(pdi, bit->cpu);
		pci->nelems++;

		nr++;
	}

	return nr;
}
1120
1121static void show_entries_rb(int kill_entries)
1122{
1123 struct per_dev_info *pdi;
1124 struct blk_io_trace *bit;
1125 struct rb_node *n;
1126 struct trace *t;
1127 int cpu;
1128
1129 while ((n = rb_first(&rb_sort_root)) != NULL) {
1130
1131 t = rb_entry(n, struct trace, rb_node);
1132 bit = t->bit;
1133
1134 pdi = get_dev_info(bit->device);
1135 if (!pdi) {
1136 fprintf(stderr, "Unknown device ID? (%d,%d)\n",
1137 MAJOR(bit->device), MINOR(bit->device));
1138 break;
1139 }
1140 cpu = bit->cpu;
1141 if (cpu > pdi->ncpus) {
1142 fprintf(stderr, "Unknown CPU ID? (%d, device %d,%d)\n",
1143 cpu, MAJOR(bit->device), MINOR(bit->device));
1144 break;
1145 }
1146
1147 /*
1148 * back off displaying more info if we are out of sync
1149 * on SMP systems. to prevent stalling on lost events,
1150 * only allow an event to skip us once
1151 */
1152 if (bit->sequence != (pdi->last_sequence + 1)) {
1153 if (!t->skipped) {
1154 t->skipped = 1;
1155 break;
1156 }
1157 }
1158
1159 pdi->last_sequence = bit->sequence;
1160
1161 bit->time -= genesis_time;
1162 if (bit->time < stopwatch_start)
1163 continue;
1164 if (bit->time >= stopwatch_end)
1165 break;
1166
1167 check_time(pdi, bit);
1168
1169 if (dump_trace(bit, &pdi->cpus[cpu], pdi))
1170 break;
1171
1172 rb_erase(&t->rb_node, &rb_sort_root);
1173
1174 if (kill_entries) {
1175 free(bit);
1176 free(t);
1177 }
1178 }
1179}
1180
/*
 * Read exactly 'bytes' bytes from fd into buffer. When 'block' is zero
 * the fd is switched to O_NONBLOCK first (used for the leading read of
 * a batch so an idle pipe doesn't stall us); otherwise O_NONBLOCK is
 * cleared. Returns 0 on success, 1 on EOF, -1 on error or EAGAIN.
 * NOTE(review): on EOF/error mid-buffer, any partially read data is
 * silently dropped -- callers only see the return code.
 */
static int read_data(int fd, void *buffer, int bytes, int block)
{
	int ret, bytes_left, fl;
	void *p;

	fl = fcntl(fd, F_GETFL);

	if (!block)
		fcntl(fd, F_SETFL, fl | O_NONBLOCK);
	else
		fcntl(fd, F_SETFL, fl & ~O_NONBLOCK);

	bytes_left = bytes;
	p = buffer;
	while (bytes_left > 0) {
		ret = read(fd, p, bytes_left);
		if (!ret)
			return 1;
		else if (ret < 0) {
			if (errno != EAGAIN)
				perror("read");
			return -1;
		} else {
			p += ret;
			bytes_left -= ret;
		}
	}

	return 0;
}
1211
1212/*
1213 * Find the traces in 'tb' and add them to the list for sorting and
1214 * displaying
1215 */
1216static int find_entries(void *tb, unsigned long size)
1217{
1218 struct blk_io_trace *bit;
1219 struct trace *t;
1220 void *start = tb;
1221
1222 while (tb - start <= size - sizeof(*bit)) {
1223 bit = find_trace(tb, size - (tb - start));
1224 if (!bit)
1225 break;
1226
1227 t = malloc(sizeof(*t));
1228 memset(t, 0, sizeof(*t));
1229 t->bit = bit;
1230
1231 t->next = trace_list;
1232 trace_list = t;
1233
1234 tb += sizeof(*bit) + bit->pdu_len;
1235 }
1236
1237 return 0;
1238}
1239
/*
 * Process the per-cpu trace files ("<name>_out.<cpu>") for every device
 * named on the command line: slurp each file whole, collect and sort
 * its events, then display everything. Returns 1 if no files were
 * found, 0 otherwise.
 * NOTE(review): 'tb' is deliberately never freed on the success path --
 * the parsed traces point into the buffer and outlive this loop.
 */
static int do_file(void)
{
	int i, j, nfiles = 0, nelems;

	for (i = 0; i < ndevices; i++) {
		for (j = 0;; j++, nfiles++) {
			struct per_dev_info *pdi;
			struct per_cpu_info *pci;
			struct stat st;
			void *tb;

			pdi = &devices[i];
			pci = get_cpu_info(pdi, j);
			pci->cpu = j;

			snprintf(pci->fname, sizeof(pci->fname)-1,
				 "%s_out.%d", pdi->name, j);
			/* a missing file marks the end of this device's CPUs */
			if (stat(pci->fname, &st) < 0)
				break;
			if (!st.st_size)
				continue;

			printf("Processing %s\n", pci->fname);

			tb = malloc(st.st_size);
			if (!tb) {
				fprintf(stderr, "Out of memory, skip file %s\n",
					pci->fname);
				continue;
			}

			pci->fd = open(pci->fname, O_RDONLY);
			if (pci->fd < 0) {
				perror(pci->fname);
				free(tb);
				continue;
			}

			if (read_data(pci->fd, tb, st.st_size, 1)) {
				close(pci->fd);
				free(tb);
				continue;
			}

			if (find_entries(tb, st.st_size)) {
				close(pci->fd);
				free(tb);
				/*
				 * NOTE(review): no 'continue' here, so on
				 * failure we'd fall through with tb freed;
				 * unreachable today since find_entries()
				 * always returns 0 -- verify if it changes
				 */
			}

			nelems = sort_entries();
			if (nelems == -1) {
				close(pci->fd);
				free(tb);
				continue;
			}

			printf("Completed %s (CPU%d %d, entries)\n",
				pci->fname, j, nelems);
			close(pci->fd);
		}
	}

	if (!nfiles) {
		fprintf(stderr, "No files found\n");
		return 1;
	}

	show_entries_rb(0);
	return 0;
}
1310
1311static int read_sort_events(int fd)
1312{
1313 int events = 0;
1314
1315 do {
1316 struct blk_io_trace *bit;
1317 struct trace *t;
1318 int pdu_len;
1319 __u32 magic;
1320
1321 bit = malloc(sizeof(*bit));
1322
1323 if (read_data(fd, bit, sizeof(*bit), !events))
1324 break;
1325
1326 magic = be32_to_cpu(bit->magic);
1327 if ((magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
1328 fprintf(stderr, "Bad magic %x\n", magic);
1329 break;
1330 }
1331
1332 pdu_len = be16_to_cpu(bit->pdu_len);
1333 if (pdu_len) {
1334 void *ptr = realloc(bit, sizeof(*bit) + pdu_len);
1335
1336 if (read_data(fd, ptr + sizeof(*bit), pdu_len, 1))
1337 break;
1338
1339 bit = ptr;
1340 }
1341
1342 t = malloc(sizeof(*t));
1343 memset(t, 0, sizeof(*t));
1344 t->bit = bit;
1345 t->next = trace_list;
1346 trace_list = t;
1347
1348 events++;
1349 } while (!is_done() && events < rb_batch);
1350
1351 return events;
1352}
1353
/*
 * Pipeline (stdin) mode: repeatedly read a batch of events, sort them,
 * and display incrementally until input is exhausted or a sort fails.
 *
 * Returns 0 on normal termination, 1 if stdin could not be duplicated.
 */
static int do_stdin(void)
{
	int fd;

	fd = dup(STDIN_FILENO);
	/* Bug fix: dup() failure was previously unchecked */
	if (fd < 0) {
		perror("dup");
		return 1;
	}

	do {
		int events;

		events = read_sort_events(fd);
		if (!events)
			break;

		if (sort_entries() == -1)
			break;

		/* '1' = incremental display: only show what is safe so far */
		show_entries_rb(1);
	} while (1);

	close(fd);
	return 0;
}
1375
/*
 * Flush any data buffered on the global output stream 'ofp' so nothing
 * is lost on exit (ofp uses a setvbuf() buffer installed in main()).
 */
static void flush_output(void)
{
	fflush(ofp);
}
1380
1381static void handle_sigint(int sig)
1382{
1383 done = 1;
1384 flush_output();
1385}
1386
1387/*
1388 * Extract start and duration times from a string, allowing
1389 * us to specify a time interval of interest within a trace.
1390 * Format: "duration" (start is zero) or "start:duration".
1391 */
1392static int find_stopwatch_interval(char *string)
1393{
1394 double value;
1395 char *sp;
1396
1397 value = strtod(string, &sp);
1398 if (sp == string) {
1399 fprintf(stderr,"Invalid stopwatch timer: %s\n", string);
1400 return 1;
1401 }
1402 if (*sp == ':') {
1403 stopwatch_start = DOUBLE_TO_NANO_ULL(value);
1404 string = sp + 1;
1405 value = strtod(string, &sp);
1406 if (sp == string || *sp != '\0') {
1407 fprintf(stderr,"Invalid stopwatch duration time: %s\n",
1408 string);
1409 return 1;
1410 }
1411 } else if (*sp != '\0') {
1412 fprintf(stderr,"Invalid stopwatch start timer: %s\n", string);
1413 return 1;
1414 }
1415 stopwatch_end = stopwatch_start + DOUBLE_TO_NANO_ULL(value);
1416 return 0;
1417}
1418
/*
 * Print a one-line command synopsis to stderr.
 *
 * Bug fix: the text omitted -b (batch size), -t (track ios) and -q
 * (quiet per-device/cpu stats), all of which main() accepts.
 */
static void usage(char *prog)
{
	fprintf(stderr, "Usage: %s "
		"[-i <name>] [-o <output>] [-b batch] [-s] [-t] [-q] "
		"[-w N[:n]] <name>...\n",
		prog);
}
1425
/*
 * Entry point: parse options and device names, install signal handlers,
 * set up the buffered output stream, then run either pipeline (stdin)
 * mode or per-file mode, printing optional statistics at the end.
 */
int main(int argc, char *argv[])
{
	char *ofp_buffer;
	int c, ret, mode;
	int per_device_and_cpu_stats = 1;	/* cleared by -q */

	/* S_OPTS / l_opts are defined elsewhere in this file */
	while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) {
		switch (c) {
		case 'i':
			/* "-i -" selects pipeline (stdin) input */
			if (!strcmp(optarg, "-") && !pipeline)
				pipeline = 1;
			else if (resize_devices(optarg) != 0)
				return 1;
			break;
		case 'o':
			output_name = optarg;
			break;
		case 'b':
			/* batch size for pipeline-mode sorting */
			rb_batch = atoi(optarg);
			if (rb_batch <= 0)
				rb_batch = RB_BATCH_DEFAULT;
			break;
		case 's':
			per_process_stats = 1;
			break;
		case 't':
			track_ios = 1;
			break;
		case 'q':
			per_device_and_cpu_stats = 0;
			break;
		case 'w':
			/* stopwatch interval: "duration" or "start:duration" */
			if (find_stopwatch_interval(optarg) != 0)
				return 1;
			break;
		default:
			usage(argv[0]);
			return 1;
		}
	}

	/* Remaining non-option arguments are device names (or "-") */
	while (optind < argc) {
		if (!strcmp(argv[optind], "-") && !pipeline)
			pipeline = 1;
		else if (resize_devices(argv[optind]) != 0)
			return 1;
		optind++;
	}

	if (!pipeline && !ndevices) {
		usage(argv[0]);
		return 1;
	}

	memset(&rb_sort_root, 0, sizeof(rb_sort_root));
	memset(&rb_track_root, 0, sizeof(rb_track_root));

	signal(SIGINT, handle_sigint);
	signal(SIGHUP, handle_sigint);
	signal(SIGTERM, handle_sigint);

	/* so printf %'d style grouping works in the stats output */
	setlocale(LC_NUMERIC, "en_US");

	if (!output_name) {
		/* stdout: line-buffered so piped output appears promptly */
		ofp = fdopen(STDOUT_FILENO, "w");
		mode = _IOLBF;
	} else {
		char ofname[128];

		snprintf(ofname, sizeof(ofname) - 1, "%s.log", output_name);
		ofp = fopen(ofname, "w");
		mode = _IOFBF;
	}

	if (!ofp) {
		perror("fopen");
		return 1;
	}

	/*
	 * malloc result deliberately not checked: setvbuf() with a NULL
	 * buffer is valid and makes stdio allocate its own buffer.
	 */
	ofp_buffer = malloc(4096);
	if (setvbuf(ofp, ofp_buffer, mode, 4096)) {
		perror("setvbuf");
		return 1;
	}

	if (pipeline)
		ret = do_stdin();
	else
		ret = do_file();

	if (per_process_stats)
		show_process_stats();

	if (per_device_and_cpu_stats)
		show_device_and_cpu_stats();

	flush_output();
	return ret;
}