[PATCH] blkparse: make the rb track root per-device
[blktrace.git] / blkparse.c
1 /*
2  * block queue tracing parse application
3  *
4  * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
5  *
6  *  This program is free software; you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License as published by
8  *  the Free Software Foundation; either version 2 of the License, or
9  *  (at your option) any later version.
10  *
11  *  This program is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *  GNU General Public License for more details.
15  *
16  *  You should have received a copy of the GNU General Public License
17  *  along with this program; if not, write to the Free Software
18  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  *
20  */
21 #include <sys/types.h>
22 #include <sys/stat.h>
23 #include <unistd.h>
24 #include <stdio.h>
25 #include <fcntl.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <getopt.h>
29 #include <errno.h>
30 #include <signal.h>
31 #include <locale.h>
32 #include <limits.h>
33
34 #include "blktrace.h"
35 #include "rbtree.h"
36 #include "jhash.h"
37
38 static char blkparse_version[] = "0.90";
39
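/*
 * per-device state: io statistics, sequence bookkeeping, a small cache of
 * recently displayed traces (rb_last), and the io tracking tree (rb_track),
 * which is kept per device so that sector lookups from different devices
 * cannot collide
 */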
40 struct per_dev_info {
41         dev_t dev;
42         char *name;
43
44         int backwards;
45         unsigned long long events;
46         unsigned long long last_reported_time;
47         unsigned long long last_read_time;
48         struct io_stats io_stats;
49         unsigned long last_sequence;
50         unsigned long skips;
51
52         struct rb_root rb_last;
53         unsigned long rb_last_entries;
54
55         struct rb_root rb_track;
56
57         int nfiles;
58         int ncpus;
59         struct per_cpu_info *cpus;
60 };
61
62 struct per_process_info {
63         char name[16];
64         __u32 pid;
65         struct io_stats io_stats;
66         struct per_process_info *hash_next, *list_next;
67         int more_than_one;
68
69         /*
70          * individual io stats
71          */
72         unsigned long long longest_allocation_wait[2];
73         unsigned long long longest_dispatch_wait[2];
74         unsigned long long longest_completion_wait[2];
75 };
76
77 #define PPI_HASH_SHIFT  (8)
78 #define PPI_HASH_SIZE   (1 << PPI_HASH_SHIFT)
79 #define PPI_HASH_MASK   (PPI_HASH_SIZE - 1)
80 static struct per_process_info *ppi_hash_table[PPI_HASH_SIZE];
81 static struct per_process_info *ppi_list;
82 static int ppi_list_entries;
83
84 #define S_OPTS  "i:o:b:stqw:f:F:vn"
85 static struct option l_opts[] = {
86         {
87                 .name = "input",
88                 .has_arg = required_argument,
89                 .flag = NULL,
90                 .val = 'i'
91         },
92         {
93                 .name = "output",
94                 .has_arg = required_argument,
95                 .flag = NULL,
96                 .val = 'o'
97         },
98         {
99                 .name = "batch",
100                 .has_arg = required_argument,
101                 .flag = NULL,
102                 .val = 'b'
103         },
104         {
105                 .name = "per-program-stats",
106                 .has_arg = no_argument,
107                 .flag = NULL,
108                 .val = 's'
109         },
110         {
111                 .name = "track-ios",
112                 .has_arg = no_argument,
113                 .flag = NULL,
114                 .val = 't'
115         },
116         {
117                 .name = "quiet",
118                 .has_arg = no_argument,
119                 .flag = NULL,
120                 .val = 'q'
121         },
122         {
123                 .name = "stopwatch",
124                 .has_arg = required_argument,
125                 .flag = NULL,
126                 .val = 'w'
127         },
128         {
129                 .name = "format",
130                 .has_arg = required_argument,
131                 .flag = NULL,
132                 .val = 'f'
133         },
134         {
135                 .name = "format-spec",
136                 .has_arg = required_argument,
137                 .flag = NULL,
138                 .val = 'F'
139         },
140         {
141                 .name = "hash-by-name",
142                 .has_arg = no_argument,
143                 .flag = NULL,
144                 .val = 'n'
145         },
146         {
147                 .name = "version",
148                 .has_arg = no_argument,
149                 .flag = NULL,
150                 .val = 'v'
151         },
152 };
153
154 /*
155  * for sorting the displayed output
156  */
157 struct trace {
158         struct blk_io_trace *bit;
159         struct rb_node rb_node;
160         struct trace *next;
161 };
162
163 static struct rb_root rb_sort_root;
164 static unsigned long rb_sort_entries;
165
166 static struct trace *trace_list;
167
168 /*
169  * allocation cache
170  */
171 static struct blk_io_trace *bit_alloc_list;
172 static struct trace *t_alloc_list;
173
174 /*
175  * for tracking individual ios
176  */
177 struct io_track {
178         struct rb_node rb_node;
179
180         __u64 sector;
181         __u32 pid;
182         char comm[16];
183         unsigned long long allocation_time;
184         unsigned long long queue_time;
185         unsigned long long dispatch_time;
186         unsigned long long completion_time;
187 };
188
189 static int ndevices;
190 static struct per_dev_info *devices;
191 static char *get_dev_name(struct per_dev_info *, char *, int);
192
193 FILE *ofp = NULL;
194 static char *output_name;
195
196 static unsigned long long genesis_time;
197 static unsigned long long last_allowed_time;
198 static unsigned long long stopwatch_start;      /* start from zero by default */
199 static unsigned long long stopwatch_end = ULONG_LONG_MAX;       /* "infinity" */
200
201 static int per_process_stats;
202 static int track_ios;
203 static int ppi_hash_by_pid = 1;
204
205 #define RB_BATCH_DEFAULT        (512)
206 static int rb_batch = RB_BATCH_DEFAULT;
207
208 static int pipeline;
209
210 #define is_done()       (*(volatile int *)(&done))
211 static volatile int done;
212
213 #define JHASH_RANDOM    (0x3af5f2ee)
214
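/*
 * processes are hashed either by pid (the default) or by name (-n),
 * using jhash over the respective key
 */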
215 static inline int ppi_hash_pid(__u32 pid)
216 {
217         return jhash_1word(pid, JHASH_RANDOM) & PPI_HASH_MASK;
218 }
219
220 static inline int ppi_hash_name(const char *name)
221 {
222         return jhash(name, 16, JHASH_RANDOM) & PPI_HASH_MASK;
223 }
224
225 static inline int ppi_hash(struct per_process_info *ppi)
226 {
227         if (ppi_hash_by_pid)
228                 return ppi_hash_pid(ppi->pid);
229
230         return ppi_hash_name(ppi->name);
231 }
232
233 static inline void add_process_to_hash(struct per_process_info *ppi)
234 {
235         const int hash_idx = ppi_hash(ppi);
236
237         ppi->hash_next = ppi_hash_table[hash_idx];
238         ppi_hash_table[hash_idx] = ppi;
239 }
240
241 static inline void add_process_to_list(struct per_process_info *ppi)
242 {
243         ppi->list_next = ppi_list;
244         ppi_list = ppi;
245         ppi_list_entries++;
246 }
247
248 static struct per_process_info *find_process_by_name(char *name)
249 {
250         const int hash_idx = ppi_hash_name(name);
251         struct per_process_info *ppi;
252
253         ppi = ppi_hash_table[hash_idx];
254         while (ppi) {
255                 if (!strcmp(ppi->name, name))
256                         return ppi;
257
258                 ppi = ppi->hash_next;
259         }
260
261         return NULL;
262 }
263
264 static struct per_process_info *find_process_by_pid(__u32 pid)
265 {
266         const int hash_idx = ppi_hash_pid(pid);
267         struct per_process_info *ppi;
268
269         ppi = ppi_hash_table[hash_idx];
270         while (ppi) {
271                 if (ppi->pid == pid)
272                         return ppi;
273
274                 ppi = ppi->hash_next;
275         }
276
277         return NULL;
278 }
279
280 static struct per_process_info *find_process(__u32 pid, char *name)
281 {
282         struct per_process_info *ppi;
283
284         if (ppi_hash_by_pid)
285                 return find_process_by_pid(pid);
286
287         ppi = find_process_by_name(name);
288         if (ppi && ppi->pid != pid)
289                 ppi->more_than_one = 1;
290
291         return ppi;
292 }
293
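/*
 * insert a trace into an rb tree ordered by time, then device, then
 * sequence. a full match on all three keys means the sequence number
 * has been seen before and is reported as an alias.
 */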
294 static inline int trace_rb_insert(struct trace *t, struct rb_root *root)
295 {
296         struct rb_node **p = &root->rb_node;
297         struct rb_node *parent = NULL;
298         struct trace *__t;
299
300         while (*p) {
301                 parent = *p;
302
303                 __t = rb_entry(parent, struct trace, rb_node);
304
305                 if (t->bit->time < __t->bit->time)
306                         p = &(*p)->rb_left;
307                 else if (t->bit->time > __t->bit->time)
308                         p = &(*p)->rb_right;
309                 else if (t->bit->device < __t->bit->device)
310                         p = &(*p)->rb_left;
311                 else if (t->bit->device > __t->bit->device)
312                         p = &(*p)->rb_right;
313                 else if (t->bit->sequence < __t->bit->sequence)
314                         p = &(*p)->rb_left;
315                 else if (t->bit->sequence > __t->bit->sequence)
316                         p = &(*p)->rb_right;
317                 else {
318                         fprintf(stderr,
319                                 "sequence alias (%d) on device %d,%d!\n",
320                                 t->bit->sequence,
321                                 MAJOR(t->bit->device), MINOR(t->bit->device));
322                         return 1;
323                 }
324         }
325
326         rb_link_node(&t->rb_node, parent, p);
327         rb_insert_color(&t->rb_node, root);
328         return 0;
329 }
330
331 static inline int trace_rb_insert_sort(struct trace *t)
332 {
333         if (!trace_rb_insert(t, &rb_sort_root)) {
334                 rb_sort_entries++;
335                 return 0;
336         }
337
338         return 1;
339 }
340
341 static inline int trace_rb_insert_last(struct per_dev_info *pdi,struct trace *t)
342 {
343         if (!trace_rb_insert(t, &pdi->rb_last)) {
344                 pdi->rb_last_entries++;
345                 return 0;
346         }
347
348         return 1;
349 }
350
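/*
 * look up a trace by (device, sequence). the trees are primarily time
 * ordered, so 'order' allows browsing a few entries past the closest
 * match to compensate.
 */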
351 static struct trace *trace_rb_find(dev_t device, unsigned long sequence,
352                                    struct rb_root *root, int order)
353 {
354         struct rb_node *n = root->rb_node;
355         struct rb_node *prev = NULL;
356         struct trace *__t;
357
358         while (n) {
359                 __t = rb_entry(n, struct trace, rb_node);
360                 prev = n;
361
362                 if (device < __t->bit->device)
363                         n = n->rb_left;
364                 else if (device > __t->bit->device)
365                         n = n->rb_right;
366                 else if (sequence < __t->bit->sequence)
367                         n = n->rb_left;
368                 else if (sequence > __t->bit->sequence)
369                         n = n->rb_right;
370                 else
371                         return __t;
372         }
373
374         /*
375          * hack - the list may not be sequence ordered because some
376          * events don't have sequence and time matched. so we end up
377          * being a little off in the rb lookup here, because we don't
378          * know the time we are looking for. compensate by browsing
379          * a little ahead from the last entry to find the match
380          */
381         if (order && prev) {
382                 int max = 5;
383
384                 while (((n = rb_next(prev)) != NULL) && max--) {
385                         __t = rb_entry(n, struct trace, rb_node);
386                         
387                         if (__t->bit->device == device &&
388                             __t->bit->sequence == sequence)
389                                 return __t;
390
391                         prev = n;
392                 }
393         }
394                         
395         return NULL;
396 }
397
398 static inline struct trace *trace_rb_find_sort(dev_t dev, unsigned long seq)
399 {
400         return trace_rb_find(dev, seq, &rb_sort_root, 1);
401 }
402
403 static inline struct trace *trace_rb_find_last(struct per_dev_info *pdi,
404                                                unsigned long seq)
405 {
406         return trace_rb_find(pdi->dev, seq, &pdi->rb_last, 0);
407 }
408
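/*
 * the io tracking tree is kept per device and keyed by sector; two
 * entries with the same sector on one device are reported as an alias
 */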
409 static inline int track_rb_insert(struct per_dev_info *pdi,struct io_track *iot)
410 {
411         struct rb_node **p = &pdi->rb_track.rb_node;
412         struct rb_node *parent = NULL;
413         struct io_track *__iot;
414
415         while (*p) {
416                 parent = *p;
417                 __iot = rb_entry(parent, struct io_track, rb_node);
418
419                 if (iot->sector < __iot->sector)
420                         p = &(*p)->rb_left;
421                 else if (iot->sector > __iot->sector)
422                         p = &(*p)->rb_right;
423                 else {
424                         fprintf(stderr,
425                                 "sector alias (%Lu) on device %d,%d!\n",
426                                 (unsigned long long) iot->sector,
427                                 MAJOR(pdi->dev), MINOR(pdi->dev));
428                         return 1;
429                 }
430         }
431
432         rb_link_node(&iot->rb_node, parent, p);
433         rb_insert_color(&iot->rb_node, &pdi->rb_track);
434         return 0;
435 }
436
437 static struct io_track *__find_track(struct per_dev_info *pdi, __u64 sector)
438 {
439         struct rb_node *n = pdi->rb_track.rb_node;
440         struct io_track *__iot;
441
442         while (n) {
443                 __iot = rb_entry(n, struct io_track, rb_node);
444
445                 if (sector < __iot->sector)
446                         n = n->rb_left;
447                 else if (sector > __iot->sector)
448                         n = n->rb_right;
449                 else
450                         return __iot;
451         }
452
453         return NULL;
454 }
455
456 static struct io_track *find_track(struct per_dev_info *pdi, __u32 pid,
457                                    char *comm, __u64 sector)
458 {
459         struct io_track *iot;
460
461         iot = __find_track(pdi, sector);
462         if (!iot) {
463                 iot = calloc(1, sizeof(*iot));
464                 iot->pid = pid;
465                 memcpy(iot->comm, comm, sizeof(iot->comm));
466                 iot->sector = sector;
467                 track_rb_insert(pdi, iot);
468         }
469
470         return iot;
471 }
472
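/*
 * a front merge moves the start sector of the tracked io backwards, so
 * re-key the entry in the per-device track tree
 */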
473 static void log_track_frontmerge(struct per_dev_info *pdi,
474                                  struct blk_io_trace *t)
475 {
476         struct io_track *iot;
477
478         if (!track_ios)
479                 return;
480
481         iot = __find_track(pdi, t->sector + (t->bytes >> 9));
482         if (!iot) {
483                 fprintf(stderr, "merge not found for (%d,%d): %llu\n",
484                         MAJOR(pdi->dev), MINOR(pdi->dev),
485                         t->sector + (t->bytes >> 9));
486                 return;
487         }
488
489         rb_erase(&iot->rb_node, &pdi->rb_track);
490         iot->sector -= t->bytes >> 9;
491         track_rb_insert(pdi, iot);
492 }
493
494 static void log_track_getrq(struct per_dev_info *pdi, struct blk_io_trace *t)
495 {
496         struct io_track *iot;
497
498         if (!track_ios)
499                 return;
500
501         iot = find_track(pdi, t->pid, t->comm, t->sector);
502         iot->allocation_time = t->time;
503 }
504
505 /*
506  * return time between rq allocation and insertion
507  */
508 static unsigned long long log_track_insert(struct per_dev_info *pdi,
509                                            struct blk_io_trace *t)
510 {
511         unsigned long long elapsed;
512         struct io_track *iot;
513
514         if (!track_ios)
515                 return -1;
516
517         iot = find_track(pdi, t->pid, t->comm, t->sector);
518         iot->queue_time = t->time;
519
520         if (!iot->allocation_time)
521                 return -1;
522
523         elapsed = iot->queue_time - iot->allocation_time;
524
525         if (per_process_stats) {
526                 struct per_process_info *ppi = find_process(iot->pid,iot->comm);
527                 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
528
529                 if (ppi && elapsed > ppi->longest_allocation_wait[w])
530                         ppi->longest_allocation_wait[w] = elapsed;
531         }
532
533         return elapsed;
534 }
535
536 /*
537  * return time between queue and issue
538  */
539 static unsigned long long log_track_issue(struct per_dev_info *pdi,
540                                           struct blk_io_trace *t)
541 {
542         unsigned long long elapsed;
543         struct io_track *iot;
544
545         if (!track_ios)
546                 return -1;
547         if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
548                 return -1;
549
550         iot = __find_track(pdi, t->sector);
551         if (!iot) {
552                 fprintf(stderr, "issue not found for (%d,%d): %llu\n",
553                         MAJOR(pdi->dev), MINOR(pdi->dev), t->sector);
554                 return -1;
555         }
556
557         iot->dispatch_time = t->time;
558         elapsed = iot->dispatch_time - iot->queue_time;
559
560         if (per_process_stats) {
561                 struct per_process_info *ppi = find_process(iot->pid,iot->comm);
562                 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
563
564                 if (ppi && elapsed > ppi->longest_dispatch_wait[w])
565                         ppi->longest_dispatch_wait[w] = elapsed;
566         }
567
568         return elapsed;
569 }
570
571 /*
572  * return time between dispatch and complete
573  */
574 static unsigned long long log_track_complete(struct per_dev_info *pdi,
575                                              struct blk_io_trace *t)
576 {
577         unsigned long long elapsed;
578         struct io_track *iot;
579
580         if (!track_ios)
581                 return -1;
582         if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
583                 return -1;
584
585         iot = __find_track(pdi, t->sector);
586         if (!iot) {
587                 fprintf(stderr, "complete not found for (%d,%d): %llu\n",
588                         MAJOR(pdi->dev), MINOR(pdi->dev), t->sector);
589                 return -1;
590         }
591
592         iot->completion_time = t->time;
593         elapsed = iot->completion_time - iot->dispatch_time;
594
595         if (per_process_stats) {
596                 struct per_process_info *ppi = find_process(iot->pid,iot->comm);
597                 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
598
599                 if (ppi && elapsed > ppi->longest_completion_wait[w])
600                         ppi->longest_completion_wait[w] = elapsed;
601         }
602
603         /*
604          * kill the trace, we don't need it after completion
605          */
606         rb_erase(&iot->rb_node, &pdi->rb_track);
607         free(iot);
608
609         return elapsed;
610 }
611
612
613 static struct io_stats *find_process_io_stats(__u32 pid, char *name)
614 {
615         struct per_process_info *ppi = find_process(pid, name);
616
617         if (!ppi) {
618                 ppi = malloc(sizeof(*ppi));
619                 memset(ppi, 0, sizeof(*ppi));
620                 memcpy(ppi->name, name, 16);
621                 ppi->pid = pid;
622                 add_process_to_hash(ppi);
623                 add_process_to_list(ppi);
624         }
625
626         return &ppi->io_stats;
627 }
628
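/*
 * grow the per-cpu info array of a device so it can hold 'cpu',
 * zeroing the newly added entries
 */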
629 static void resize_cpu_info(struct per_dev_info *pdi, int cpu)
630 {
631         struct per_cpu_info *cpus = pdi->cpus;
632         int ncpus = pdi->ncpus;
633         int new_count = cpu + 1;
634         int new_space, size;
635         char *new_start;
636
637         size = new_count * sizeof(struct per_cpu_info);
638         cpus = realloc(cpus, size);
639         if (!cpus) {
640                 char name[20];
641                 fprintf(stderr, "Out of memory, CPU info for device %s (%d)\n",
642                         get_dev_name(pdi, name, sizeof(name)), size);
643                 exit(1);
644         }
645
646         new_start = (char *)cpus + (ncpus * sizeof(struct per_cpu_info));
647         new_space = (new_count - ncpus) * sizeof(struct per_cpu_info);
648         memset(new_start, 0, new_space);
649
650         pdi->ncpus = new_count;
651         pdi->cpus = cpus;
652 }
653
654 static struct per_cpu_info *get_cpu_info(struct per_dev_info *pdi, int cpu)
655 {
656         struct per_cpu_info *pci;
657
658         if (cpu >= pdi->ncpus)
659                 resize_cpu_info(pdi, cpu);
660
661         pci = &pdi->cpus[cpu];
662         pci->cpu = cpu;
663         return pci;
664 }
665
666
667 static int resize_devices(char *name)
668 {
669         int size = (ndevices + 1) * sizeof(struct per_dev_info);
670
671         devices = realloc(devices, size);
672         if (!devices) {
673                 fprintf(stderr, "Out of memory, device %s (%d)\n", name, size);
674                 return 1;
675         }
676         memset(&devices[ndevices], 0, sizeof(struct per_dev_info));
677         devices[ndevices].name = name;
678         ndevices++;
679         return 0;
680 }
681
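/*
 * look up the per-device info for 'dev', allocating and initializing a
 * new entry the first time a device is seen
 */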
682 static struct per_dev_info *get_dev_info(dev_t dev)
683 {
684         struct per_dev_info *pdi;
685         int i;
686
687         for (i = 0; i < ndevices; i++) {
688                 if (!devices[i].dev)
689                         devices[i].dev = dev;
690                 if (devices[i].dev == dev)
691                         return &devices[i];
692         }
693
694         if (resize_devices(NULL))
695                 return NULL;
696
697         pdi = &devices[ndevices - 1];
698         pdi->dev = dev;
699         pdi->last_sequence = -1;
700         pdi->last_read_time = 0;
701         memset(&pdi->rb_last, 0, sizeof(pdi->rb_last));
702         pdi->rb_last_entries = 0;
703         return pdi;
704 }
705
706 static char *get_dev_name(struct per_dev_info *pdi, char *buffer, int size)
707 {
708         if (pdi->name)
709                 snprintf(buffer, size, "%s", pdi->name);
710         else
711                 snprintf(buffer, size, "%d,%d",MAJOR(pdi->dev),MINOR(pdi->dev));
712         return buffer;
713 }
714
715 static void check_time(struct per_dev_info *pdi, struct blk_io_trace *bit)
716 {
717         unsigned long long this = bit->time;
718         unsigned long long last = pdi->last_reported_time;
719
720         pdi->backwards = (this < last) ? 'B' : ' ';
721         pdi->last_reported_time = this;
722 }
723
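/*
 * the __account_* helpers update a single io_stats structure; the
 * account_* wrappers apply them to the per-cpu stats and, when -s is
 * given, to the per-process stats as well
 */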
724 static inline void __account_m(struct io_stats *ios, struct blk_io_trace *t,
725                                int rw)
726 {
727         if (rw) {
728                 ios->mwrites++;
729                 ios->qwrite_kb += t->bytes >> 10;
730         } else {
731                 ios->mreads++;
732                 ios->qread_kb += t->bytes >> 10;
733         }
734 }
735
736 static inline void account_m(struct blk_io_trace *t, struct per_cpu_info *pci,
737                              int rw)
738 {
739         __account_m(&pci->io_stats, t, rw);
740
741         if (per_process_stats) {
742                 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
743
744                 __account_m(ios, t, rw);
745         }
746 }
747
748 static inline void __account_queue(struct io_stats *ios, struct blk_io_trace *t,
749                                    int rw)
750 {
751         if (rw) {
752                 ios->qwrites++;
753                 ios->qwrite_kb += t->bytes >> 10;
754         } else {
755                 ios->qreads++;
756                 ios->qread_kb += t->bytes >> 10;
757         }
758 }
759
760 static inline void account_queue(struct blk_io_trace *t,
761                                  struct per_cpu_info *pci, int rw)
762 {
763         __account_queue(&pci->io_stats, t, rw);
764
765         if (per_process_stats) {
766                 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
767
768                 __account_queue(ios, t, rw);
769         }
770 }
771
772 static inline void __account_c(struct io_stats *ios, int rw, unsigned int bytes)
773 {
774         if (rw) {
775                 ios->cwrites++;
776                 ios->cwrite_kb += bytes >> 10;
777         } else {
778                 ios->creads++;
779                 ios->cread_kb += bytes >> 10;
780         }
781 }
782
783 static inline void account_c(struct blk_io_trace *t, struct per_cpu_info *pci,
784                              int rw, int bytes)
785 {
786         __account_c(&pci->io_stats, rw, bytes);
787
788         if (per_process_stats) {
789                 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
790
791                 __account_c(ios, rw, bytes);
792         }
793 }
794
795 static inline void __account_issue(struct io_stats *ios, int rw,
796                                    unsigned int bytes)
797 {
798         if (rw) {
799                 ios->iwrites++;
800                 ios->iwrite_kb += bytes >> 10;
801         } else {
802                 ios->ireads++;
803                 ios->iread_kb += bytes >> 10;
804         }
805 }
806
807 static inline void account_issue(struct blk_io_trace *t,
808                                  struct per_cpu_info *pci, int rw)
809 {
810         __account_issue(&pci->io_stats, rw, t->bytes);
811
812         if (per_process_stats) {
813                 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
814
815                 __account_issue(ios, rw, t->bytes);
816         }
817 }
818
819 static inline void __account_unplug(struct io_stats *ios, int timer)
820 {
821         if (timer)
822                 ios->timer_unplugs++;
823         else
824                 ios->io_unplugs++;
825 }
826
827 static inline void account_unplug(struct blk_io_trace *t,
828                                   struct per_cpu_info *pci, int timer)
829 {
830         __account_unplug(&pci->io_stats, timer);
831
832         if (per_process_stats) {
833                 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
834
835                 __account_unplug(ios, timer);
836         }
837 }
838
839 static void log_complete(struct per_dev_info *pdi, struct per_cpu_info *pci,
840                          struct blk_io_trace *t, char *act)
841 {
842         process_fmt(act, pci, t, log_track_complete(pdi, t), 0, NULL);
843 }
844
845 static void log_insert(struct per_dev_info *pdi, struct per_cpu_info *pci,
846                        struct blk_io_trace *t, char *act)
847 {
848         process_fmt(act, pci, t, log_track_insert(pdi, t), 0, NULL);
849 }
850
851 static void log_queue(struct per_cpu_info *pci, struct blk_io_trace *t,
852                       char *act)
853 {
854         process_fmt(act, pci, t, -1, 0, NULL);
855 }
856
857 static void log_issue(struct per_dev_info *pdi, struct per_cpu_info *pci,
858                       struct blk_io_trace *t, char *act)
859 {
860         process_fmt(act, pci, t, log_track_issue(pdi, t), 0, NULL);
861 }
862
863 static void log_merge(struct per_dev_info *pdi, struct per_cpu_info *pci,
864                       struct blk_io_trace *t, char *act)
865 {
866         if (act[0] == 'F')
867                 log_track_frontmerge(pdi, t);
868
869         process_fmt(act, pci, t, -1ULL, 0, NULL);
870 }
871
872 static void log_action(struct per_cpu_info *pci, struct blk_io_trace *t,
873                         char *act)
874 {
875         process_fmt(act, pci, t, -1ULL, 0, NULL);
876 }
877
878 static void log_generic(struct per_cpu_info *pci, struct blk_io_trace *t,
879                         char *act)
880 {
881         process_fmt(act, pci, t, -1ULL, 0, NULL);
882 }
883
884 static void log_unplug(struct per_cpu_info *pci, struct blk_io_trace *t,
885                       char *act)
886 {
887         process_fmt(act, pci, t, -1ULL, 0, NULL);
888 }
889
890 static void log_split(struct per_cpu_info *pci, struct blk_io_trace *t,
891                       char *act)
892 {
893         process_fmt(act, pci, t, -1ULL, 0, NULL);
894 }
895
896 static void log_pc(struct per_cpu_info *pci, struct blk_io_trace *t, char *act)
897 {
898         unsigned char *buf = (unsigned char *) t + sizeof(*t);
899
900         process_fmt(act, pci, t, -1ULL, t->pdu_len, buf);
901 }
902
903 static void dump_trace_pc(struct blk_io_trace *t, struct per_cpu_info *pci)
904 {
905         int act = t->action & 0xffff;
906
907         switch (act) {
908                 case __BLK_TA_QUEUE:
909                         log_generic(pci, t, "Q");
910                         break;
911                 case __BLK_TA_GETRQ:
912                         log_generic(pci, t, "G");
913                         break;
914                 case __BLK_TA_SLEEPRQ:
915                         log_generic(pci, t, "S");
916                         break;
917                 case __BLK_TA_REQUEUE:
918                         log_generic(pci, t, "R");
919                         break;
920                 case __BLK_TA_ISSUE:
921                         log_pc(pci, t, "D");
922                         break;
923                 case __BLK_TA_COMPLETE:
924                         log_pc(pci, t, "C");
925                         break;
926                 case __BLK_TA_INSERT:
927                         log_pc(pci, t, "I");
928                         break;
929                 default:
930                         fprintf(stderr, "Bad pc action %x\n", act);
931                         break;
932         }
933 }
934
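/*
 * fs traces: account the event and emit it with its single letter
 * action code (Q, I, M, F, G, S, R, D, C, P, U, UT, X, B)
 */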
935 static void dump_trace_fs(struct blk_io_trace *t, struct per_dev_info *pdi,
936                           struct per_cpu_info *pci)
937 {
938         int w = t->action & BLK_TC_ACT(BLK_TC_WRITE);
939         int act = t->action & 0xffff;
940
941         switch (act) {
942                 case __BLK_TA_QUEUE:
943                         account_queue(t, pci, w);
944                         log_queue(pci, t, "Q");
945                         break;
946                 case __BLK_TA_INSERT:
947                         log_insert(pdi, pci, t, "I");
948                         break;
949                 case __BLK_TA_BACKMERGE:
950                         account_m(t, pci, w);
951                         log_merge(pdi, pci, t, "M");
952                         break;
953                 case __BLK_TA_FRONTMERGE:
954                         account_m(t, pci, w);
955                         log_merge(pdi, pci, t, "F");
956                         break;
957                 case __BLK_TA_GETRQ:
958                         log_track_getrq(pdi, t);
959                         log_generic(pci, t, "G");
960                         break;
961                 case __BLK_TA_SLEEPRQ:
962                         log_generic(pci, t, "S");
963                         break;
964                 case __BLK_TA_REQUEUE:
965                         account_c(t, pci, w, -t->bytes);
966                         log_queue(pci, t, "R");
967                         break;
968                 case __BLK_TA_ISSUE:
969                         account_issue(t, pci, w);
970                         log_issue(pdi, pci, t, "D");
971                         break;
972                 case __BLK_TA_COMPLETE:
973                         account_c(t, pci, w, t->bytes);
974                         log_complete(pdi, pci, t, "C");
975                         break;
976                 case __BLK_TA_PLUG:
977                         log_action(pci, t, "P");
978                         break;
979                 case __BLK_TA_UNPLUG_IO:
980                         account_unplug(t, pci, 0);
981                         log_unplug(pci, t, "U");
982                         break;
983                 case __BLK_TA_UNPLUG_TIMER:
984                         account_unplug(t, pci, 1);
985                         log_unplug(pci, t, "UT");
986                         break;
987                 case __BLK_TA_SPLIT:
988                         log_split(pci, t, "X");
989                         break;
990                 case __BLK_TA_BOUNCE:
991                         log_generic(pci, t, "B");
992                         break;
993                 default:
994                         fprintf(stderr, "Bad fs action %x\n", t->action);
995                         break;
996         }
997 }
998
999 static void dump_trace(struct blk_io_trace *t, struct per_cpu_info *pci,
1000                        struct per_dev_info *pdi)
1001 {
1002         if (t->action & BLK_TC_ACT(BLK_TC_PC))
1003                 dump_trace_pc(t, pci);
1004         else
1005                 dump_trace_fs(t, pdi, pci);
1006
1007         pdi->events++;
1008 }
1009
1010 static void dump_io_stats(struct io_stats *ios, char *msg)
1011 {
1012         fprintf(ofp, "%s\n", msg);
1013
1014         fprintf(ofp, " Reads Queued:    %'8lu, %'8LuKiB\t", ios->qreads, ios->qread_kb);
1015         fprintf(ofp, " Writes Queued:    %'8lu, %'8LuKiB\n", ios->qwrites,ios->qwrite_kb);
1016
1017         fprintf(ofp, " Read Dispatches: %'8lu, %'8LuKiB\t", ios->ireads, ios->iread_kb);
1018         fprintf(ofp, " Write Dispatches: %'8lu, %'8LuKiB\n", ios->iwrites,ios->iwrite_kb);
1019         fprintf(ofp, " Reads Completed: %'8lu, %'8LuKiB\t", ios->creads, ios->cread_kb);
1020         fprintf(ofp, " Writes Completed: %'8lu, %'8LuKiB\n", ios->cwrites,ios->cwrite_kb);
1021         fprintf(ofp, " Read Merges:     %'8lu%8c\t", ios->mreads, ' ');
1022         fprintf(ofp, " Write Merges:     %'8lu\n", ios->mwrites);
1023         fprintf(ofp, " IO unplugs:      %'8lu%8c\t", ios->io_unplugs, ' ');
1024         fprintf(ofp, " Timer unplugs:    %'8lu\n", ios->timer_unplugs);
1025 }
1026
1027 static void dump_wait_stats(struct per_process_info *ppi)
1028 {
1029         unsigned long rawait = ppi->longest_allocation_wait[0] / 1000;
1030         unsigned long rdwait = ppi->longest_dispatch_wait[0] / 1000;
1031         unsigned long rcwait = ppi->longest_completion_wait[0] / 1000;
1032         unsigned long wawait = ppi->longest_allocation_wait[1] / 1000;
1033         unsigned long wdwait = ppi->longest_dispatch_wait[1] / 1000;
1034         unsigned long wcwait = ppi->longest_completion_wait[1] / 1000;
1035
1036         fprintf(ofp, " Allocation wait: %'8lu%8c\t", rawait, ' ');
1037         fprintf(ofp, " Allocation wait:  %'8lu\n", wawait);
1038         fprintf(ofp, " Dispatch wait:   %'8lu%8c\t", rdwait, ' ');
1039         fprintf(ofp, " Dispatch wait:    %'8lu\n", wdwait);
1040         fprintf(ofp, " Completion wait: %'8lu%8c\t", rcwait, ' ');
1041         fprintf(ofp, " Completion wait:  %'8lu\n", wcwait);
1042 }
1043
1044 static int ppi_name_compare(const void *p1, const void *p2)
1045 {
1046         struct per_process_info *ppi1 = *((struct per_process_info **) p1);
1047         struct per_process_info *ppi2 = *((struct per_process_info **) p2);
1048         int res;
1049
1050         res = strverscmp(ppi1->name, ppi2->name);
1051         if (!res)
1052                 res = (ppi1->pid > ppi2->pid) - (ppi1->pid < ppi2->pid);
1053
1054         return res;
1055 }
1056
1057 static void sort_process_list(void)
1058 {
1059         struct per_process_info **ppis;
1060         struct per_process_info *ppi;
1061         int i = 0;
1062
1063         ppis = malloc(ppi_list_entries * sizeof(struct per_process_info *));
1064
1065         ppi = ppi_list;
1066         while (ppi) {
1067                 ppis[i++] = ppi;
1068                 ppi = ppi->list_next;
1069         }
1070
1071         qsort(ppis, ppi_list_entries, sizeof(ppi), ppi_name_compare);
1072
1073         i = ppi_list_entries - 1;
1074         ppi_list = NULL;
1075         while (i >= 0) {
1076                 ppi = ppis[i];
1077
1078                 ppi->list_next = ppi_list;
1079                 ppi_list = ppi;
1080                 i--;
1081         }
1082
1083         free(ppis);
1084 }
1085
1086 static void show_process_stats(void)
1087 {
1088         struct per_process_info *ppi;
1089
1090         sort_process_list();
1091
1092         ppi = ppi_list;
1093         while (ppi) {
1094                 char name[64];
1095
1096                 if (ppi->more_than_one)
1097                         sprintf(name, "%s (%u, ...)", ppi->name, ppi->pid);
1098                 else
1099                         sprintf(name, "%s (%u)", ppi->name, ppi->pid);
1100
1101                 dump_io_stats(&ppi->io_stats, name);
1102                 dump_wait_stats(ppi);
1103                 ppi = ppi->list_next;
1104         }
1105
1106         fprintf(ofp, "\n");
1107 }
1108
1109 static void show_device_and_cpu_stats(void)
1110 {
1111         struct per_dev_info *pdi;
1112         struct per_cpu_info *pci;
1113         struct io_stats total, *ios;
1114         int i, j, pci_events;
1115         char line[3 + 8/*cpu*/ + 2 + 32/*dev*/ + 3];
1116         char name[32];
1117
1118         for (pdi = devices, i = 0; i < ndevices; i++, pdi++) {
1119
1120                 memset(&total, 0, sizeof(total));
1121                 pci_events = 0;
1122
1123                 if (i > 0)
1124                         fprintf(ofp, "\n");
1125
1126                 for (pci = pdi->cpus, j = 0; j < pdi->ncpus; j++, pci++) {
1127                         if (!pci->nelems)
1128                                 continue;
1129
1130                         ios = &pci->io_stats;
1131                         total.qreads += ios->qreads;
1132                         total.qwrites += ios->qwrites;
1133                         total.creads += ios->creads;
1134                         total.cwrites += ios->cwrites;
1135                         total.mreads += ios->mreads;
1136                         total.mwrites += ios->mwrites;
1137                         total.ireads += ios->ireads;
1138                         total.iwrites += ios->iwrites;
1139                         total.qread_kb += ios->qread_kb;
1140                         total.qwrite_kb += ios->qwrite_kb;
1141                         total.cread_kb += ios->cread_kb;
1142                         total.cwrite_kb += ios->cwrite_kb;
1143                         total.iread_kb += ios->iread_kb;
1144                         total.iwrite_kb += ios->iwrite_kb;
1145                         total.timer_unplugs += ios->timer_unplugs;
1146                         total.io_unplugs += ios->io_unplugs;
1147
1148                         snprintf(line, sizeof(line) - 1, "CPU%d (%s):",
1149                                  j, get_dev_name(pdi, name, sizeof(name)));
1150                         dump_io_stats(ios, line);
1151                         pci_events++;
1152                 }
1153
1154                 if (pci_events > 1) {
1155                         fprintf(ofp, "\n");
1156                         snprintf(line, sizeof(line) - 1, "Total (%s):",
1157                                  get_dev_name(pdi, name, sizeof(name)));
1158                         dump_io_stats(&total, line);
1159                 }
1160
1161                 fprintf(ofp, "\nEvents (%s): %'Lu entries, %'lu skips\n",
1162                         get_dev_name(pdi, line, sizeof(line)), pdi->events,
1163                         pdi->skips);
1164         }
1165 }
1166
1167 /*
1168  * struct trace and struct blk_io_trace allocation cache; we do
1169  * potentially millions of mallocs for these structures while only
1170  * using at most a few thousand at a time
1171  */
1172 static inline void t_free(struct trace *t)
1173 {
1174         t->next = t_alloc_list;
1175         t_alloc_list = t;
1176 }
1177
1178 static inline struct trace *t_alloc(void)
1179 {
1180         struct trace *t = t_alloc_list;
1181
1182         if (t) {
1183                 t_alloc_list = t->next;
1184                 return t;
1185         }
1186
1187         return malloc(sizeof(*t));
1188 }
1189
1190 static inline void bit_free(struct blk_io_trace *bit)
1191 {
1192         /*
1193          * abuse a 64-bit field for a next pointer for the free item
1194          */
1195         bit->time = (__u64) (unsigned long) bit_alloc_list;
1196         bit_alloc_list = (struct blk_io_trace *) bit;
1197 }
1198
1199 static inline struct blk_io_trace *bit_alloc(void)
1200 {
1201         struct blk_io_trace *bit = bit_alloc_list;
1202
1203         if (bit) {
1204                 bit_alloc_list = (struct blk_io_trace *) (unsigned long) \
1205                                  bit->time;
1206                 return bit;
1207         }
1208
1209         return malloc(sizeof(*bit));
1210 }
1211
1212 static void find_genesis(void)
1213 {
1214         struct trace *t = trace_list;
1215
1216         genesis_time = -1ULL;
1217         while (t != NULL) {
1218                 if (t->bit->time < genesis_time)
1219                         genesis_time = t->bit->time;
1220
1221                 t = t->next;
1222         }
1223 }
1224
1225 static inline int check_stopwatch(struct blk_io_trace *bit)
1226 {
1227         if (bit->time < stopwatch_end &&
1228             bit->time >= stopwatch_start)
1229                 return 0;
1230
1231         return 1;
1232 }
1233
1234 /*
1235  * sort newly read traces into the rb tree; store the youngest time seen in *youngest
1236  */
1237 static int sort_entries(unsigned long long *youngest)
1238 {
1239         struct trace *t;
1240
1241         if (!genesis_time)
1242                 find_genesis();
1243
1244         *youngest = 0;
1245         while ((t = trace_list) != NULL) {
1246                 struct blk_io_trace *bit = t->bit;
1247
1248                 trace_list = t->next;
1249
1250                 bit->time -= genesis_time;
1251
1252                 if (bit->time < *youngest || !*youngest)
1253                         *youngest = bit->time;
1254
1255                 if (check_stopwatch(bit)) {
1256                         bit_free(bit);
1257                         t_free(t);
1258                         continue;
1259                 }
1260
1261                 if (trace_rb_insert_sort(t))
1262                         return -1;
1263         }
1264
1265         return 0;
1266 }
1267
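/*
 * move a displayed trace from the sort tree into the per-device rb_last
 * cache, recycling the oldest cached entry once more than 1024 are held
 */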
1268 static inline void put_trace(struct per_dev_info *pdi, struct trace *t)
1269 {
1270         rb_erase(&t->rb_node, &rb_sort_root);
1271         rb_sort_entries--;
1272
1273         trace_rb_insert_last(pdi, t);
1274
1275         if (pdi->rb_last_entries > 1024) {
1276                 struct rb_node *n = rb_first(&pdi->rb_last);
1277
1278                 t = rb_entry(n, struct trace, rb_node);
1279                 rb_erase(n, &pdi->rb_last);
1280                 pdi->rb_last_entries--;
1281         
1282                 bit_free(t->bit);
1283                 t_free(t);
1284         }
1285 }
1286
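/*
 * decide whether it is safe to display this trace yet. if the expected
 * sequence is missing and cannot be found in the sort or last-seen
 * trees, hold off, unless this is the final pass, in which case the
 * gap is reported as a skip.
 */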
1287 static int check_sequence(struct per_dev_info *pdi, struct blk_io_trace *bit,
1288                           int force)
1289 {
1290         unsigned long expected_sequence = pdi->last_sequence + 1;
1291         struct trace *t;
1292         
1293         /*
1294          * first entry, always ok
1295          */
1296         if (!expected_sequence)
1297                 return 0;
1298
1299         if (bit->sequence == expected_sequence)
1300                 return 0;
1301
1302         if (bit->sequence < expected_sequence &&
1303             bit->time > pdi->last_reported_time)
1304                 return 0;
1305
1306         /*
1307          * if the wanted sequence is already present in one of the
1308          * trees, the trace we hold now simply has an earlier log
1309          * time, so it is safe to continue */
1310         t = trace_rb_find_sort(pdi->dev, expected_sequence);
1311         if (t)
1312                 return 0;
1313
1314         t = trace_rb_find_last(pdi, expected_sequence);
1315         if (t)
1316                 return 0;
1317
1318         /*
1319          * unless this is the last run, break and wait for more entries
1320          */
1321         if (!force)
1322                 return 1;
1323
1324         fprintf(stderr, "(%d,%d): skipping %lu -> %u\n", MAJOR(pdi->dev),
1325                         MINOR(pdi->dev), pdi->last_sequence, bit->sequence);
1326         pdi->skips++;
1327         return 0;
1328 }
1329
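/*
 * emit sorted traces in time order. unless 'force' is set, stop at
 * last_allowed_time or on a sequence gap, so traces still sitting in
 * the per-cpu input can fill the hole first.
 */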
1330 static void show_entries_rb(int force)
1331 {
1332         struct per_dev_info *pdi = NULL;
1333         struct per_cpu_info *pci = NULL;
1334         struct blk_io_trace *bit;
1335         struct rb_node *n;
1336         struct trace *t;
1337
1338         while ((n = rb_first(&rb_sort_root)) != NULL) {
1339                 if (done)
1340                         break;
1341
1342                 t = rb_entry(n, struct trace, rb_node);
1343                 bit = t->bit;
1344
1345                 if (!pdi || pdi->dev != bit->device)
1346                         pdi = get_dev_info(bit->device);
1347
1348                 if (!pdi) {
1349                         fprintf(stderr, "Unknown device ID? (%d,%d)\n",
1350                                 MAJOR(bit->device), MINOR(bit->device));
1351                         break;
1352                 }
1353
1354                 if (check_sequence(pdi, bit, force))
1355                         break;
1356
1357                 if (!force && bit->time > last_allowed_time)
1358                         break;
1359
1360                 pdi->last_sequence = bit->sequence;
1361
1362                 check_time(pdi, bit);
1363
1364                 if (!pci || pci->cpu != bit->cpu)
1365                         pci = get_cpu_info(pdi, bit->cpu);
1366
1367                 dump_trace(bit, pci, pdi);
1368
1369                 put_trace(pdi, t);
1370         }
1371 }
1372
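/*
 * read exactly 'bytes' bytes from fd, switching the descriptor to
 * non-blocking mode when 'block' is zero. returns 0 on success, 1 on
 * end of file, -1 on error.
 */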
1373 static int read_data(int fd, void *buffer, int bytes, int block)
1374 {
1375         int ret, bytes_left, fl;
1376         void *p;
1377
1378         fl = fcntl(fd, F_GETFL);
1379
1380         if (!block)
1381                 fcntl(fd, F_SETFL, fl | O_NONBLOCK);
1382         else
1383                 fcntl(fd, F_SETFL, fl & ~O_NONBLOCK);
1384
1385         bytes_left = bytes;
1386         p = buffer;
1387         while (bytes_left > 0) {
1388                 ret = read(fd, p, bytes_left);
1389                 if (!ret)
1390                         return 1;
1391                 else if (ret < 0) {
1392                         if (errno != EAGAIN)
1393                                 perror("read");
1394
1395                         return -1;
1396                 } else {
1397                         p += ret;
1398                         bytes_left -= ret;
1399                 }
1400         }
1401
1402         return 0;
1403 }
1404
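/*
 * read up to rb_batch events from fd, convert them to cpu endianness,
 * and push them onto the unsorted trace_list, noting the latest read
 * time for each device
 */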
1405 static int read_events(int fd, int always_block)
1406 {
1407         struct per_dev_info *pdi = NULL;
1408         int events = 0;
1409
1410         while (!is_done() && events < rb_batch) {
1411                 struct blk_io_trace *bit;
1412                 struct trace *t;
1413                 int pdu_len;
1414                 __u32 magic;
1415
1416                 bit = bit_alloc();
1417
1418                 if (read_data(fd, bit, sizeof(*bit), !events || always_block))
1419                         break;
1420
1421                 magic = be32_to_cpu(bit->magic);
1422                 if ((magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
1423                         fprintf(stderr, "Bad magic %x\n", magic);
1424                         break;
1425                 }
1426
1427                 pdu_len = be16_to_cpu(bit->pdu_len);
1428                 if (pdu_len) {
1429                         void *ptr = realloc(bit, sizeof(*bit) + pdu_len);
1430
1431                         if (read_data(fd, ptr + sizeof(*bit), pdu_len, 1))
1432                                 break;
1433
1434                         bit = ptr;
1435                 }
1436
1437                 trace_to_cpu(bit);
1438
1439                 if (verify_trace(bit)) {
1440                         bit_free(bit);
1441                         continue;
1442                 }
1443
1444                 t = t_alloc();
1445                 memset(t, 0, sizeof(*t));
1446                 t->bit = bit;
1447
1448                 t->next = trace_list;
1449                 trace_list = t;
1450
1451                 if (!pdi || pdi->dev != bit->device)
1452                         pdi = get_dev_info(bit->device);
1453
1454                 if (bit->time > pdi->last_read_time)
1455                         pdi->last_read_time = bit->time;
1456
1457                 events++;
1458         }
1459
1460         return events;
1461 }
1462
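/*
 * open the per-cpu trace files ("<dev>.blktrace.<cpu>") for each device,
 * then repeatedly read, sort, and display batches of events until all
 * inputs are drained
 */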
1463 static int do_file(void)
1464 {
1465         struct per_cpu_info *pci;
1466         struct per_dev_info *pdi;
1467         int i, j, events, events_added;
1468
1469         /*
1470          * first prepare all files for reading
1471          */
1472         for (i = 0; i < ndevices; i++) {
1473                 pdi = &devices[i];
1474                 pdi->nfiles = 0;
1475                 pdi->last_sequence = -1;
1476
1477                 for (j = 0;; j++) {
1478                         struct stat st;
1479
1480                         pci = get_cpu_info(pdi, j);
1481                         pci->cpu = j;
1482                         pci->fd = -1;
1483
1484                         snprintf(pci->fname, sizeof(pci->fname)-1,
1485                                  "%s.blktrace.%d", pdi->name, pci->cpu);
1486                         if (stat(pci->fname, &st) < 0)
1487                                 break;
1488                         if (st.st_size) {
1489                                 pci->fd = open(pci->fname, O_RDONLY);
1490                                 if (pci->fd < 0) {
1491                                         perror(pci->fname);
1492                                         continue;
1493                                 }
1494                         }
1495
1496                         printf("Input file %s added\n", pci->fname);
1497                         pdi->nfiles++;
1498                 }
1499         }
1500
1501         /*
1502          * now loop over the files reading in the data
1503          */
1504         do {
1505                 unsigned long long youngest;
1506
1507                 events_added = 0;
1508                 last_allowed_time = -1ULL;
1509
1510                 for (i = 0; i < ndevices; i++) {
1511                         pdi = &devices[i];
1512
1513                         for (j = 0; j < pdi->nfiles; j++) {
1514
1515                                 pci = get_cpu_info(pdi, j);
1516
1517                                 if (pci->fd == -1)
1518                                         continue;
1519
1520                                 events = read_events(pci->fd, 1);
1521                                 if (!events) {
1522                                         close(pci->fd);
1523                                         pci->fd = -1;
1524                                         continue;
1525                                 }
1526
1527                                 if (pdi->last_read_time < last_allowed_time)
1528                                         last_allowed_time = pdi->last_read_time;
1529
1530                                 events_added += events;
1531                         }
1532                 }
1533
1534                 if (sort_entries(&youngest))
1535                         break;
1536
1537                 if (youngest > stopwatch_end)
1538                         break;
1539
1540                 show_entries_rb(0);
1541
1542         } while (events_added);
1543
1544         if (rb_sort_entries)
1545                 show_entries_rb(1);
1546
1547         return 0;
1548 }
1549
1550 static int do_stdin(void)
1551 {
1552         unsigned long long youngest;
1553         int fd;
1554
1555         last_allowed_time = -1ULL;
1556         fd = dup(STDIN_FILENO);
1557         do {
1558                 int events;
1559
1560                 events = read_events(fd, 0);
1561                 if (!events)
1562                         break;
1563         
1564                 if (sort_entries(&youngest))
1565                         break;
1566
1567                 if (youngest > stopwatch_end)
1568                         break;
1569
1570                 show_entries_rb(0);
1571         } while (1);
1572
1573         if (rb_sort_entries)
1574                 show_entries_rb(1);
1575
1576         close(fd);
1577         return 0;
1578 }
1579
1580 static void flush_output(void)
1581 {
1582         fflush(ofp);
1583 }
1584
1585 static void handle_sigint(int sig)
1586 {
1587         done = 1;
1588         flush_output();
1589 }
1590
1591 /*
1592  * Extract start and end times from a string, allowing
1593  * us to specify a time interval of interest within a trace.
1594  * Format: "end" (start is zero) or "start:end".
1595  */
1596 static int find_stopwatch_interval(char *string)
1597 {
1598         double value;
1599         char *sp;
1600
1601         value = strtod(string, &sp);
1602         if (sp == string) {
1603                 fprintf(stderr,"Invalid stopwatch timer: %s\n", string);
1604                 return 1;
1605         }
1606         if (*sp == ':') {
1607                 stopwatch_start = DOUBLE_TO_NANO_ULL(value);
1608                 string = sp + 1;
1609                 value = strtod(string, &sp);
1610                 if (sp == string || *sp != '\0') {
1611                         fprintf(stderr,"Invalid stopwatch end time: %s\n",
1612                                 string);
1613                         return 1;
1614                 }
1615         } else if (*sp != '\0') {
1616                 fprintf(stderr,"Invalid stopwatch start timer: %s\n", string);
1617                 return 1;
1618         }
1619         stopwatch_end = DOUBLE_TO_NANO_ULL(value);
1620         if (stopwatch_end <= stopwatch_start) {
1621                 fprintf(stderr, "Invalid stopwatch interval: %Lu -> %Lu\n",
1622                         stopwatch_start, stopwatch_end);
1623                 return 1;
1624         }
1625
1626         return 0;
1627 }
1628
1629 static char usage_str[] = \
1630         "[ -i <input name> ] [ -o <output name> ] [ -b batch ] [ -s ] [ -t ]\n" \
1631         "[ -q ] [ -n ] [ -w start:stop ] [ -f output format ] [ -F format spec ] [ -v ]\n\n" \
1632         "\t-i Input file containing trace data, or '-' for stdin\n" \
1633         "\t-o Output file. If not given, output is stdout\n" \
1634         "\t-b stdin read batching\n" \
1635         "\t-s Show per-program io statistics\n" \
1636         "\t-n Hash processes by name, not pid\n" \
1637         "\t-t Track individual ios. Will tell you the time a request took\n" \
1638         "\t   to get queued, to get dispatched, and to get completed\n" \
1639         "\t-q Quiet. Don't display any stats at the end of the trace\n" \
1640         "\t-w Only parse data between the given time interval in seconds.\n" \
1641         "\t   If 'start' isn't given, blkparse defaults the start time to 0\n" \
1642         "\t-f Output format. Customize the output format. The format field\n" \
1643         "\t   identifiers can be found in the documentation\n" \
1644         "\t-F Format specification. Can be found in the documentation\n" \
1645         "\t-v Print program version info\n\n";
1646
1647 static void usage(char *prog)
1648 {
1649         fprintf(stderr, "Usage: %s %s %s", prog, blkparse_version, usage_str);
1650 }
1651
1652 int main(int argc, char *argv[])
1653 {
1654         char *ofp_buffer;
1655         int c, ret, mode;
1656         int per_device_and_cpu_stats = 1;
1657
1658         while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) {
1659                 switch (c) {
1660                 case 'i':
1661                         if (!strcmp(optarg, "-") && !pipeline)
1662                                 pipeline = 1;
1663                         else if (resize_devices(optarg) != 0)
1664                                 return 1;
1665                         break;
1666                 case 'o':
1667                         output_name = optarg;
1668                         break;
1669                 case 'b':
1670                         rb_batch = atoi(optarg);
1671                         if (rb_batch <= 0)
1672                                 rb_batch = RB_BATCH_DEFAULT;
1673                         break;
1674                 case 's':
1675                         per_process_stats = 1;
1676                         break;
1677                 case 't':
1678                         track_ios = 1;
1679                         break;
1680                 case 'q':
1681                         per_device_and_cpu_stats = 0;
1682                         break;
1683                 case 'w':
1684                         if (find_stopwatch_interval(optarg) != 0)
1685                                 return 1;
1686                         break;
1687                 case 'f':
1688                         set_all_format_specs(optarg);
1689                         break;
1690                 case 'F':
1691                         if (add_format_spec(optarg) != 0)
1692                                 return 1;
1693                         break;
1694                 case 'n':
1695                         ppi_hash_by_pid = 0;
1696                         break;
1697                 case 'v':
1698                         printf("%s version %s\n", argv[0], blkparse_version);
1699                         return 0;
1700                 default:
1701                         usage(argv[0]);
1702                         return 1;
1703                 }
1704         }
1705
1706         while (optind < argc) {
1707                 if (!strcmp(argv[optind], "-") && !pipeline)
1708                         pipeline = 1;
1709                 else if (resize_devices(argv[optind]) != 0)
1710                         return 1;
1711                 optind++;
1712         }
1713
1714         if (!pipeline && !ndevices) {
1715                 usage(argv[0]);
1716                 return 1;
1717         }
1718
1719         memset(&rb_sort_root, 0, sizeof(rb_sort_root));
1720
1721         signal(SIGINT, handle_sigint);
1722         signal(SIGHUP, handle_sigint);
1723         signal(SIGTERM, handle_sigint);
1724
1725         setlocale(LC_NUMERIC, "en_US");
1726
1727         if (!output_name) {
1728                 ofp = fdopen(STDOUT_FILENO, "w");
1729                 mode = _IOLBF;
1730         } else {
1731                 char ofname[128];
1732
1733                 snprintf(ofname, sizeof(ofname) - 1, "%s", output_name);
1734                 ofp = fopen(ofname, "w");
1735                 mode = _IOFBF;
1736         }
1737
1738         if (!ofp) {
1739                 perror("fopen");
1740                 return 1;
1741         }
1742
1743         ofp_buffer = malloc(4096);      
1744         if (setvbuf(ofp, ofp_buffer, mode, 4096)) {
1745                 perror("setvbuf");
1746                 return 1;
1747         }
1748
1749         if (pipeline)
1750                 ret = do_stdin();
1751         else
1752                 ret = do_file();
1753
1754         if (per_process_stats)
1755                 show_process_stats();
1756
1757         if (per_device_and_cpu_stats)
1758                 show_device_and_cpu_stats();
1759
1760         flush_output();
1761         return ret;
1762 }