[PATCH] blkparse: invert alloc checks, more intuitive
[blktrace.git] / blkparse.c
1 /*
2  * block queue tracing parse application
3  *
4  * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
5  *
6  *  This program is free software; you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License as published by
8  *  the Free Software Foundation; either version 2 of the License, or
9  *  (at your option) any later version.
10  *
11  *  This program is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *  GNU General Public License for more details.
15  *
16  *  You should have received a copy of the GNU General Public License
17  *  along with this program; if not, write to the Free Software
18  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  *
20  */
21 #include <sys/types.h>
22 #include <sys/stat.h>
23 #include <unistd.h>
24 #include <stdio.h>
25 #include <fcntl.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <getopt.h>
29 #include <errno.h>
30 #include <signal.h>
31 #include <locale.h>
32 #include <limits.h>
33
34 #include "blktrace.h"
35 #include "rbtree.h"
36 #include "jhash.h"
37
38 static char blkparse_version[] = "0.90";
39
40 struct per_dev_info {
41         dev_t dev;
42         char *name;
43
44         int backwards;
45         unsigned long long events;
46         unsigned long long last_reported_time;
47         unsigned long long last_read_time;
48         struct io_stats io_stats;
49         unsigned long last_sequence;
50         unsigned long skips;
51
52         struct rb_root rb_last;
53         unsigned long rb_last_entries;
54
55         struct rb_root rb_track;
56
57         int nfiles;
58         int ncpus;
59         struct per_cpu_info *cpus;
60 };
61
62 struct per_process_info {
63         char name[16];
64         __u32 pid;
65         struct io_stats io_stats;
66         struct per_process_info *hash_next, *list_next;
67         int more_than_one;
68
69         /*
70          * individual io stats
71          */
72         unsigned long long longest_allocation_wait[2];
73         unsigned long long longest_dispatch_wait[2];
74         unsigned long long longest_completion_wait[2];
75 };
76
77 #define PPI_HASH_SHIFT  (8)
78 #define PPI_HASH_SIZE   (1 << PPI_HASH_SHIFT)
79 #define PPI_HASH_MASK   (PPI_HASH_SIZE - 1)
80 static struct per_process_info *ppi_hash_table[PPI_HASH_SIZE];
81 static struct per_process_info *ppi_list;
82 static int ppi_list_entries;
83
84 #define S_OPTS  "i:o:b:stqw:f:F:vn"
85 static struct option l_opts[] = {
86         {
87                 .name = "input",
88                 .has_arg = required_argument,
89                 .flag = NULL,
90                 .val = 'i'
91         },
92         {
93                 .name = "output",
94                 .has_arg = required_argument,
95                 .flag = NULL,
96                 .val = 'o'
97         },
98         {
99                 .name = "batch",
100                 .has_arg = required_argument,
101                 .flag = NULL,
102                 .val = 'b'
103         },
104         {
105                 .name = "per program stats",
106                 .has_arg = no_argument,
107                 .flag = NULL,
108                 .val = 's'
109         },
110         {
111                 .name = "track ios",
112                 .has_arg = no_argument,
113                 .flag = NULL,
114                 .val = 't'
115         },
116         {
117                 .name = "quiet",
118                 .has_arg = no_argument,
119                 .flag = NULL,
120                 .val = 'q'
121         },
122         {
123                 .name = "stopwatch",
124                 .has_arg = required_argument,
125                 .flag = NULL,
126                 .val = 'w'
127         },
128         {
129                 .name = "format",
130                 .has_arg = required_argument,
131                 .flag = NULL,
132                 .val = 'f'
133         },
134         {
135                 .name = "format-spec",
136                 .has_arg = required_argument,
137                 .flag = NULL,
138                 .val = 'F'
139         },
140         {
141                 .name = "hash by name",
142                 .has_arg = no_argument,
143                 .flag = NULL,
144                 .val = 'n'
145         },
146         {
147                 .name = "version",
148                 .has_arg = no_argument,
149                 .flag = NULL,
150                 .val = 'v'
151         },
152 };
153
154 /*
155  * for sorting the displayed output
156  */
157 struct trace {
158         struct blk_io_trace *bit;
159         struct rb_node rb_node;
160         struct trace *next;
161         unsigned int skipped;
162 };
163
164 static struct rb_root rb_sort_root;
165 static unsigned long rb_sort_entries;
166
167 static struct trace *trace_list;
168
169 /*
170  * allocation cache
171  */
172 static struct blk_io_trace *bit_alloc_list;
173 static struct trace *t_alloc_list;
174
175 /*
176  * for tracking individual ios
177  */
178 struct io_track {
179         struct rb_node rb_node;
180
181         __u64 sector;
182         __u32 pid;
183         char comm[16];
184         unsigned long long allocation_time;
185         unsigned long long queue_time;
186         unsigned long long dispatch_time;
187         unsigned long long completion_time;
188 };
189
190 static int ndevices;
191 static struct per_dev_info *devices;
192 static char *get_dev_name(struct per_dev_info *, char *, int);
193
194 FILE *ofp = NULL;
195 static char *output_name;
196
197 static unsigned long long genesis_time;
198 static unsigned long long last_allowed_time;
199 static unsigned long long stopwatch_start;      /* start from zero by default */
200 static unsigned long long stopwatch_end = ULONG_LONG_MAX;       /* "infinity" */
201
202 static int per_process_stats;
203 static int track_ios;
204 static int ppi_hash_by_pid = 1;
205
206 static unsigned long last_skipped;
207 static unsigned int skip_runs;
208
209 static unsigned int t_alloc_cache;
210 static unsigned int bit_alloc_cache;
211
212 #define RB_BATCH_DEFAULT        (512)
213 static unsigned int rb_batch = RB_BATCH_DEFAULT;
214
215 static int pipeline;
216
217 #define is_done()       (*(volatile int *)(&done))
218 static volatile int done;
219
220 #define JHASH_RANDOM    (0x3af5f2ee)
221
222 static inline int ppi_hash_pid(__u32 pid)
223 {
224         return jhash_1word(pid, JHASH_RANDOM) & PPI_HASH_MASK;
225 }
226
227 static inline int ppi_hash_name(const char *name)
228 {
229         return jhash(name, 16, JHASH_RANDOM) & PPI_HASH_MASK;
230 }
231
232 static inline int ppi_hash(struct per_process_info *ppi)
233 {
234         if (ppi_hash_by_pid)
235                 return ppi_hash_pid(ppi->pid);
236
237         return ppi_hash_name(ppi->name);
238 }
239
240 static inline void add_process_to_hash(struct per_process_info *ppi)
241 {
242         const int hash_idx = ppi_hash(ppi);
243
244         ppi->hash_next = ppi_hash_table[hash_idx];
245         ppi_hash_table[hash_idx] = ppi;
246 }
247
248 static inline void add_process_to_list(struct per_process_info *ppi)
249 {
250         ppi->list_next = ppi_list;
251         ppi_list = ppi;
252         ppi_list_entries++;
253 }
254
255 static struct per_process_info *find_process_by_name(char *name)
256 {
257         const int hash_idx = ppi_hash_name(name);
258         struct per_process_info *ppi;
259
260         ppi = ppi_hash_table[hash_idx];
261         while (ppi) {
262                 if (!strcmp(ppi->name, name))
263                         return ppi;
264
265                 ppi = ppi->hash_next;
266         }
267
268         return NULL;
269 }
270
271 static struct per_process_info *find_process_by_pid(__u32 pid)
272 {
273         const int hash_idx = ppi_hash_pid(pid);
274         struct per_process_info *ppi;
275
276         ppi = ppi_hash_table[hash_idx];
277         while (ppi) {
278                 if (ppi->pid == pid)
279                         return ppi;
280
281                 ppi = ppi->hash_next;
282         }
283
284         return NULL;
285 }
286
287 static struct per_process_info *find_process(__u32 pid, char *name)
288 {
289         struct per_process_info *ppi;
290
291         if (ppi_hash_by_pid)
292                 return find_process_by_pid(pid);
293
294         ppi = find_process_by_name(name);
295         if (ppi && ppi->pid != pid)
296                 ppi->more_than_one = 1;
297
298         return ppi;
299 }
300
301 static inline int trace_rb_insert(struct trace *t, struct rb_root *root,
302                                   int check_time)
303 {
304         struct rb_node **p = &root->rb_node;
305         struct rb_node *parent = NULL;
306         struct trace *__t;
307
308         while (*p) {
309                 parent = *p;
310
311                 __t = rb_entry(parent, struct trace, rb_node);
312
313                 if (check_time) {
314                         if (t->bit->time < __t->bit->time) {
315                                 p = &(*p)->rb_left;
316                                 continue;
317                         } else if (t->bit->time > __t->bit->time) {
318                                 p = &(*p)->rb_right;
319                                 continue;
320                         }
321                 }
322                 if (t->bit->device < __t->bit->device)
323                         p = &(*p)->rb_left;
324                 else if (t->bit->device > __t->bit->device)
325                         p = &(*p)->rb_right;
326                 else if (t->bit->sequence < __t->bit->sequence)
327                         p = &(*p)->rb_left;
328                 else if (t->bit->sequence > __t->bit->sequence)
329                         p = &(*p)->rb_right;
330                 else if (t->bit->device == __t->bit->device) {
331                         fprintf(stderr,
332                                 "sequence alias (%d) on device %d,%d!\n",
333                                 t->bit->sequence,
334                                 MAJOR(t->bit->device), MINOR(t->bit->device));
335                         return 1;
336                 }
337         }
338
339         rb_link_node(&t->rb_node, parent, p);
340         rb_insert_color(&t->rb_node, root);
341         return 0;
342 }
343
344 static inline int trace_rb_insert_sort(struct trace *t)
345 {
346         if (!trace_rb_insert(t, &rb_sort_root, 1)) {
347                 rb_sort_entries++;
348                 return 0;
349         }
350
351         return 1;
352 }
353
354 static inline int trace_rb_insert_last(struct per_dev_info *pdi,struct trace *t)
355 {
356         if (!trace_rb_insert(t, &pdi->rb_last, 1)) {
357                 pdi->rb_last_entries++;
358                 return 0;
359         }
360
361         return 1;
362 }
363
364 static struct trace *trace_rb_find(dev_t device, unsigned long sequence,
365                                    struct rb_root *root, int order)
366 {
367         struct rb_node *n = root->rb_node;
368         struct rb_node *prev = NULL;
369         struct trace *__t;
370
371         while (n) {
372                 __t = rb_entry(n, struct trace, rb_node);
373                 prev = n;
374
375                 if (device < __t->bit->device)
376                         n = n->rb_left;
377                 else if (device > __t->bit->device)
378                         n = n->rb_right;
379                 else if (sequence < __t->bit->sequence)
380                         n = n->rb_left;
381                 else if (sequence > __t->bit->sequence)
382                         n = n->rb_right;
383                 else
384                         return __t;
385         }
386
387         /*
388          * hack - the list may not be sequence ordered because some
389          * events don't have sequence and time matched. so we end up
390          * being a little off in the rb lookup here, because we don't
391          * know the time we are looking for. compensate by browsing
392          * a little ahead from the last entry to find the match
393          */
394         if (order && prev) {
395                 int max = 5;
396
397                 while (((n = rb_next(prev)) != NULL) && max--) {
398                         __t = rb_entry(n, struct trace, rb_node);
399                         
400                         if (__t->bit->device == device &&
401                             __t->bit->sequence == sequence)
402                                 return __t;
403
404                         prev = n;
405                 }
406         }
407                         
408         return NULL;
409 }
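/*
 * Illustration with made-up numbers: a lookup for device (8,0), sequence
 * 1042 may bottom out at the node holding sequence 1040, simply because
 * the surrounding entries were inserted in time order rather than strict
 * sequence order. The loop above then follows rb_next() for at most five
 * nodes to see whether 1042 sits just past the stopping point.
 */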
410
411 static inline struct trace *trace_rb_find_sort(dev_t dev, unsigned long seq)
412 {
413         return trace_rb_find(dev, seq, &rb_sort_root, 1);
414 }
415
416 static inline struct trace *trace_rb_find_last(struct per_dev_info *pdi,
417                                                unsigned long seq)
418 {
419         return trace_rb_find(pdi->dev, seq, &pdi->rb_last, 0);
420 }
421
422 static inline int track_rb_insert(struct per_dev_info *pdi,struct io_track *iot)
423 {
424         struct rb_node **p = &pdi->rb_track.rb_node;
425         struct rb_node *parent = NULL;
426         struct io_track *__iot;
427
428         while (*p) {
429                 parent = *p;
430                 __iot = rb_entry(parent, struct io_track, rb_node);
431
432                 if (iot->sector < __iot->sector)
433                         p = &(*p)->rb_left;
434                 else if (iot->sector > __iot->sector)
435                         p = &(*p)->rb_right;
436                 else {
437                         fprintf(stderr,
438                                 "sector alias (%Lu) on device %d,%d!\n",
439                                 (unsigned long long) iot->sector,
440                                 MAJOR(pdi->dev), MINOR(pdi->dev));
441                         return 1;
442                 }
443         }
444
445         rb_link_node(&iot->rb_node, parent, p);
446         rb_insert_color(&iot->rb_node, &pdi->rb_track);
447         return 0;
448 }
449
450 static struct io_track *__find_track(struct per_dev_info *pdi, __u64 sector)
451 {
452         struct rb_node *n = pdi->rb_track.rb_node;
453         struct io_track *__iot;
454
455         while (n) {
456                 __iot = rb_entry(n, struct io_track, rb_node);
457
458                 if (sector < __iot->sector)
459                         n = n->rb_left;
460                 else if (sector > __iot->sector)
461                         n = n->rb_right;
462                 else
463                         return __iot;
464         }
465
466         return NULL;
467 }
468
469 static struct io_track *find_track(struct per_dev_info *pdi, __u32 pid,
470                                    char *comm, __u64 sector)
471 {
472         struct io_track *iot;
473
474         iot = __find_track(pdi, sector);
475         if (!iot) {
476                 iot = calloc(1, sizeof(*iot)); /* zero fill: the time fields are tested before use */
477                 iot->pid = pid;
478                 memcpy(iot->comm, comm, sizeof(iot->comm));
479                 iot->sector = sector;
480                 track_rb_insert(pdi, iot);
481         }
482
483         return iot;
484 }
485
486 static void log_track_frontmerge(struct per_dev_info *pdi,
487                                  struct blk_io_trace *t)
488 {
489         struct io_track *iot;
490
491         if (!track_ios)
492                 return;
493
494         iot = __find_track(pdi, t->sector + (t->bytes >> 9));
495         if (!iot) {
496                 fprintf(stderr, "merge not found for (%d,%d): %llu\n",
497                         MAJOR(pdi->dev), MINOR(pdi->dev),
498                         (unsigned long long) t->sector + (t->bytes >> 9));
499                 return;
500         }
501
502         rb_erase(&iot->rb_node, &pdi->rb_track);
503         iot->sector -= t->bytes >> 9;
504         track_rb_insert(pdi, iot);
505 }
506
507 static void log_track_getrq(struct per_dev_info *pdi, struct blk_io_trace *t)
508 {
509         struct io_track *iot;
510
511         if (!track_ios)
512                 return;
513
514         iot = find_track(pdi, t->pid, t->comm, t->sector);
515         iot->allocation_time = t->time;
516 }
517
518 /*
519  * return time between rq allocation and insertion
520  */
521 static unsigned long long log_track_insert(struct per_dev_info *pdi,
522                                            struct blk_io_trace *t)
523 {
524         unsigned long long elapsed;
525         struct io_track *iot;
526
527         if (!track_ios)
528                 return -1;
529
530         iot = find_track(pdi, t->pid, t->comm, t->sector);
531         iot->queue_time = t->time;
532
533         if (!iot->allocation_time)
534                 return -1;
535
536         elapsed = iot->queue_time - iot->allocation_time;
537
538         if (per_process_stats) {
539                 struct per_process_info *ppi = find_process(iot->pid,iot->comm);
540                 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
541
542                 if (ppi && elapsed > ppi->longest_allocation_wait[w])
543                         ppi->longest_allocation_wait[w] = elapsed;
544         }
545
546         return elapsed;
547 }
548
549 /*
550  * return time between queue and issue
551  */
552 static unsigned long long log_track_issue(struct per_dev_info *pdi,
553                                           struct blk_io_trace *t)
554 {
555         unsigned long long elapsed;
556         struct io_track *iot;
557
558         if (!track_ios)
559                 return -1;
560         if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
561                 return -1;
562
563         iot = __find_track(pdi, t->sector);
564         if (!iot) {
565                 fprintf(stderr, "issue not found for (%d,%d): %llu\n",
566                         MAJOR(pdi->dev), MINOR(pdi->dev),
567                         (unsigned long long) t->sector);
568                 return -1;
569         }
570
571         iot->dispatch_time = t->time;
572         elapsed = iot->dispatch_time - iot->queue_time;
573
574         if (per_process_stats) {
575                 struct per_process_info *ppi = find_process(iot->pid,iot->comm);
576                 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
577
578                 if (ppi && elapsed > ppi->longest_dispatch_wait[w])
579                         ppi->longest_dispatch_wait[w] = elapsed;
580         }
581
582         return elapsed;
583 }
584
585 /*
586  * return time between dispatch and complete
587  */
588 static unsigned long long log_track_complete(struct per_dev_info *pdi,
589                                              struct blk_io_trace *t)
590 {
591         unsigned long long elapsed;
592         struct io_track *iot;
593
594         if (!track_ios)
595                 return -1;
596         if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
597                 return -1;
598
599         iot = __find_track(pdi, t->sector);
600         if (!iot) {
601                 fprintf(stderr, "complete not found for (%d,%d): %llu\n",
602                         MAJOR(pdi->dev), MINOR(pdi->dev),
603                         (unsigned long long) t->sector);
604                 return -1;
605         }
606
607         iot->completion_time = t->time;
608         elapsed = iot->completion_time - iot->dispatch_time;
609
610         if (per_process_stats) {
611                 struct per_process_info *ppi = find_process(iot->pid,iot->comm);
612                 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
613
614                 if (ppi && elapsed > ppi->longest_completion_wait[w])
615                         ppi->longest_completion_wait[w] = elapsed;
616         }
617
618         /*
619          * kill the trace, we don't need it after completion
620          */
621         rb_erase(&iot->rb_node, &pdi->rb_track);
622         free(iot);
623
624         return elapsed;
625 }
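/*
 * Worked example of what -t tracking yields, with made-up timestamps in
 * nanoseconds: a request sees getrq at t=10,000, insert at t=15,000,
 * issue at t=40,000 and complete at t=1,200,000. log_track_insert() then
 * reports an allocation wait of 5,000 ns, log_track_issue() a dispatch
 * wait of 25,000 ns and log_track_complete() a completion wait of
 * 1,160,000 ns, after which the io_track node is dropped. With -s, the
 * largest of each wait is also remembered per process, separately for
 * reads and writes.
 */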
626
627
628 static struct io_stats *find_process_io_stats(__u32 pid, char *name)
629 {
630         struct per_process_info *ppi = find_process(pid, name);
631
632         if (!ppi) {
633                 ppi = malloc(sizeof(*ppi));
634                 memset(ppi, 0, sizeof(*ppi));
635                 memcpy(ppi->name, name, 16);
636                 ppi->pid = pid;
637                 add_process_to_hash(ppi);
638                 add_process_to_list(ppi);
639         }
640
641         return &ppi->io_stats;
642 }
643
644 static void resize_cpu_info(struct per_dev_info *pdi, int cpu)
645 {
646         struct per_cpu_info *cpus = pdi->cpus;
647         int ncpus = pdi->ncpus;
648         int new_count = cpu + 1;
649         int new_space, size;
650         char *new_start;
651
652         size = new_count * sizeof(struct per_cpu_info);
653         cpus = realloc(cpus, size);
654         if (!cpus) {
655                 char name[20];
656                 fprintf(stderr, "Out of memory, CPU info for device %s (%d)\n",
657                         get_dev_name(pdi, name, sizeof(name)), size);
658                 exit(1);
659         }
660
661         new_start = (char *)cpus + (ncpus * sizeof(struct per_cpu_info));
662         new_space = (new_count - ncpus) * sizeof(struct per_cpu_info);
663         memset(new_start, 0, new_space);
664
665         pdi->ncpus = new_count;
666         pdi->cpus = cpus;
667 }
668
669 static struct per_cpu_info *get_cpu_info(struct per_dev_info *pdi, int cpu)
670 {
671         struct per_cpu_info *pci;
672
673         if (cpu >= pdi->ncpus)
674                 resize_cpu_info(pdi, cpu);
675
676         pci = &pdi->cpus[cpu];
677         pci->cpu = cpu;
678         return pci;
679 }
680
681
682 static int resize_devices(char *name)
683 {
684         int size = (ndevices + 1) * sizeof(struct per_dev_info);
685
686         devices = realloc(devices, size);
687         if (!devices) {
688                 fprintf(stderr, "Out of memory, device %s (%d)\n", name, size);
689                 return 1;
690         }
691         memset(&devices[ndevices], 0, sizeof(struct per_dev_info));
692         devices[ndevices].name = name;
693         ndevices++;
694         return 0;
695 }
696
697 static struct per_dev_info *get_dev_info(dev_t dev)
698 {
699         struct per_dev_info *pdi;
700         int i;
701
702         for (i = 0; i < ndevices; i++) {
703                 if (!devices[i].dev)
704                         devices[i].dev = dev;
705                 if (devices[i].dev == dev)
706                         return &devices[i];
707         }
708
709         if (resize_devices(NULL))
710                 return NULL;
711
712         pdi = &devices[ndevices - 1];
713         pdi->dev = dev;
714         pdi->last_sequence = -1;
715         pdi->last_read_time = 0;
716         memset(&pdi->rb_last, 0, sizeof(pdi->rb_last));
717         pdi->rb_last_entries = 0;
718         return pdi;
719 }
720
721 static char *get_dev_name(struct per_dev_info *pdi, char *buffer, int size)
722 {
723         if (pdi->name)
724                 snprintf(buffer, size, "%s", pdi->name);
725         else
726                 snprintf(buffer, size, "%d,%d",MAJOR(pdi->dev),MINOR(pdi->dev));
727         return buffer;
728 }
729
730 static void check_time(struct per_dev_info *pdi, struct blk_io_trace *bit)
731 {
732         unsigned long long this = bit->time;
733         unsigned long long last = pdi->last_reported_time;
734
735         pdi->backwards = (this < last) ? 'B' : ' ';
736         pdi->last_reported_time = this;
737 }
738
739 static inline void __account_m(struct io_stats *ios, struct blk_io_trace *t,
740                                int rw)
741 {
742         if (rw) {
743                 ios->mwrites++;
744                 ios->qwrite_kb += t->bytes >> 10;
745         } else {
746                 ios->mreads++;
747                 ios->qread_kb += t->bytes >> 10;
748         }
749 }
750
751 static inline void account_m(struct blk_io_trace *t, struct per_cpu_info *pci,
752                              int rw)
753 {
754         __account_m(&pci->io_stats, t, rw);
755
756         if (per_process_stats) {
757                 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
758
759                 __account_m(ios, t, rw);
760         }
761 }
762
763 static inline void __account_queue(struct io_stats *ios, struct blk_io_trace *t,
764                                    int rw)
765 {
766         if (rw) {
767                 ios->qwrites++;
768                 ios->qwrite_kb += t->bytes >> 10;
769         } else {
770                 ios->qreads++;
771                 ios->qread_kb += t->bytes >> 10;
772         }
773 }
774
775 static inline void account_queue(struct blk_io_trace *t,
776                                  struct per_cpu_info *pci, int rw)
777 {
778         __account_queue(&pci->io_stats, t, rw);
779
780         if (per_process_stats) {
781                 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
782
783                 __account_queue(ios, t, rw);
784         }
785 }
786
787 static inline void __account_c(struct io_stats *ios, int rw, unsigned int bytes)
788 {
789         if (rw) {
790                 ios->cwrites++;
791                 ios->cwrite_kb += bytes >> 10;
792         } else {
793                 ios->creads++;
794                 ios->cread_kb += bytes >> 10;
795         }
796 }
797
798 static inline void account_c(struct blk_io_trace *t, struct per_cpu_info *pci,
799                              int rw, int bytes)
800 {
801         __account_c(&pci->io_stats, rw, bytes);
802
803         if (per_process_stats) {
804                 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
805
806                 __account_c(ios, rw, bytes);
807         }
808 }
809
810 static inline void __account_issue(struct io_stats *ios, int rw,
811                                    unsigned int bytes)
812 {
813         if (rw) {
814                 ios->iwrites++;
815                 ios->iwrite_kb += bytes >> 10;
816         } else {
817                 ios->ireads++;
818                 ios->iread_kb += bytes >> 10;
819         }
820 }
821
822 static inline void account_issue(struct blk_io_trace *t,
823                                  struct per_cpu_info *pci, int rw)
824 {
825         __account_issue(&pci->io_stats, rw, t->bytes);
826
827         if (per_process_stats) {
828                 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
829
830                 __account_issue(ios, rw, t->bytes);
831         }
832 }
833
834 static inline void __account_unplug(struct io_stats *ios, int timer)
835 {
836         if (timer)
837                 ios->timer_unplugs++;
838         else
839                 ios->io_unplugs++;
840 }
841
842 static inline void account_unplug(struct blk_io_trace *t,
843                                   struct per_cpu_info *pci, int timer)
844 {
845         __account_unplug(&pci->io_stats, timer);
846
847         if (per_process_stats) {
848                 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
849
850                 __account_unplug(ios, timer);
851         }
852 }
853
854 static void log_complete(struct per_dev_info *pdi, struct per_cpu_info *pci,
855                          struct blk_io_trace *t, char *act)
856 {
857         process_fmt(act, pci, t, log_track_complete(pdi, t), 0, NULL);
858 }
859
860 static void log_insert(struct per_dev_info *pdi, struct per_cpu_info *pci,
861                        struct blk_io_trace *t, char *act)
862 {
863         process_fmt(act, pci, t, log_track_insert(pdi, t), 0, NULL);
864 }
865
866 static void log_queue(struct per_cpu_info *pci, struct blk_io_trace *t,
867                       char *act)
868 {
869         process_fmt(act, pci, t, -1, 0, NULL);
870 }
871
872 static void log_issue(struct per_dev_info *pdi, struct per_cpu_info *pci,
873                       struct blk_io_trace *t, char *act)
874 {
875         process_fmt(act, pci, t, log_track_issue(pdi, t), 0, NULL);
876 }
877
878 static void log_merge(struct per_dev_info *pdi, struct per_cpu_info *pci,
879                       struct blk_io_trace *t, char *act)
880 {
881         if (act[0] == 'F')
882                 log_track_frontmerge(pdi, t);
883
884         process_fmt(act, pci, t, -1ULL, 0, NULL);
885 }
886
887 static void log_action(struct per_cpu_info *pci, struct blk_io_trace *t,
888                         char *act)
889 {
890         process_fmt(act, pci, t, -1ULL, 0, NULL);
891 }
892
893 static void log_generic(struct per_cpu_info *pci, struct blk_io_trace *t,
894                         char *act)
895 {
896         process_fmt(act, pci, t, -1ULL, 0, NULL);
897 }
898
899 static void log_unplug(struct per_cpu_info *pci, struct blk_io_trace *t,
900                       char *act)
901 {
902         process_fmt(act, pci, t, -1ULL, 0, NULL);
903 }
904
905 static void log_split(struct per_cpu_info *pci, struct blk_io_trace *t,
906                       char *act)
907 {
908         process_fmt(act, pci, t, -1ULL, 0, NULL);
909 }
910
911 static void log_pc(struct per_cpu_info *pci, struct blk_io_trace *t, char *act)
912 {
913         unsigned char *buf = (unsigned char *) t + sizeof(*t);
914
915         process_fmt(act, pci, t, -1ULL, t->pdu_len, buf);
916 }
917
918 static void dump_trace_pc(struct blk_io_trace *t, struct per_cpu_info *pci)
919 {
920         int act = t->action & 0xffff;
921
922         switch (act) {
923                 case __BLK_TA_QUEUE:
924                         log_generic(pci, t, "Q");
925                         break;
926                 case __BLK_TA_GETRQ:
927                         log_generic(pci, t, "G");
928                         break;
929                 case __BLK_TA_SLEEPRQ:
930                         log_generic(pci, t, "S");
931                         break;
932                 case __BLK_TA_REQUEUE:
933                         log_generic(pci, t, "R");
934                         break;
935                 case __BLK_TA_ISSUE:
936                         log_pc(pci, t, "D");
937                         break;
938                 case __BLK_TA_COMPLETE:
939                         log_pc(pci, t, "C");
940                         break;
941                 case __BLK_TA_INSERT:
942                         log_pc(pci, t, "I");
943                         break;
944                 default:
945                         fprintf(stderr, "Bad pc action %x\n", act);
946                         break;
947         }
948 }
949
950 static void dump_trace_fs(struct blk_io_trace *t, struct per_dev_info *pdi,
951                           struct per_cpu_info *pci)
952 {
953         int w = t->action & BLK_TC_ACT(BLK_TC_WRITE);
954         int act = t->action & 0xffff;
955
956         switch (act) {
957                 case __BLK_TA_QUEUE:
958                         account_queue(t, pci, w);
959                         log_queue(pci, t, "Q");
960                         break;
961                 case __BLK_TA_INSERT:
962                         log_insert(pdi, pci, t, "I");
963                         break;
964                 case __BLK_TA_BACKMERGE:
965                         account_m(t, pci, w);
966                         log_merge(pdi, pci, t, "M");
967                         break;
968                 case __BLK_TA_FRONTMERGE:
969                         account_m(t, pci, w);
970                         log_merge(pdi, pci, t, "F");
971                         break;
972                 case __BLK_TA_GETRQ:
973                         log_track_getrq(pdi, t);
974                         log_generic(pci, t, "G");
975                         break;
976                 case __BLK_TA_SLEEPRQ:
977                         log_generic(pci, t, "S");
978                         break;
979                 case __BLK_TA_REQUEUE:
980                         account_c(t, pci, w, -t->bytes);
981                         log_queue(pci, t, "R");
982                         break;
983                 case __BLK_TA_ISSUE:
984                         account_issue(t, pci, w);
985                         log_issue(pdi, pci, t, "D");
986                         break;
987                 case __BLK_TA_COMPLETE:
988                         account_c(t, pci, w, t->bytes);
989                         log_complete(pdi, pci, t, "C");
990                         break;
991                 case __BLK_TA_PLUG:
992                         log_action(pci, t, "P");
993                         break;
994                 case __BLK_TA_UNPLUG_IO:
995                         account_unplug(t, pci, 0);
996                         log_unplug(pci, t, "U");
997                         break;
998                 case __BLK_TA_UNPLUG_TIMER:
999                         account_unplug(t, pci, 1);
1000                         log_unplug(pci, t, "UT");
1001                         break;
1002                 case __BLK_TA_SPLIT:
1003                         log_split(pci, t, "X");
1004                         break;
1005                 case __BLK_TA_BOUNCE:
1006                         log_generic(pci, t, "B");
1007                         break;
1008                 default:
1009                         fprintf(stderr, "Bad fs action %x\n", t->action);
1010                         break;
1011         }
1012 }
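/*
 * For reference, the action strings handed to process_fmt() above map to
 * block layer events as follows (taken from the two switch statements):
 *
 *   Q  queued            I  inserted into the queue
 *   M  back merge        F  front merge
 *   G  get request       S  sleeping on request allocation
 *   R  requeue           D  issued to the driver
 *   C  completed         P  plug
 *   U  unplug (io)       UT unplug (timer)
 *   X  split             B  bounce
 */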
1013
1014 static void dump_trace(struct blk_io_trace *t, struct per_cpu_info *pci,
1015                        struct per_dev_info *pdi)
1016 {
1017         if (t->action & BLK_TC_ACT(BLK_TC_PC))
1018                 dump_trace_pc(t, pci);
1019         else
1020                 dump_trace_fs(t, pdi, pci);
1021
1022         pdi->events++;
1023 }
1024
1025 static void dump_io_stats(struct io_stats *ios, char *msg)
1026 {
1027         fprintf(ofp, "%s\n", msg);
1028
1029         fprintf(ofp, " Reads Queued:    %'8lu, %'8LuKiB\t", ios->qreads, ios->qread_kb);
1030         fprintf(ofp, " Writes Queued:    %'8lu, %'8LuKiB\n", ios->qwrites,ios->qwrite_kb);
1031
1032         fprintf(ofp, " Read Dispatches: %'8lu, %'8LuKiB\t", ios->ireads, ios->iread_kb);
1033         fprintf(ofp, " Write Dispatches: %'8lu, %'8LuKiB\n", ios->iwrites,ios->iwrite_kb);
1034         fprintf(ofp, " Reads Completed: %'8lu, %'8LuKiB\t", ios->creads, ios->cread_kb);
1035         fprintf(ofp, " Writes Completed: %'8lu, %'8LuKiB\n", ios->cwrites,ios->cwrite_kb);
1036         fprintf(ofp, " Read Merges:     %'8lu%8c\t", ios->mreads, ' ');
1037         fprintf(ofp, " Write Merges:     %'8lu\n", ios->mwrites);
1038         fprintf(ofp, " IO unplugs:      %'8lu%8c\t", ios->io_unplugs, ' ');
1039         fprintf(ofp, " Timer unplugs:    %'8lu\n", ios->timer_unplugs);
1040 }
1041
1042 static void dump_wait_stats(struct per_process_info *ppi)
1043 {
1044         unsigned long rawait = ppi->longest_allocation_wait[0] / 1000;
1045         unsigned long rdwait = ppi->longest_dispatch_wait[0] / 1000;
1046         unsigned long rcwait = ppi->longest_completion_wait[0] / 1000;
1047         unsigned long wawait = ppi->longest_allocation_wait[1] / 1000;
1048         unsigned long wdwait = ppi->longest_dispatch_wait[1] / 1000;
1049         unsigned long wcwait = ppi->longest_completion_wait[1] / 1000;
1050
1051         fprintf(ofp, " Allocation wait: %'8lu%8c\t", rawait, ' ');
1052         fprintf(ofp, " Allocation wait:  %'8lu\n", wawait);
1053         fprintf(ofp, " Dispatch wait:   %'8lu%8c\t", rdwait, ' ');
1054         fprintf(ofp, " Dispatch wait:    %'8lu\n", wdwait);
1055         fprintf(ofp, " Completion wait: %'8lu%8c\t", rcwait, ' ');
1056         fprintf(ofp, " Completion wait:  %'8lu\n", wcwait);
1057 }
1058
1059 static int ppi_name_compare(const void *p1, const void *p2)
1060 {
1061         struct per_process_info *ppi1 = *((struct per_process_info **) p1);
1062         struct per_process_info *ppi2 = *((struct per_process_info **) p2);
1063         int res;
1064
1065         res = strverscmp(ppi1->name, ppi2->name);
1066         if (!res)
1067                 res = ppi1->pid > ppi2->pid;
1068
1069         return res;
1070 }
1071
1072 static void sort_process_list(void)
1073 {
1074         struct per_process_info **ppis;
1075         struct per_process_info *ppi;
1076         int i = 0;
1077
1078         ppis = malloc(ppi_list_entries * sizeof(struct per_process_info *));
1079
1080         ppi = ppi_list;
1081         while (ppi) {
1082                 ppis[i++] = ppi;
1083                 ppi = ppi->list_next;
1084         }
1085
1086         qsort(ppis, ppi_list_entries, sizeof(ppi), ppi_name_compare);
1087
1088         i = ppi_list_entries - 1;
1089         ppi_list = NULL;
1090         while (i >= 0) {
1091                 ppi = ppis[i];
1092
1093                 ppi->list_next = ppi_list;
1094                 ppi_list = ppi;
1095                 i--;
1096         }
1097
1098         free(ppis);
1099 }
1100
1101 static void show_process_stats(void)
1102 {
1103         struct per_process_info *ppi;
1104
1105         sort_process_list();
1106
1107         ppi = ppi_list;
1108         while (ppi) {
1109                 char name[64];
1110
1111                 if (ppi->more_than_one)
1112                         sprintf(name, "%s (%u, ...)", ppi->name, ppi->pid);
1113                 else
1114                         sprintf(name, "%s (%u)", ppi->name, ppi->pid);
1115
1116                 dump_io_stats(&ppi->io_stats, name);
1117                 dump_wait_stats(ppi);
1118                 ppi = ppi->list_next;
1119         }
1120
1121         fprintf(ofp, "\n");
1122 }
1123
1124 static void show_device_and_cpu_stats(void)
1125 {
1126         struct per_dev_info *pdi;
1127         struct per_cpu_info *pci;
1128         struct io_stats total, *ios;
1129         int i, j, pci_events;
1130         char line[3 + 8/*cpu*/ + 2 + 32/*dev*/ + 3];
1131         char name[32];
1132
1133         for (pdi = devices, i = 0; i < ndevices; i++, pdi++) {
1134
1135                 memset(&total, 0, sizeof(total));
1136                 pci_events = 0;
1137
1138                 if (i > 0)
1139                         fprintf(ofp, "\n");
1140
1141                 for (pci = pdi->cpus, j = 0; j < pdi->ncpus; j++, pci++) {
1142                         if (!pci->nelems)
1143                                 continue;
1144
1145                         ios = &pci->io_stats;
1146                         total.qreads += ios->qreads;
1147                         total.qwrites += ios->qwrites;
1148                         total.creads += ios->creads;
1149                         total.cwrites += ios->cwrites;
1150                         total.mreads += ios->mreads;
1151                         total.mwrites += ios->mwrites;
1152                         total.ireads += ios->ireads;
1153                         total.iwrites += ios->iwrites;
1154                         total.qread_kb += ios->qread_kb;
1155                         total.qwrite_kb += ios->qwrite_kb;
1156                         total.cread_kb += ios->cread_kb;
1157                         total.cwrite_kb += ios->cwrite_kb;
1158                         total.iread_kb += ios->iread_kb;
1159                         total.iwrite_kb += ios->iwrite_kb;
1160                         total.timer_unplugs += ios->timer_unplugs;
1161                         total.io_unplugs += ios->io_unplugs;
1162
1163                         snprintf(line, sizeof(line) - 1, "CPU%d (%s):",
1164                                  j, get_dev_name(pdi, name, sizeof(name)));
1165                         dump_io_stats(ios, line);
1166                         pci_events++;
1167                 }
1168
1169                 if (pci_events > 1) {
1170                         fprintf(ofp, "\n");
1171                         snprintf(line, sizeof(line) - 1, "Total (%s):",
1172                                  get_dev_name(pdi, name, sizeof(name)));
1173                         dump_io_stats(&total, line);
1174                 }
1175
1176                 fprintf(ofp, "\nEvents (%s): %'Lu entries, %'lu skips\n",
1177                         get_dev_name(pdi, line, sizeof(line)), pdi->events,
1178                         pdi->skips);
1179         }
1180 }
1181
1182 /*
1183  * struct trace and blktrace allocation cache, we do potentially
1184  * millions of mallocs for these structures while only using at most
1185  * a few thousand at a time
1186  */
1187 static inline void t_free(struct trace *t)
1188 {
1189         if (t_alloc_cache < 1024) {
1190                 t->next = t_alloc_list;
1191                 t_alloc_list = t;
1192                 t_alloc_cache++;
1193         } else
1194                 free(t);
1195 }
1196
1197 static inline struct trace *t_alloc(void)
1198 {
1199         struct trace *t = t_alloc_list;
1200
1201         if (t) {
1202                 t_alloc_list = t->next;
1203                 t_alloc_cache--;
1204                 return t;
1205         }
1206
1207         return malloc(sizeof(*t));
1208 }
1209
1210 static inline void bit_free(struct blk_io_trace *bit)
1211 {
1212         if (bit_alloc_cache < 1024) {
1213                 /*
1214                  * abuse a 64-bit field for a next pointer for the free item
1215                  */
1216                 bit->time = (__u64) (unsigned long) bit_alloc_list;
1217                 bit_alloc_list = (struct blk_io_trace *) bit;
1218                 bit_alloc_cache++;
1219         } else
1220                 free(bit);
1221 }
1222
1223 static inline struct blk_io_trace *bit_alloc(void)
1224 {
1225         struct blk_io_trace *bit = bit_alloc_list;
1226
1227         if (bit) {
1228                 bit_alloc_list = (struct blk_io_trace *) (unsigned long) \
1229                                  bit->time;
1230                 bit_alloc_cache--;
1231                 return bit;
1232         }
1233
1234         return malloc(sizeof(*bit));
1235 }
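/*
 * The same trick boiled down to a self-contained sketch (illustrative
 * only, using a stand-in struct, not part of blkparse itself): the first
 * 64-bit field of a cached object doubles as the free list link, so the
 * cache costs no extra memory per object. The intermediate cast through
 * unsigned long keeps the integer/pointer conversion warning-free on both
 * 32- and 64-bit builds.
 */
#include <stdlib.h>

struct obj {
        unsigned long long time;  /* reused as the "next" link while the object sits on the free list */
};

static struct obj *obj_free_list;

static void obj_free(struct obj *o)
{
        /* stash the current list head in the 64-bit field, as bit_free() does */
        o->time = (unsigned long long) (unsigned long) obj_free_list;
        obj_free_list = o;
}

static struct obj *obj_alloc(void)
{
        struct obj *o = obj_free_list;

        if (o) {
                obj_free_list = (struct obj *) (unsigned long) o->time;
                return o;
        }

        return malloc(sizeof(*o));
}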
1236
1237 static void find_genesis(void)
1238 {
1239         struct trace *t = trace_list;
1240
1241         genesis_time = -1ULL;
1242         while (t != NULL) {
1243                 if (t->bit->time < genesis_time)
1244                         genesis_time = t->bit->time;
1245
1246                 t = t->next;
1247         }
1248 }
1249
1250 static inline int check_stopwatch(struct blk_io_trace *bit)
1251 {
1252         if (bit->time < stopwatch_end &&
1253             bit->time >= stopwatch_start)
1254                 return 0;
1255
1256         return 1;
1257 }
1258
1259 /*
1260  * return youngest entry read
1261  */
1262 static int sort_entries(unsigned long long *youngest)
1263 {
1264         struct trace *t;
1265
1266         if (!genesis_time)
1267                 find_genesis();
1268
1269         *youngest = 0;
1270         while ((t = trace_list) != NULL) {
1271                 struct blk_io_trace *bit = t->bit;
1272
1273                 trace_list = t->next;
1274
1275                 bit->time -= genesis_time;
1276
1277                 if (bit->time < *youngest || !*youngest)
1278                         *youngest = bit->time;
1279
1280                 if (check_stopwatch(bit)) {
1281                         bit_free(bit);
1282                         t_free(t);
1283                         continue;
1284                 }
1285
1286                 if (trace_rb_insert_sort(t))
1287                         return -1;
1288         }
1289
1290         return 0;
1291 }
1292
1293 static inline void __put_trace_last(struct per_dev_info *pdi, struct trace *t)
1294 {
1295         rb_erase(&t->rb_node, &pdi->rb_last);
1296         pdi->rb_last_entries--;
1297
1298         bit_free(t->bit);
1299         t_free(t);
1300 }
1301
1302 static void put_trace(struct per_dev_info *pdi, struct trace *t)
1303 {
1304         rb_erase(&t->rb_node, &rb_sort_root);
1305         rb_sort_entries--;
1306
1307         trace_rb_insert_last(pdi, t);
1308
1309         if (pdi->rb_last_entries > rb_batch * pdi->nfiles) {
1310                 struct rb_node *n = rb_first(&pdi->rb_last);
1311
1312                 t = rb_entry(n, struct trace, rb_node);
1313                 __put_trace_last(pdi, t);
1314         }
1315 }
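/*
 * In other words: once an event has been printed it is moved from the
 * sort tree into the per-device rb_last tree rather than freed right
 * away. That tree is what check_sequence() below consults to recognize
 * sequence numbers that have already been displayed, and it is capped at
 * rb_batch * nfiles entries - when the cap is exceeded, the oldest entry
 * is recycled through __put_trace_last().
 */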
1316
1317 static int check_sequence(struct per_dev_info *pdi, struct trace *t, int force)
1318 {
1319         unsigned long expected_sequence = pdi->last_sequence + 1;
1320         struct blk_io_trace *bit = t->bit;
1321         struct trace *__t;
1322         
1323         /*
1324          * first entry, always ok
1325          */
1326         if (!expected_sequence)
1327                 return 0;
1328
1329         if (bit->sequence == expected_sequence)
1330                 return 0;
1331
1332         if (bit->sequence < expected_sequence &&
1333             bit->time > pdi->last_reported_time)
1334                 return 0;
1335
1336         /*
1337          * the wanted sequence is still in the sort tree, so it's ok to
1338          * continue - it just means the trace we have now carries an
1339          * earlier time stamp
1340          */
1341         __t = trace_rb_find_sort(pdi->dev, expected_sequence);
1342         if (__t)
1343                 return 0;
1344
1345         /*
1346          * we already displayed this entry, it's ok to continue.
1347          * kill old entry now
1348          */
1349         __t = trace_rb_find_last(pdi, expected_sequence);
1350         if (__t) {
1351                 __put_trace_last(pdi, __t);
1352                 return 0;
1353         }
1354
1355         if (last_skipped == expected_sequence)
1356                 t->skipped++;
1357
1358         last_skipped = expected_sequence;
1359
1360         /*
1361          * unless this is the last run, break and wait for more entries
1362          */
1363         if (!force && t->skipped <= skip_runs)
1364                 return 1;
1365
1366         fprintf(stderr, "(%d,%d): skipping %lu -> %u\n", MAJOR(pdi->dev),
1367                         MINOR(pdi->dev), pdi->last_sequence, bit->sequence);
1368         pdi->skips++;
1369         return 0;
1370 }
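/*
 * A concrete reading of the above, with hypothetical sequence numbers:
 * if device (8,0) last displayed sequence 6 and the next entry available
 * in the sort tree carries sequence 9, the expected sequence is 7.
 * blkparse first looks for 7 in the sort tree and then in the
 * already-displayed rb_last tree; if it is in neither, the gap is
 * tolerated for a few passes in the hope that more input fills it in
 * (skip_runs is 4 when reading from files and effectively unlimited when
 * reading from stdin), and only then is "skipping 6 -> 9" printed and the
 * skip counted against the device.
 */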
1371
1372 static void show_entries_rb(int force)
1373 {
1374         struct per_dev_info *pdi = NULL;
1375         struct per_cpu_info *pci = NULL;
1376         struct blk_io_trace *bit;
1377         struct rb_node *n;
1378         struct trace *t;
1379
1380         if (force) {
1381                 n = rb_first(&rb_sort_root);
1382                 t = rb_entry(n, struct trace, rb_node);
1383                 fprintf(stderr, "first force %u\n", t->bit->sequence);
1384         }
1385
1386         while ((n = rb_first(&rb_sort_root)) != NULL) {
1387                 if (done)
1388                         break;
1389
1390                 t = rb_entry(n, struct trace, rb_node);
1391                 bit = t->bit;
1392
1393                 if (!pdi || pdi->dev != bit->device)
1394                         pdi = get_dev_info(bit->device);
1395
1396                 if (!pdi) {
1397                         fprintf(stderr, "Unknown device ID? (%d,%d)\n",
1398                                 MAJOR(bit->device), MINOR(bit->device));
1399                         break;
1400                 }
1401
1402                 if (check_sequence(pdi, t, force))
1403                         break;
1404
1405                 if (!force && bit->time > last_allowed_time)
1406                         break;
1407
1408                 pdi->last_sequence = bit->sequence;
1409
1410                 check_time(pdi, bit);
1411
1412                 if (!pci || pci->cpu != bit->cpu)
1413                         pci = get_cpu_info(pdi, bit->cpu);
1414
1415                 dump_trace(bit, pci, pdi);
1416
1417                 put_trace(pdi, t);
1418         }
1419 }
1420
1421 static int read_data(int fd, void *buffer, int bytes, int block)
1422 {
1423         int ret, bytes_left, fl;
1424         void *p;
1425
1426         fl = fcntl(fd, F_GETFL);
1427
1428         if (!block)
1429                 fcntl(fd, F_SETFL, fl | O_NONBLOCK);
1430         else
1431                 fcntl(fd, F_SETFL, fl & ~O_NONBLOCK);
1432
1433         bytes_left = bytes;
1434         p = buffer;
1435         while (bytes_left > 0) {
1436                 ret = read(fd, p, bytes_left);
1437                 if (!ret)
1438                         return 1;
1439                 else if (ret < 0) {
1440                         if (errno != EAGAIN)
1441                                 perror("read");
1442
1443                         return -1;
1444                 } else {
1445                         p += ret;
1446                         bytes_left -= ret;
1447                 }
1448         }
1449
1450         return 0;
1451 }
1452
1453 static int read_events(int fd, int always_block)
1454 {
1455         struct per_dev_info *pdi = NULL;
1456         unsigned int events = 0;
1457
1458         while (!is_done() && events < rb_batch) {
1459                 struct blk_io_trace *bit;
1460                 struct trace *t;
1461                 int pdu_len;
1462                 __u32 magic;
1463
1464                 bit = bit_alloc();
1465
1466                 if (read_data(fd, bit, sizeof(*bit), !events || always_block))
1467                         break;
1468
1469                 magic = be32_to_cpu(bit->magic);
1470                 if ((magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
1471                         fprintf(stderr, "Bad magic %x\n", magic);
1472                         break;
1473                 }
1474
1475                 pdu_len = be16_to_cpu(bit->pdu_len);
1476                 if (pdu_len) {
1477                         void *ptr = realloc(bit, sizeof(*bit) + pdu_len);
1478
1479                         if (read_data(fd, ptr + sizeof(*bit), pdu_len, 1))
1480                                 break;
1481
1482                         bit = ptr;
1483                 }
1484
1485                 trace_to_cpu(bit);
1486
1487                 if (verify_trace(bit)) {
1488                         bit_free(bit);
1489                         continue;
1490                 }
1491
1492                 t = t_alloc();
1493                 memset(t, 0, sizeof(*t));
1494                 t->bit = bit;
1495
1496                 t->next = trace_list;
1497                 trace_list = t;
1498
1499                 if (!pdi || pdi->dev != bit->device)
1500                         pdi = get_dev_info(bit->device);
1501
1502                 if (bit->time > pdi->last_read_time)
1503                         pdi->last_read_time = bit->time;
1504
1505                 events++;
1506         }
1507
1508         return events;
1509 }
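/*
 * Note on blocking behaviour: read_events() asks read_data() to block
 * only for the first event of a batch (or always, when reading from
 * regular files); later reads are made non-blocking by toggling
 * O_NONBLOCK, so a pipe that momentarily runs dry simply ends the current
 * batch instead of stalling the parser. A zero-length read (end of input)
 * or a bad magic number also terminates the batch.
 */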
1510
1511 static int do_file(void)
1512 {
1513         struct per_cpu_info *pci;
1514         struct per_dev_info *pdi;
1515         int i, j, events, events_added;
1516
1517         skip_runs = 4;
1518
1519         /*
1520          * first prepare all files for reading
1521          */
1522         for (i = 0; i < ndevices; i++) {
1523                 pdi = &devices[i];
1524                 pdi->nfiles = 0;
1525                 pdi->last_sequence = -1;
1526
1527                 for (j = 0;; j++) {
1528                         struct stat st;
1529
1530                         pci = get_cpu_info(pdi, j);
1531                         pci->cpu = j;
1532                         pci->fd = -1;
1533
1534                         snprintf(pci->fname, sizeof(pci->fname)-1,
1535                                  "%s.blktrace.%d", pdi->name, pci->cpu);
1536                         if (stat(pci->fname, &st) < 0)
1537                                 break;
1538                         if (st.st_size) {
1539                                 pci->fd = open(pci->fname, O_RDONLY);
1540                                 if (pci->fd < 0) {
1541                                         perror(pci->fname);
1542                                         continue;
1543                                 }
1544                         }
1545
1546                         printf("Input file %s added\n", pci->fname);
1547                         pdi->nfiles++;
1548                 }
1549         }
1550
1551         /*
1552          * now loop over the files reading in the data
1553          */
1554         do {
1555                 unsigned long long youngest;
1556
1557                 events_added = 0;
1558                 last_allowed_time = -1ULL;
1559
1560                 for (i = 0; i < ndevices; i++) {
1561                         pdi = &devices[i];
1562
1563                         for (j = 0; j < pdi->nfiles; j++) {
1564
1565                                 pci = get_cpu_info(pdi, j);
1566
1567                                 if (pci->fd == -1)
1568                                         continue;
1569
1570                                 events = read_events(pci->fd, 1);
1571                                 if (!events) {
1572                                         close(pci->fd);
1573                                         pci->fd = -1;
1574                                         continue;
1575                                 }
1576
1577                                 if (pdi->last_read_time < last_allowed_time)
1578                                         last_allowed_time = pdi->last_read_time;
1579
1580                                 events_added += events;
1581                         }
1582                 }
1583
1584                 if (sort_entries(&youngest))
1585                         break;
1586
1587                 if (youngest > stopwatch_end)
1588                         break;
1589
1590                 show_entries_rb(0);
1591
1592         } while (events_added);
1593
1594         if (rb_sort_entries)
1595                 show_entries_rb(1);
1596
1597         return 0;
1598 }
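/*
 * Per-CPU input handling in a nutshell: for an input name like sdb (a
 * hypothetical device), do_file() opens sdb.blktrace.0, sdb.blktrace.1,
 * ... until a per-CPU file is missing, then round-robins over all of
 * them, pulling at most rb_batch events (512 by default, tunable with -b)
 * from each per pass. last_allowed_time tracks the smallest last_read_time
 * seen across the devices, and show_entries_rb() does not print past it,
 * so events from CPUs that are read at different rates still come out in
 * time order.
 */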
1599
1600 static int do_stdin(void)
1601 {
1602         unsigned long long youngest;
1603         int fd;
1604
1605         skip_runs = -1U;
1606         last_allowed_time = -1ULL;
1607         fd = dup(STDIN_FILENO);
1608         do {
1609                 int events;
1610
1611                 events = read_events(fd, 0);
1612                 if (!events)
1613                         break;
1614         
1615                 if (sort_entries(&youngest))
1616                         break;
1617
1618                 if (youngest > stopwatch_end)
1619                         break;
1620
1621                 show_entries_rb(0);
1622         } while (1);
1623
1624         if (rb_sort_entries)
1625                 show_entries_rb(1);
1626
1627         close(fd);
1628         return 0;
1629 }
1630
1631 static void flush_output(void)
1632 {
1633         fflush(ofp);
1634 }
1635
1636 static void handle_sigint(__attribute__((__unused__)) int sig)
1637 {
1638         done = 1;
1639         flush_output();
1640 }
1641
1642 /*
1643  * Extract start and duration times from a string, allowing
1644  * Extract start and end times from a string, allowing us to specify
1645  * a time interval of interest within a trace.
1646  * Format: "end" (start is zero) or "start:end", both in seconds.
1647 static int find_stopwatch_interval(char *string)
1648 {
1649         double value;
1650         char *sp;
1651
1652         value = strtod(string, &sp);
1653         if (sp == string) {
1654                 fprintf(stderr,"Invalid stopwatch timer: %s\n", string);
1655                 return 1;
1656         }
1657         if (*sp == ':') {
1658                 stopwatch_start = DOUBLE_TO_NANO_ULL(value);
1659                 string = sp + 1;
1660                 value = strtod(string, &sp);
1661                 if (sp == string || *sp != '\0') {
1662                         fprintf(stderr,"Invalid stopwatch duration time: %s\n",
1663                                 string);
1664                         return 1;
1665                 }
1666         } else if (*sp != '\0') {
1667                 fprintf(stderr,"Invalid stopwatch start timer: %s\n", string);
1668                 return 1;
1669         }
1670         stopwatch_end = DOUBLE_TO_NANO_ULL(value);
1671         if (stopwatch_end <= stopwatch_start) {
1672                 fprintf(stderr, "Invalid stopwatch interval: %Lu -> %Lu\n",
1673                         stopwatch_start, stopwatch_end);
1674                 return 1;
1675         }
1676
1677         return 0;
1678 }
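/*
 * Two examples of how a -w argument is interpreted by the function above
 * (values are hypothetical; seconds are converted to nanoseconds through
 * DOUBLE_TO_NANO_ULL):
 *
 *   -w 10       ->  stopwatch_start = 0s,   stopwatch_end = 10s
 *   -w 2.5:10   ->  stopwatch_start = 2.5s, stopwatch_end = 10s
 *
 * Events whose genesis-relative time stamps fall outside [start, end) are
 * then dropped by check_stopwatch() while sorting.
 */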
1679
1680 static char usage_str[] = \
1681         "[ -i <input name> ] [ -o <output name> ] [ -b batch ] [ -s ] [ -t ]\n" \
1682         "[ -q ] [ -n ] [ -w start:stop ] [ -f output format ] [ -F format spec ] [ -v ]\n\n" \
1683         "\t-i Input file containing trace data, or '-' for stdin\n" \
1684         "\t-o Output file. If not given, output is stdout\n" \
1685         "\t-b stdin read batching\n" \
1686         "\t-s Show per-program io statistics\n" \
1687         "\t-n Hash processes by name, not pid\n" \
1688         "\t-t Track individual ios. Will tell you the time a request took\n" \
1689         "\t   to get queued, to get dispatched, and to get completed\n" \
1690         "\t-q Quiet. Don't display any stats at the end of the trace\n" \
1691         "\t-w Only parse data within the given time interval, in seconds.\n" \
1692         "\t   If 'start' isn't given, blkparse defaults the start time to 0\n" \
1693         "\t-f Output format. Customize the output format. The format field\n" \
1694         "\t   identifiers can be found in the documentation\n" \
1695         "\t-F Format specification. Can be found in the documentation\n" \
1696         "\t-v Print program version info\n\n";
1697
1698 static void usage(char *prog)
1699 {
1700         fprintf(stderr, "Usage: %s %s %s", prog, blkparse_version, usage_str);
1701 }
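/*
 * A typical invocation against files produced by blktrace might look like
 * this (device and file names are only examples):
 *
 *   blkparse -i sdb -o sdb.parsed -s -t -w 30
 *
 * which reads sdb.blktrace.0, sdb.blktrace.1, ..., writes the formatted
 * events to sdb.parsed, prints per-process statistics at the end, tracks
 * per-request allocation/dispatch/completion waits and only considers the
 * first 30 seconds of the trace.
 */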
1702
1703 int main(int argc, char *argv[])
1704 {
1705         char *ofp_buffer;
1706         int c, ret, mode;
1707         int per_device_and_cpu_stats = 1;
1708
1709         while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) {
1710                 switch (c) {
1711                 case 'i':
1712                         if (!strcmp(optarg, "-") && !pipeline)
1713                                 pipeline = 1;
1714                         else if (resize_devices(optarg) != 0)
1715                                 return 1;
1716                         break;
1717                 case 'o':
1718                         output_name = optarg;
1719                         break;
1720                 case 'b':
1721                         rb_batch = atoi(optarg);
1722                         if (rb_batch <= 0)
1723                                 rb_batch = RB_BATCH_DEFAULT;
1724                         break;
1725                 case 's':
1726                         per_process_stats = 1;
1727                         break;
1728                 case 't':
1729                         track_ios = 1;
1730                         break;
1731                 case 'q':
1732                         per_device_and_cpu_stats = 0;
1733                         break;
1734                 case 'w':
1735                         if (find_stopwatch_interval(optarg) != 0)
1736                                 return 1;
1737                         break;
1738                 case 'f':
1739                         set_all_format_specs(optarg);
1740                         break;
1741                 case 'F':
1742                         if (add_format_spec(optarg) != 0)
1743                                 return 1;
1744                         break;
1745                 case 'n':
1746                         ppi_hash_by_pid = 0;
1747                         break;
1748                 case 'v':
1749                         printf("%s version %s\n", argv[0], blkparse_version);
1750                         return 0;
1751                 default:
1752                         usage(argv[0]);
1753                         return 1;
1754                 }
1755         }
1756
1757         while (optind < argc) {
1758                 if (!strcmp(argv[optind], "-") && !pipeline)
1759                         pipeline = 1;
1760                 else if (resize_devices(argv[optind]) != 0)
1761                         return 1;
1762                 optind++;
1763         }
1764
1765         if (!pipeline && !ndevices) {
1766                 usage(argv[0]);
1767                 return 1;
1768         }
1769
1770         memset(&rb_sort_root, 0, sizeof(rb_sort_root));
1771
1772         signal(SIGINT, handle_sigint);
1773         signal(SIGHUP, handle_sigint);
1774         signal(SIGTERM, handle_sigint);
1775
1776         setlocale(LC_NUMERIC, "en_US");
1777
1778         if (!output_name) {
1779                 ofp = fdopen(STDOUT_FILENO, "w");
1780                 mode = _IOLBF;
1781         } else {
1782                 char ofname[128];
1783
1784                 snprintf(ofname, sizeof(ofname) - 1, "%s", output_name);
1785                 ofp = fopen(ofname, "w");
1786                 mode = _IOFBF;
1787         }
1788
1789         if (!ofp) {
1790                 perror("fopen");
1791                 return 1;
1792         }
1793
1794         ofp_buffer = malloc(4096);      
1795         if (setvbuf(ofp, ofp_buffer, mode, 4096)) {
1796                 perror("setvbuf");
1797                 return 1;
1798         }
1799
1800         if (pipeline)
1801                 ret = do_stdin();
1802         else
1803                 ret = do_file();
1804
1805         if (per_process_stats)
1806                 show_process_stats();
1807
1808         if (per_device_and_cpu_stats)
1809                 show_device_and_cpu_stats();
1810
1811         flush_output();
1812         return ret;
1813 }