Patch context: "stopwatch usage may misorder first event" — from blktrace.git, file blkparse.c.
1 /*
2  * block queue tracing parse application
3  *
4  * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
5  *
6  *  This program is free software; you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License as published by
8  *  the Free Software Foundation; either version 2 of the License, or
9  *  (at your option) any later version.
10  *
11  *  This program is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *  GNU General Public License for more details.
15  *
16  *  You should have received a copy of the GNU General Public License
17  *  along with this program; if not, write to the Free Software
18  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  *
20  */
21 #include <sys/types.h>
22 #include <sys/stat.h>
23 #include <unistd.h>
24 #include <stdio.h>
25 #include <fcntl.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <getopt.h>
29 #include <errno.h>
30 #include <signal.h>
31 #include <locale.h>
32 #include <limits.h>
33
34 #include "blktrace.h"
35 #include "rbtree.h"
36 #include "jhash.h"
37
/* Version string reported by -V/--version. */
static char blkparse_version[] = "0.90";
39
/*
 * Per-device state: identity, accounting, and the rbtrees used for
 * sorting and per-io tracking on this device.
 */
struct per_dev_info {
	dev_t dev;			/* device number */
	char *name;			/* user-visible name, may be NULL */

	int backwards;			/* 'B' when last event went back in time */
	unsigned long long events;
	unsigned long long last_reported_time;	/* time of last displayed event */
	unsigned long long last_read_time;
	struct io_stats io_stats;	/* aggregate counters for this device */
	unsigned long last_sequence;	/* initialized to -1 in get_dev_info() */
	unsigned long skips;

	struct rb_root rb_last;		/* recently handled traces */
	unsigned long rb_last_entries;	/* population of rb_last */

	struct rb_root rb_track;	/* live ios, keyed by start sector */

	int nfiles;
	int nopenfiles;
	int ncpus;			/* size of the cpus array below */
	struct per_cpu_info *cpus;	/* grown on demand, see resize_cpu_info() */
};
62
/*
 * Per-process accounting, used for -s/--per-program-stats.
 */
struct per_process_info {
	char name[16];			/* command name (comm) */
	__u32 pid;
	struct io_stats io_stats;
	struct per_process_info *hash_next, *list_next;
	int more_than_one;		/* name maps to >1 pid (name-hash mode) */

	/*
	 * individual io stats; index 0 = read, 1 = write
	 */
	unsigned long long longest_allocation_wait[2];
	unsigned long long longest_dispatch_wait[2];
	unsigned long long longest_completion_wait[2];
};
77
/*
 * Processes live in a small chained hash table, keyed either by pid or
 * by command name depending on ppi_hash_by_pid, plus a flat list used
 * for final reporting.
 */
#define PPI_HASH_SHIFT	(8)
#define PPI_HASH_SIZE	(1 << PPI_HASH_SHIFT)
#define PPI_HASH_MASK	(PPI_HASH_SIZE - 1)
static struct per_process_info *ppi_hash_table[PPI_HASH_SIZE];
static struct per_process_info *ppi_list;	/* all processes, newest first */
static int ppi_list_entries;
84
/*
 * Short options; must be kept in sync with the long-option table below
 * (each long option maps to the matching short option via .val).
 */
#define S_OPTS  "a:A:i:o:b:stqw:f:F:vVhD:"
static struct option l_opts[] = {
	{
		.name = "act-mask",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'a'
	},
	{
		.name = "set-mask",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'A'
	},
	{
		.name = "input",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'i'
	},
	{
		.name = "output",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'o'
	},
	{
		.name = "batch",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'b'
	},
	{
		.name = "per-program-stats",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 's'
	},
	{
		.name = "track-ios",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 't'
	},
	{
		.name = "quiet",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 'q'
	},
	{
		.name = "stopwatch",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'w'
	},
	{
		.name = "format",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'f'
	},
	{
		.name = "format-spec",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'F'
	},
	{
		.name = "hash-by-name",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 'h'
	},
	{
		.name = "verbose",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 'v'
	},
	{
		.name = "version",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 'V'
	},
	{
		.name = "input-directory",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'D'
	},
	{
		.name = NULL,		/* terminator for getopt_long() */
	}
};
181
/*
 * for sorting the displayed output
 */
struct trace {
	struct blk_io_trace *bit;	/* the raw trace record */
	struct rb_node rb_node;		/* linkage into a sort/last tree */
	struct trace *next;		/* linkage for trace_list / free lists */
};

static struct rb_root rb_sort_root;	/* global time-sorted tree */
static unsigned long rb_sort_entries;

static struct trace *trace_list;

/*
 * allocation cache
 */
static struct blk_io_trace *bit_alloc_list;
static struct trace *t_alloc_list;

/*
 * for tracking individual ios
 */
struct io_track {
	struct rb_node rb_node;		/* linkage into pdi->rb_track */

	__u64 sector;			/* start sector, the tree key */
	__u32 pid;
	char comm[16];
	/* lifecycle timestamps; 0 means "stage not seen yet" */
	unsigned long long allocation_time;
	unsigned long long queue_time;
	unsigned long long dispatch_time;
	unsigned long long completion_time;
};

static int ndevices;
static struct per_dev_info *devices;
static char *get_dev_name(struct per_dev_info *, char *, int);

FILE *ofp = NULL;			/* output stream, set up from -o */
static char *output_name;
static char *input_dir;

static unsigned long long genesis_time;
static unsigned long long last_allowed_time;
static unsigned int smallest_seq_read;
static unsigned long long stopwatch_start;	/* start from zero by default */
/* NOTE(review): ULONG_LONG_MAX is a glibc extension from <limits.h> */
static unsigned long long stopwatch_end = ULONG_LONG_MAX;	/* "infinity" */

static int per_process_stats;
static int per_device_and_cpu_stats = 1;
static int track_ios;
static int ppi_hash_by_pid = 1;		/* cleared by --hash-by-name */
static int verbose;
static unsigned int act_mask = -1U;	/* default: all action types pass */
static int stats_printed;

static unsigned int t_alloc_cache;
static unsigned int bit_alloc_cache;

#define RB_BATCH_DEFAULT	(512)
static unsigned int rb_batch = RB_BATCH_DEFAULT;

static int pipeline;

/* force a fresh read of the signal-set flag */
#define is_done()	(*(volatile int *)(&done))
static volatile int done;

#define JHASH_RANDOM	(0x3af5f2ee)	/* fixed seed for the jhash calls */
251
252 static inline int ppi_hash_pid(__u32 pid)
253 {
254         return jhash_1word(pid, JHASH_RANDOM) & PPI_HASH_MASK;
255 }
256
257 static inline int ppi_hash_name(const char *name)
258 {
259         return jhash(name, 16, JHASH_RANDOM) & PPI_HASH_MASK;
260 }
261
262 static inline int ppi_hash(struct per_process_info *ppi)
263 {
264         if (ppi_hash_by_pid)
265                 return ppi_hash_pid(ppi->pid);
266
267         return ppi_hash_name(ppi->name);
268 }
269
270 static inline void add_process_to_hash(struct per_process_info *ppi)
271 {
272         const int hash_idx = ppi_hash(ppi);
273
274         ppi->hash_next = ppi_hash_table[hash_idx];
275         ppi_hash_table[hash_idx] = ppi;
276 }
277
278 static inline void add_process_to_list(struct per_process_info *ppi)
279 {
280         ppi->list_next = ppi_list;
281         ppi_list = ppi;
282         ppi_list_entries++;
283 }
284
285 static struct per_process_info *find_process_by_name(char *name)
286 {
287         const int hash_idx = ppi_hash_name(name);
288         struct per_process_info *ppi;
289
290         ppi = ppi_hash_table[hash_idx];
291         while (ppi) {
292                 if (!strcmp(ppi->name, name))
293                         return ppi;
294
295                 ppi = ppi->hash_next;
296         }
297
298         return NULL;
299 }
300
301 static struct per_process_info *find_process_by_pid(__u32 pid)
302 {
303         const int hash_idx = ppi_hash_pid(pid);
304         struct per_process_info *ppi;
305
306         ppi = ppi_hash_table[hash_idx];
307         while (ppi) {
308                 if (ppi->pid == pid)
309                         return ppi;
310
311                 ppi = ppi->hash_next;
312         }
313
314         return NULL;
315 }
316
317 static struct per_process_info *find_process(__u32 pid, char *name)
318 {
319         struct per_process_info *ppi;
320
321         if (ppi_hash_by_pid)
322                 return find_process_by_pid(pid);
323
324         ppi = find_process_by_name(name);
325         if (ppi && ppi->pid != pid)
326                 ppi->more_than_one = 1;
327
328         return ppi;
329 }
330
/*
 * Insert 't' into 'root'.  Ordering is by time (only when check_time is
 * set), then device, then sequence; equal keys fall through to the right
 * subtree, so duplicates are tolerated and the insert never fails.
 */
static inline int trace_rb_insert(struct trace *t, struct rb_root *root,
				  int check_time)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct trace *__t;

	while (*p) {
		parent = *p;

		__t = rb_entry(parent, struct trace, rb_node);

		if (check_time) {
			if (t->bit->time < __t->bit->time) {
				p = &(*p)->rb_left;
				continue;
			} else if (t->bit->time > __t->bit->time) {
				p = &(*p)->rb_right;
				continue;
			}
			/* equal times: fall through to secondary keys */
		}
		if (t->bit->device < __t->bit->device)
			p = &(*p)->rb_left;
		else if (t->bit->device > __t->bit->device)
			p = &(*p)->rb_right;
		else if (t->bit->sequence < __t->bit->sequence)
			p = &(*p)->rb_left;
		else	/* >= sequence */
			p = &(*p)->rb_right;
	}

	rb_link_node(&t->rb_node, parent, p);
	rb_insert_color(&t->rb_node, root);
	return 0;
}
366
367 static inline int trace_rb_insert_sort(struct trace *t)
368 {
369         if (!trace_rb_insert(t, &rb_sort_root, 1)) {
370                 rb_sort_entries++;
371                 return 0;
372         }
373
374         return 1;
375 }
376
377 static inline int trace_rb_insert_last(struct per_dev_info *pdi,struct trace *t)
378 {
379         if (!trace_rb_insert(t, &pdi->rb_last, 1)) {
380                 pdi->rb_last_entries++;
381                 return 0;
382         }
383
384         return 1;
385 }
386
/*
 * Find the trace for (device, sequence) in 'root'.  When 'order' is set
 * and the direct descent misses, scan a few in-order successors of the
 * last visited node — see the comment below for why.  Returns NULL when
 * not found.
 */
static struct trace *trace_rb_find(dev_t device, unsigned long sequence,
				   struct rb_root *root, int order)
{
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct trace *__t;

	while (n) {
		__t = rb_entry(n, struct trace, rb_node);
		prev = n;

		if (device < __t->bit->device)
			n = n->rb_left;
		else if (device > __t->bit->device)
			n = n->rb_right;
		else if (sequence < __t->bit->sequence)
			n = n->rb_left;
		else if (sequence > __t->bit->sequence)
			n = n->rb_right;
		else
			return __t;
	}

	/*
	 * hack - the list may not be sequence ordered because some
	 * events don't have sequence and time matched. so we end up
	 * being a little off in the rb lookup here, because we don't
	 * know the time we are looking for. compensate by browsing
	 * a little ahead from the last entry to find the match
	 */
	if (order && prev) {
		int max = 5;	/* bounded lookahead */

		while (((n = rb_next(prev)) != NULL) && max--) {
			__t = rb_entry(n, struct trace, rb_node);

			if (__t->bit->device == device &&
			    __t->bit->sequence == sequence)
				return __t;

			prev = n;
		}
	}

	return NULL;
}
433
434 static inline struct trace *trace_rb_find_sort(dev_t dev, unsigned long seq)
435 {
436         return trace_rb_find(dev, seq, &rb_sort_root, 1);
437 }
438
439 static inline struct trace *trace_rb_find_last(struct per_dev_info *pdi,
440                                                unsigned long seq)
441 {
442         return trace_rb_find(pdi->dev, seq, &pdi->rb_last, 0);
443 }
444
445 static inline int track_rb_insert(struct per_dev_info *pdi,struct io_track *iot)
446 {
447         struct rb_node **p = &pdi->rb_track.rb_node;
448         struct rb_node *parent = NULL;
449         struct io_track *__iot;
450
451         while (*p) {
452                 parent = *p;
453                 __iot = rb_entry(parent, struct io_track, rb_node);
454
455                 if (iot->sector < __iot->sector)
456                         p = &(*p)->rb_left;
457                 else if (iot->sector > __iot->sector)
458                         p = &(*p)->rb_right;
459                 else {
460                         fprintf(stderr,
461                                 "sector alias (%Lu) on device %d,%d!\n",
462                                 (unsigned long long) iot->sector,
463                                 MAJOR(pdi->dev), MINOR(pdi->dev));
464                         return 1;
465                 }
466         }
467
468         rb_link_node(&iot->rb_node, parent, p);
469         rb_insert_color(&iot->rb_node, &pdi->rb_track);
470         return 0;
471 }
472
473 static struct io_track *__find_track(struct per_dev_info *pdi, __u64 sector)
474 {
475         struct rb_node *n = pdi->rb_track.rb_node;
476         struct io_track *__iot;
477
478         while (n) {
479                 __iot = rb_entry(n, struct io_track, rb_node);
480
481                 if (sector < __iot->sector)
482                         n = n->rb_left;
483                 else if (sector > __iot->sector)
484                         n = n->rb_right;
485                 else
486                         return __iot;
487         }
488
489         return NULL;
490 }
491
492 static struct io_track *find_track(struct per_dev_info *pdi, __u32 pid,
493                                    char *comm, __u64 sector)
494 {
495         struct io_track *iot;
496
497         iot = __find_track(pdi, sector);
498         if (!iot) {
499                 iot = malloc(sizeof(*iot));
500                 iot->pid = pid;
501                 memcpy(iot->comm, comm, sizeof(iot->comm));
502                 iot->sector = sector;
503                 track_rb_insert(pdi, iot);
504         }
505
506         return iot;
507 }
508
509 static void log_track_frontmerge(struct per_dev_info *pdi,
510                                  struct blk_io_trace *t)
511 {
512         struct io_track *iot;
513
514         if (!track_ios)
515                 return;
516
517         iot = __find_track(pdi, t->sector + t_sec(t));
518         if (!iot) {
519                 if (verbose)
520                         fprintf(stderr, "merge not found for (%d,%d): %llu\n",
521                                 MAJOR(pdi->dev), MINOR(pdi->dev),
522                                 (unsigned long long) t->sector + t_sec(t));
523                 return;
524         }
525
526         rb_erase(&iot->rb_node, &pdi->rb_track);
527         iot->sector -= t_sec(t);
528         track_rb_insert(pdi, iot);
529 }
530
531 static void log_track_getrq(struct per_dev_info *pdi, struct blk_io_trace *t)
532 {
533         struct io_track *iot;
534
535         if (!track_ios)
536                 return;
537
538         iot = find_track(pdi, t->pid, t->comm, t->sector);
539         iot->allocation_time = t->time;
540 }
541
542 /*
543  * return time between rq allocation and insertion
544  */
545 static unsigned long long log_track_insert(struct per_dev_info *pdi,
546                                            struct blk_io_trace *t)
547 {
548         unsigned long long elapsed;
549         struct io_track *iot;
550
551         if (!track_ios)
552                 return -1;
553
554         iot = find_track(pdi, t->pid, t->comm, t->sector);
555         iot->queue_time = t->time;
556
557         if (!iot->allocation_time)
558                 return -1;
559
560         elapsed = iot->queue_time - iot->allocation_time;
561
562         if (per_process_stats) {
563                 struct per_process_info *ppi = find_process(iot->pid,iot->comm);
564                 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
565
566                 if (ppi && elapsed > ppi->longest_allocation_wait[w])
567                         ppi->longest_allocation_wait[w] = elapsed;
568         }
569
570         return elapsed;
571 }
572
573 /*
574  * return time between queue and issue
575  */
576 static unsigned long long log_track_issue(struct per_dev_info *pdi,
577                                           struct blk_io_trace *t)
578 {
579         unsigned long long elapsed;
580         struct io_track *iot;
581
582         if (!track_ios)
583                 return -1;
584         if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
585                 return -1;
586
587         iot = __find_track(pdi, t->sector);
588         if (!iot) {
589                 if (verbose)
590                         fprintf(stderr, "issue not found for (%d,%d): %llu\n",
591                                 MAJOR(pdi->dev), MINOR(pdi->dev),
592                                 (unsigned long long) t->sector);
593                 return -1;
594         }
595
596         iot->dispatch_time = t->time;
597         elapsed = iot->dispatch_time - iot->queue_time;
598
599         if (per_process_stats) {
600                 struct per_process_info *ppi = find_process(iot->pid,iot->comm);
601                 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
602
603                 if (ppi && elapsed > ppi->longest_dispatch_wait[w])
604                         ppi->longest_dispatch_wait[w] = elapsed;
605         }
606
607         return elapsed;
608 }
609
610 /*
611  * return time between dispatch and complete
612  */
613 static unsigned long long log_track_complete(struct per_dev_info *pdi,
614                                              struct blk_io_trace *t)
615 {
616         unsigned long long elapsed;
617         struct io_track *iot;
618
619         if (!track_ios)
620                 return -1;
621         if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
622                 return -1;
623
624         iot = __find_track(pdi, t->sector);
625         if (!iot) {
626                 if (verbose)
627                         fprintf(stderr,"complete not found for (%d,%d): %llu\n",
628                                 MAJOR(pdi->dev), MINOR(pdi->dev),
629                                 (unsigned long long) t->sector);
630                 return -1;
631         }
632
633         iot->completion_time = t->time;
634         elapsed = iot->completion_time - iot->dispatch_time;
635
636         if (per_process_stats) {
637                 struct per_process_info *ppi = find_process(iot->pid,iot->comm);
638                 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
639
640                 if (ppi && elapsed > ppi->longest_completion_wait[w])
641                         ppi->longest_completion_wait[w] = elapsed;
642         }
643
644         /*
645          * kill the trace, we don't need it after completion
646          */
647         rb_erase(&iot->rb_node, &pdi->rb_track);
648         free(iot);
649
650         return elapsed;
651 }
652
653
654 static struct io_stats *find_process_io_stats(__u32 pid, char *name)
655 {
656         struct per_process_info *ppi = find_process(pid, name);
657
658         if (!ppi) {
659                 ppi = malloc(sizeof(*ppi));
660                 memset(ppi, 0, sizeof(*ppi));
661                 memcpy(ppi->name, name, 16);
662                 ppi->pid = pid;
663                 add_process_to_hash(ppi);
664                 add_process_to_list(ppi);
665         }
666
667         return &ppi->io_stats;
668 }
669
670 static void resize_cpu_info(struct per_dev_info *pdi, int cpu)
671 {
672         struct per_cpu_info *cpus = pdi->cpus;
673         int ncpus = pdi->ncpus;
674         int new_count = cpu + 1;
675         int new_space, size;
676         char *new_start;
677
678         size = new_count * sizeof(struct per_cpu_info);
679         cpus = realloc(cpus, size);
680         if (!cpus) {
681                 char name[20];
682                 fprintf(stderr, "Out of memory, CPU info for device %s (%d)\n",
683                         get_dev_name(pdi, name, sizeof(name)), size);
684                 exit(1);
685         }
686
687         new_start = (char *)cpus + (ncpus * sizeof(struct per_cpu_info));
688         new_space = (new_count - ncpus) * sizeof(struct per_cpu_info);
689         memset(new_start, 0, new_space);
690
691         pdi->ncpus = new_count;
692         pdi->cpus = cpus;
693 }
694
695 static struct per_cpu_info *get_cpu_info(struct per_dev_info *pdi, int cpu)
696 {
697         struct per_cpu_info *pci;
698
699         if (cpu >= pdi->ncpus)
700                 resize_cpu_info(pdi, cpu);
701
702         pci = &pdi->cpus[cpu];
703         pci->cpu = cpu;
704         return pci;
705 }
706
707
708 static int resize_devices(char *name)
709 {
710         int size = (ndevices + 1) * sizeof(struct per_dev_info);
711
712         devices = realloc(devices, size);
713         if (!devices) {
714                 fprintf(stderr, "Out of memory, device %s (%d)\n", name, size);
715                 return 1;
716         }
717         memset(&devices[ndevices], 0, sizeof(struct per_dev_info));
718         devices[ndevices].name = name;
719         ndevices++;
720         return 0;
721 }
722
723 static struct per_dev_info *get_dev_info(dev_t dev)
724 {
725         struct per_dev_info *pdi;
726         int i;
727
728         for (i = 0; i < ndevices; i++) {
729                 if (!devices[i].dev)
730                         devices[i].dev = dev;
731                 if (devices[i].dev == dev)
732                         return &devices[i];
733         }
734
735         if (resize_devices(NULL))
736                 return NULL;
737
738         pdi = &devices[ndevices - 1];
739         pdi->dev = dev;
740         pdi->last_sequence = -1;
741         pdi->last_read_time = 0;
742         memset(&pdi->rb_last, 0, sizeof(pdi->rb_last));
743         pdi->rb_last_entries = 0;
744         return pdi;
745 }
746
747 static char *get_dev_name(struct per_dev_info *pdi, char *buffer, int size)
748 {
749         if (pdi->name)
750                 snprintf(buffer, size, "%s", pdi->name);
751         else
752                 snprintf(buffer, size, "%d,%d",MAJOR(pdi->dev),MINOR(pdi->dev));
753         return buffer;
754 }
755
756 static void check_time(struct per_dev_info *pdi, struct blk_io_trace *bit)
757 {
758         unsigned long long this = bit->time;
759         unsigned long long last = pdi->last_reported_time;
760
761         pdi->backwards = (this < last) ? 'B' : ' ';
762         pdi->last_reported_time = this;
763 }
764
765 static inline void __account_m(struct io_stats *ios, struct blk_io_trace *t,
766                                int rw)
767 {
768         if (rw) {
769                 ios->mwrites++;
770                 ios->qwrite_kb += t_kb(t);
771         } else {
772                 ios->mreads++;
773                 ios->qread_kb += t_kb(t);
774         }
775 }
776
777 static inline void account_m(struct blk_io_trace *t, struct per_cpu_info *pci,
778                              int rw)
779 {
780         __account_m(&pci->io_stats, t, rw);
781
782         if (per_process_stats) {
783                 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
784
785                 __account_m(ios, t, rw);
786         }
787 }
788
789 static inline void __account_queue(struct io_stats *ios, struct blk_io_trace *t,
790                                    int rw)
791 {
792         if (rw) {
793                 ios->qwrites++;
794                 ios->qwrite_kb += t_kb(t);
795         } else {
796                 ios->qreads++;
797                 ios->qread_kb += t_kb(t);
798         }
799 }
800
801 static inline void account_queue(struct blk_io_trace *t,
802                                  struct per_cpu_info *pci, int rw)
803 {
804         __account_queue(&pci->io_stats, t, rw);
805
806         if (per_process_stats) {
807                 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
808
809                 __account_queue(ios, t, rw);
810         }
811 }
812
813 static inline void __account_c(struct io_stats *ios, int rw, unsigned int bytes)
814 {
815         if (rw) {
816                 ios->cwrites++;
817                 ios->cwrite_kb += bytes >> 10;
818         } else {
819                 ios->creads++;
820                 ios->cread_kb += bytes >> 10;
821         }
822 }
823
824 static inline void account_c(struct blk_io_trace *t, struct per_cpu_info *pci,
825                              int rw, int bytes)
826 {
827         __account_c(&pci->io_stats, rw, bytes);
828
829         if (per_process_stats) {
830                 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
831
832                 __account_c(ios, rw, bytes);
833         }
834 }
835
836 static inline void __account_issue(struct io_stats *ios, int rw,
837                                    unsigned int bytes)
838 {
839         if (rw) {
840                 ios->iwrites++;
841                 ios->iwrite_kb += bytes >> 10;
842         } else {
843                 ios->ireads++;
844                 ios->iread_kb += bytes >> 10;
845         }
846 }
847
848 static inline void account_issue(struct blk_io_trace *t,
849                                  struct per_cpu_info *pci, int rw)
850 {
851         __account_issue(&pci->io_stats, rw, t->bytes);
852
853         if (per_process_stats) {
854                 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
855
856                 __account_issue(ios, rw, t->bytes);
857         }
858 }
859
860 static inline void __account_unplug(struct io_stats *ios, int timer)
861 {
862         if (timer)
863                 ios->timer_unplugs++;
864         else
865                 ios->io_unplugs++;
866 }
867
868 static inline void account_unplug(struct blk_io_trace *t,
869                                   struct per_cpu_info *pci, int timer)
870 {
871         __account_unplug(&pci->io_stats, timer);
872
873         if (per_process_stats) {
874                 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
875
876                 __account_unplug(ios, timer);
877         }
878 }
879
880 static void log_complete(struct per_dev_info *pdi, struct per_cpu_info *pci,
881                          struct blk_io_trace *t, char *act)
882 {
883         process_fmt(act, pci, t, log_track_complete(pdi, t), 0, NULL);
884 }
885
886 static void log_insert(struct per_dev_info *pdi, struct per_cpu_info *pci,
887                        struct blk_io_trace *t, char *act)
888 {
889         process_fmt(act, pci, t, log_track_insert(pdi, t), 0, NULL);
890 }
891
892 static void log_queue(struct per_cpu_info *pci, struct blk_io_trace *t,
893                       char *act)
894 {
895         process_fmt(act, pci, t, -1, 0, NULL);
896 }
897
898 static void log_issue(struct per_dev_info *pdi, struct per_cpu_info *pci,
899                       struct blk_io_trace *t, char *act)
900 {
901         process_fmt(act, pci, t, log_track_issue(pdi, t), 0, NULL);
902 }
903
904 static void log_merge(struct per_dev_info *pdi, struct per_cpu_info *pci,
905                       struct blk_io_trace *t, char *act)
906 {
907         if (act[0] == 'F')
908                 log_track_frontmerge(pdi, t);
909
910         process_fmt(act, pci, t, -1ULL, 0, NULL);
911 }
912
913 static void log_action(struct per_cpu_info *pci, struct blk_io_trace *t,
914                         char *act)
915 {
916         process_fmt(act, pci, t, -1ULL, 0, NULL);
917 }
918
919 static void log_generic(struct per_cpu_info *pci, struct blk_io_trace *t,
920                         char *act)
921 {
922         process_fmt(act, pci, t, -1ULL, 0, NULL);
923 }
924
925 static void log_unplug(struct per_cpu_info *pci, struct blk_io_trace *t,
926                       char *act)
927 {
928         process_fmt(act, pci, t, -1ULL, 0, NULL);
929 }
930
931 static void log_split(struct per_cpu_info *pci, struct blk_io_trace *t,
932                       char *act)
933 {
934         process_fmt(act, pci, t, -1ULL, 0, NULL);
935 }
936
937 static void log_pc(struct per_cpu_info *pci, struct blk_io_trace *t, char *act)
938 {
939         unsigned char *buf = (unsigned char *) t + sizeof(*t);
940
941         process_fmt(act, pci, t, -1ULL, t->pdu_len, buf);
942 }
943
944 static void dump_trace_pc(struct blk_io_trace *t, struct per_cpu_info *pci)
945 {
946         int act = t->action & 0xffff;
947
948         switch (act) {
949                 case __BLK_TA_QUEUE:
950                         log_generic(pci, t, "Q");
951                         break;
952                 case __BLK_TA_GETRQ:
953                         log_generic(pci, t, "G");
954                         break;
955                 case __BLK_TA_SLEEPRQ:
956                         log_generic(pci, t, "S");
957                         break;
958                 case __BLK_TA_REQUEUE:
959                         log_generic(pci, t, "R");
960                         break;
961                 case __BLK_TA_ISSUE:
962                         log_pc(pci, t, "D");
963                         break;
964                 case __BLK_TA_COMPLETE:
965                         log_pc(pci, t, "C");
966                         break;
967                 case __BLK_TA_INSERT:
968                         log_pc(pci, t, "I");
969                         break;
970                 default:
971                         fprintf(stderr, "Bad pc action %x\n", act);
972                         break;
973         }
974 }
975
/*
 * Dispatch a filesystem-style trace to the matching accounting and
 * logging helpers, keyed on the low 16 bits of the action word.
 */
static void dump_trace_fs(struct blk_io_trace *t, struct per_dev_info *pdi,
                          struct per_cpu_info *pci)
{
        /* non-zero when this event is on the write side */
        int w = t->action & BLK_TC_ACT(BLK_TC_WRITE);
        int act = t->action & 0xffff;

        switch (act) {
                case __BLK_TA_QUEUE:
                        account_queue(t, pci, w);
                        log_queue(pci, t, "Q");
                        break;
                case __BLK_TA_INSERT:
                        log_insert(pdi, pci, t, "I");
                        break;
                case __BLK_TA_BACKMERGE:
                        account_m(t, pci, w);
                        log_merge(pdi, pci, t, "M");
                        break;
                case __BLK_TA_FRONTMERGE:
                        account_m(t, pci, w);
                        log_merge(pdi, pci, t, "F");
                        break;
                case __BLK_TA_GETRQ:
                        log_track_getrq(pdi, t);
                        log_generic(pci, t, "G");
                        break;
                case __BLK_TA_SLEEPRQ:
                        log_generic(pci, t, "S");
                        break;
                case __BLK_TA_REQUEUE:
                        /* negative byte count backs out the completion accounting */
                        account_c(t, pci, w, -t->bytes);
                        log_queue(pci, t, "R");
                        break;
                case __BLK_TA_ISSUE:
                        account_issue(t, pci, w);
                        log_issue(pdi, pci, t, "D");
                        break;
                case __BLK_TA_COMPLETE:
                        account_c(t, pci, w, t->bytes);
                        log_complete(pdi, pci, t, "C");
                        break;
                case __BLK_TA_PLUG:
                        log_action(pci, t, "P");
                        break;
                case __BLK_TA_UNPLUG_IO:
                        account_unplug(t, pci, 0);
                        log_unplug(pci, t, "U");
                        break;
                case __BLK_TA_UNPLUG_TIMER:
                        /* second argument 1 marks a timer-triggered unplug */
                        account_unplug(t, pci, 1);
                        log_unplug(pci, t, "UT");
                        break;
                case __BLK_TA_SPLIT:
                        log_split(pci, t, "X");
                        break;
                case __BLK_TA_BOUNCE:
                        log_generic(pci, t, "B");
                        break;
                case __BLK_TA_REMAP:
                        log_generic(pci, t, "A");
                        break;
                default:
                        /*
                         * NOTE(review): this prints the full action word while
                         * dump_trace_pc prints only the masked low bits —
                         * confirm the asymmetry is intentional
                         */
                        fprintf(stderr, "Bad fs action %x\n", t->action);
                        break;
        }
}
1042
1043 static void dump_trace(struct blk_io_trace *t, struct per_cpu_info *pci,
1044                        struct per_dev_info *pdi)
1045 {
1046         if (t->action & BLK_TC_ACT(BLK_TC_PC))
1047                 dump_trace_pc(t, pci);
1048         else
1049                 dump_trace_fs(t, pdi, pci);
1050
1051         pdi->events++;
1052 }
1053
/*
 * Print one block of I/O statistics under heading 'msg': reads in the
 * left column, writes in the right.  Uses the %' printf flag, so the
 * numbers are grouped per the LC_NUMERIC locale set in main().
 */
static void dump_io_stats(struct io_stats *ios, char *msg)
{
        fprintf(ofp, "%s\n", msg);

        fprintf(ofp, " Reads Queued:    %'8lu, %'8LuKiB\t", ios->qreads, ios->qread_kb);
        fprintf(ofp, " Writes Queued:    %'8lu, %'8LuKiB\n", ios->qwrites,ios->qwrite_kb);

        fprintf(ofp, " Read Dispatches: %'8lu, %'8LuKiB\t", ios->ireads, ios->iread_kb);
        fprintf(ofp, " Write Dispatches: %'8lu, %'8LuKiB\n", ios->iwrites,ios->iwrite_kb);
        fprintf(ofp, " Reads Completed: %'8lu, %'8LuKiB\t", ios->creads, ios->cread_kb);
        fprintf(ofp, " Writes Completed: %'8lu, %'8LuKiB\n", ios->cwrites,ios->cwrite_kb);
        /* '%8c' with a space argument pads rows that have no KiB column */
        fprintf(ofp, " Read Merges:     %'8lu%8c\t", ios->mreads, ' ');
        fprintf(ofp, " Write Merges:     %'8lu\n", ios->mwrites);
        fprintf(ofp, " IO unplugs:      %'8lu%8c\t", ios->io_unplugs, ' ');
        fprintf(ofp, " Timer unplugs:    %'8lu\n", ios->timer_unplugs);
}
1070
/*
 * Print the longest wait times recorded for one process, reads in the
 * left column and writes in the right.  Values are divided by 1000 —
 * presumably nanoseconds down to microseconds; confirm against the
 * units used by the wait accounting code.
 */
static void dump_wait_stats(struct per_process_info *ppi)
{
        /* index 0 = reads, index 1 = writes */
        unsigned long rawait = ppi->longest_allocation_wait[0] / 1000;
        unsigned long rdwait = ppi->longest_dispatch_wait[0] / 1000;
        unsigned long rcwait = ppi->longest_completion_wait[0] / 1000;
        unsigned long wawait = ppi->longest_allocation_wait[1] / 1000;
        unsigned long wdwait = ppi->longest_dispatch_wait[1] / 1000;
        unsigned long wcwait = ppi->longest_completion_wait[1] / 1000;

        fprintf(ofp, " Allocation wait: %'8lu%8c\t", rawait, ' ');
        fprintf(ofp, " Allocation wait:  %'8lu\n", wawait);
        fprintf(ofp, " Dispatch wait:   %'8lu%8c\t", rdwait, ' ');
        fprintf(ofp, " Dispatch wait:    %'8lu\n", wdwait);
        fprintf(ofp, " Completion wait: %'8lu%8c\t", rcwait, ' ');
        fprintf(ofp, " Completion wait:  %'8lu\n", wcwait);
}
1087
1088 static int ppi_name_compare(const void *p1, const void *p2)
1089 {
1090         struct per_process_info *ppi1 = *((struct per_process_info **) p1);
1091         struct per_process_info *ppi2 = *((struct per_process_info **) p2);
1092         int res;
1093
1094         res = strverscmp(ppi1->name, ppi2->name);
1095         if (!res)
1096                 res = ppi1->pid > ppi2->pid;
1097
1098         return res;
1099 }
1100
1101 static void sort_process_list(void)
1102 {
1103         struct per_process_info **ppis;
1104         struct per_process_info *ppi;
1105         int i = 0;
1106
1107         ppis = malloc(ppi_list_entries * sizeof(struct per_process_info *));
1108
1109         ppi = ppi_list;
1110         while (ppi) {
1111                 ppis[i++] = ppi;
1112                 ppi = ppi->list_next;
1113         }
1114
1115         qsort(ppis, ppi_list_entries, sizeof(ppi), ppi_name_compare);
1116
1117         i = ppi_list_entries - 1;
1118         ppi_list = NULL;
1119         while (i >= 0) {
1120                 ppi = ppis[i];
1121
1122                 ppi->list_next = ppi_list;
1123                 ppi_list = ppi;
1124                 i--;
1125         }
1126
1127         free(ppis);
1128 }
1129
1130 static void show_process_stats(void)
1131 {
1132         struct per_process_info *ppi;
1133
1134         sort_process_list();
1135
1136         ppi = ppi_list;
1137         while (ppi) {
1138                 char name[64];
1139
1140                 if (ppi->more_than_one)
1141                         sprintf(name, "%s (%u, ...)", ppi->name, ppi->pid);
1142                 else
1143                         sprintf(name, "%s (%u)", ppi->name, ppi->pid);
1144
1145                 dump_io_stats(&ppi->io_stats, name);
1146                 dump_wait_stats(ppi);
1147                 ppi = ppi->list_next;
1148         }
1149
1150         fprintf(ofp, "\n");
1151 }
1152
/*
 * Print per-cpu statistics for every device, followed by a per-device
 * total when more than one cpu contributed events.
 */
static void show_device_and_cpu_stats(void)
{
        struct per_dev_info *pdi;
        struct per_cpu_info *pci;
        struct io_stats total, *ios;
        int i, j, pci_events;
        /* room for "CPUnnn (devname):" */
        char line[3 + 8/*cpu*/ + 2 + 32/*dev*/ + 3];
        char name[32];

        for (pdi = devices, i = 0; i < ndevices; i++, pdi++) {

                memset(&total, 0, sizeof(total));
                pci_events = 0;

                /* blank line between devices */
                if (i > 0)
                        fprintf(ofp, "\n");

                for (pci = pdi->cpus, j = 0; j < pdi->ncpus; j++, pci++) {
                        /* skip cpus that saw no events */
                        if (!pci->nelems)
                                continue;

                        /* fold this cpu into the device-wide total */
                        ios = &pci->io_stats;
                        total.qreads += ios->qreads;
                        total.qwrites += ios->qwrites;
                        total.creads += ios->creads;
                        total.cwrites += ios->cwrites;
                        total.mreads += ios->mreads;
                        total.mwrites += ios->mwrites;
                        total.ireads += ios->ireads;
                        total.iwrites += ios->iwrites;
                        total.qread_kb += ios->qread_kb;
                        total.qwrite_kb += ios->qwrite_kb;
                        total.cread_kb += ios->cread_kb;
                        total.cwrite_kb += ios->cwrite_kb;
                        total.iread_kb += ios->iread_kb;
                        total.iwrite_kb += ios->iwrite_kb;
                        total.timer_unplugs += ios->timer_unplugs;
                        total.io_unplugs += ios->io_unplugs;

                        snprintf(line, sizeof(line) - 1, "CPU%d (%s):",
                                 j, get_dev_name(pdi, name, sizeof(name)));
                        dump_io_stats(ios, line);
                        pci_events++;
                }

                /* a total is only interesting if more than one cpu printed */
                if (pci_events > 1) {
                        fprintf(ofp, "\n");
                        snprintf(line, sizeof(line) - 1, "Total (%s):",
                                 get_dev_name(pdi, name, sizeof(name)));
                        dump_io_stats(&total, line);
                }

                fprintf(ofp, "\nEvents (%s): %'Lu entries, %'lu skips\n",
                        get_dev_name(pdi, line, sizeof(line)), pdi->events,
                        pdi->skips);
        }
}
1210
1211 /*
1212  * struct trace and blktrace allocation cache, we do potentially
1213  * millions of mallocs for these structures while only using at most
1214  * a few thousand at the time
1215  */
1216 static inline void t_free(struct trace *t)
1217 {
1218         if (t_alloc_cache < 1024) {
1219                 t->next = t_alloc_list;
1220                 t_alloc_list = t;
1221                 t_alloc_cache++;
1222         } else
1223                 free(t);
1224 }
1225
1226 static inline struct trace *t_alloc(void)
1227 {
1228         struct trace *t = t_alloc_list;
1229
1230         if (t) {
1231                 t_alloc_list = t->next;
1232                 t_alloc_cache--;
1233                 return t;
1234         }
1235
1236         return malloc(sizeof(*t));
1237 }
1238
/*
 * Release a blk_io_trace, caching up to 1024 freed entries to avoid
 * malloc/free churn (see the allocation cache note above).
 */
static inline void bit_free(struct blk_io_trace *bit)
{
        if (bit_alloc_cache < 1024) {
                /*
                 * abuse a 64-bit field for a next pointer for the free item
                 */
                bit->time = (__u64) (unsigned long) bit_alloc_list;
                bit_alloc_list = (struct blk_io_trace *) bit;
                bit_alloc_cache++;
        } else
                free(bit);
}
1251
/*
 * Get a blk_io_trace, preferring the free cache over malloc().  While
 * an entry sits on the free list its 'time' field holds the next
 * pointer (see bit_free()), so it is recovered here by casting back.
 * NOTE(review): the malloc() result is returned unchecked — callers
 * must tolerate NULL.
 */
static inline struct blk_io_trace *bit_alloc(void)
{
        struct blk_io_trace *bit = bit_alloc_list;

        if (bit) {
                bit_alloc_list = (struct blk_io_trace *) (unsigned long) \
                                 bit->time;
                bit_alloc_cache--;
                return bit;
        }

        return malloc(sizeof(*bit));
}
1265
1266 static void find_genesis(void)
1267 {
1268         struct trace *t = trace_list;
1269
1270         genesis_time = -1ULL;
1271         while (t != NULL) {
1272                 if (t->bit->time < genesis_time)
1273                         genesis_time = t->bit->time;
1274
1275                 t = t->next;
1276         }
1277 }
1278
1279 static inline int check_stopwatch(struct blk_io_trace *bit)
1280 {
1281         if (bit->time < stopwatch_end &&
1282             bit->time >= stopwatch_start)
1283                 return 0;
1284
1285         return 1;
1286 }
1287
1288 /*
1289  * return youngest entry read
1290  */
1291 static int sort_entries(unsigned long long *youngest)
1292 {
1293         struct trace *t;
1294
1295         if (!genesis_time)
1296                 find_genesis();
1297
1298         *youngest = 0;
1299         while ((t = trace_list) != NULL) {
1300                 struct blk_io_trace *bit = t->bit;
1301
1302                 trace_list = t->next;
1303
1304                 bit->time -= genesis_time;
1305
1306                 if (bit->time < *youngest || !*youngest)
1307                         *youngest = bit->time;
1308
1309                 if (bit->sequence < smallest_seq_read)
1310                         smallest_seq_read = bit->sequence;
1311
1312                 if (check_stopwatch(bit)) {
1313                         bit_free(bit);
1314                         t_free(t);
1315                         continue;
1316                 }
1317
1318                 if (trace_rb_insert_sort(t))
1319                         return -1;
1320         }
1321
1322         return 0;
1323 }
1324
1325 static inline void __put_trace_last(struct per_dev_info *pdi, struct trace *t)
1326 {
1327         rb_erase(&t->rb_node, &pdi->rb_last);
1328         pdi->rb_last_entries--;
1329
1330         bit_free(t->bit);
1331         t_free(t);
1332 }
1333
1334 static void put_trace(struct per_dev_info *pdi, struct trace *t)
1335 {
1336         rb_erase(&t->rb_node, &rb_sort_root);
1337         rb_sort_entries--;
1338
1339         trace_rb_insert_last(pdi, t);
1340
1341         if (pdi->rb_last_entries > rb_batch * pdi->nfiles) {
1342                 struct rb_node *n = rb_first(&pdi->rb_last);
1343
1344                 t = rb_entry(n, struct trace, rb_node);
1345                 __put_trace_last(pdi, t);
1346         }
1347 }
1348
1349 static int check_sequence(struct per_dev_info *pdi, struct trace *t, int force)
1350 {
1351         unsigned long expected_sequence = pdi->last_sequence + 1;
1352         struct blk_io_trace *bit = t->bit;
1353         struct trace *__t;
1354         
1355         if (!expected_sequence) {
1356                 struct rb_node *n;
1357                 char *cpu_seen;
1358                 int cpus;
1359
1360                 /*
1361                  * 1 should be the first entry, just allow it
1362                  */
1363                 if (bit->sequence == 1)
1364                         return 0;
1365
1366                 /*
1367                  * if we are starting somewhere else, check that we have
1368                  * entries from all cpus in the tree before dumping one
1369                  */
1370                 cpu_seen = malloc(pdi->nopenfiles);
1371                 n = rb_first(&rb_sort_root);
1372                 cpus = 0;
1373                 while (n) {
1374                         __t = rb_entry(n, struct trace, rb_node);
1375
1376                         if (!cpu_seen[__t->bit->cpu]) {
1377                                 cpu_seen[__t->bit->cpu] = 1;
1378                                 cpus++;
1379                         }
1380                         n = rb_next(n);
1381                 }
1382
1383                 free(cpu_seen);
1384                 return cpus != pdi->nopenfiles;
1385         }
1386
1387         if (bit->sequence == expected_sequence)
1388                 return 0;
1389
1390         /*
1391          * we may not have seen that sequence yet. if we are not doing
1392          * the final run, break and wait for more entries.
1393          */
1394         if (expected_sequence < smallest_seq_read) {
1395                 __t = trace_rb_find_last(pdi, expected_sequence);
1396                 if (!__t)
1397                         goto skip;
1398
1399                 __put_trace_last(pdi, __t);
1400                 return 0;
1401         } else if (!force) {
1402                 return 1;
1403         } else {
1404 skip:
1405                 if (verbose) {
1406                         fprintf(stderr, "(%d,%d): skipping %lu -> %u\n",
1407                                 MAJOR(pdi->dev), MINOR(pdi->dev),
1408                                 pdi->last_sequence, bit->sequence);
1409                 }
1410                 pdi->skips++;
1411                 return 0;
1412         }
1413 }
1414
/*
 * Emit sorted traces from the global sort tree.  With 'force' set the
 * sequence and time gating is relaxed so everything buffered gets
 * flushed (used for the final pass).
 */
static void show_entries_rb(int force)
{
        struct per_dev_info *pdi = NULL;
        struct per_cpu_info *pci = NULL;
        struct blk_io_trace *bit;
        struct rb_node *n;
        struct trace *t;

        while ((n = rb_first(&rb_sort_root)) != NULL) {
                if (is_done() && !force && !pipeline)
                        break;

                t = rb_entry(n, struct trace, rb_node);
                bit = t->bit;

                /* cache the device lookup across consecutive events */
                if (!pdi || pdi->dev != bit->device)
                        pdi = get_dev_info(bit->device);

                if (!pdi) {
                        fprintf(stderr, "Unknown device ID? (%d,%d)\n",
                                MAJOR(bit->device), MINOR(bit->device));
                        break;
                }

                /* stall on a sequence gap unless forced */
                if (check_sequence(pdi, t, force))
                        break;

                /* don't run ahead of data not yet read from all files */
                if (!force && bit->time > last_allowed_time)
                        break;

                pdi->last_sequence = bit->sequence;

                check_time(pdi, bit);

                /* cache the cpu lookup as well */
                if (!pci || pci->cpu != bit->cpu)
                        pci = get_cpu_info(pdi, bit->cpu);

                pci->nelems++;

                /* only emit events selected by the -a/-A action mask */
                if (bit->action & (act_mask << BLK_TC_SHIFT))
                        dump_trace(bit, pci, pdi);

                put_trace(pdi, t);
        }
}
1460
/*
 * Read exactly 'bytes' bytes from 'fd' into 'buffer'.  When 'block'
 * is zero the fd is first switched to non-blocking mode, otherwise to
 * blocking mode.  Returns 0 on success, 1 on end-of-file and -1 on
 * error (EAGAIN on a non-blocking fd returns -1 silently).
 *
 * Fix: the read cursor was a void * and advanced with pointer
 * arithmetic, which is a GCC extension rather than standard C; use a
 * char * cursor instead.
 */
static int read_data(int fd, void *buffer, int bytes, int block)
{
	int ret, bytes_left, fl;
	char *p;

	fl = fcntl(fd, F_GETFL);

	if (!block)
		fcntl(fd, F_SETFL, fl | O_NONBLOCK);
	else
		fcntl(fd, F_SETFL, fl & ~O_NONBLOCK);

	bytes_left = bytes;
	p = buffer;
	while (bytes_left > 0) {
		ret = read(fd, p, bytes_left);
		if (!ret)
			return 1;
		else if (ret < 0) {
			if (errno != EAGAIN)
				perror("read");

			return -1;
		} else {
			p += ret;
			bytes_left -= ret;
		}
	}

	return 0;
}
1492
1493 static int read_events(int fd, int always_block)
1494 {
1495         struct per_dev_info *pdi = NULL;
1496         unsigned int events = 0;
1497
1498         while (!is_done() && events < rb_batch) {
1499                 struct blk_io_trace *bit;
1500                 struct trace *t;
1501                 int pdu_len;
1502                 __u32 magic;
1503
1504                 bit = bit_alloc();
1505
1506                 if (read_data(fd, bit, sizeof(*bit), !events || always_block))
1507                         break;
1508
1509                 magic = be32_to_cpu(bit->magic);
1510                 if ((magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
1511                         fprintf(stderr, "Bad magic %x\n", magic);
1512                         break;
1513                 }
1514
1515                 pdu_len = be16_to_cpu(bit->pdu_len);
1516                 if (pdu_len) {
1517                         void *ptr = realloc(bit, sizeof(*bit) + pdu_len);
1518
1519                         if (read_data(fd, ptr + sizeof(*bit), pdu_len, 1))
1520                                 break;
1521
1522                         bit = ptr;
1523                 }
1524
1525                 trace_to_cpu(bit);
1526
1527                 if (verify_trace(bit)) {
1528                         bit_free(bit);
1529                         continue;
1530                 }
1531
1532                 t = t_alloc();
1533                 memset(t, 0, sizeof(*t));
1534                 t->bit = bit;
1535
1536                 t->next = trace_list;
1537                 trace_list = t;
1538
1539                 if (!pdi || pdi->dev != bit->device)
1540                         pdi = get_dev_info(bit->device);
1541
1542                 if (bit->time > pdi->last_read_time)
1543                         pdi->last_read_time = bit->time;
1544
1545                 events++;
1546         }
1547
1548         return events;
1549 }
1550
1551 static int do_file(void)
1552 {
1553         struct per_cpu_info *pci;
1554         struct per_dev_info *pdi;
1555         int i, j, events, events_added;
1556
1557         /*
1558          * first prepare all files for reading
1559          */
1560         for (i = 0; i < ndevices; i++) {
1561                 pdi = &devices[i];
1562                 pdi->nfiles = 0;
1563                 pdi->last_sequence = -1;
1564
1565                 for (j = 0;; j++) {
1566                         struct stat st;
1567                         int len = 0;
1568
1569                         pci = get_cpu_info(pdi, j);
1570                         pci->cpu = j;
1571                         pci->fd = -1;
1572
1573                         if (input_dir)
1574                                 len = sprintf(pci->fname, "%s/", input_dir);
1575
1576                         snprintf(pci->fname + len, sizeof(pci->fname)-1-len,
1577                                  "%s.blktrace.%d", pdi->name, pci->cpu);
1578                         if (stat(pci->fname, &st) < 0)
1579                                 break;
1580                         if (st.st_size) {
1581                                 pci->fd = open(pci->fname, O_RDONLY);
1582                                 if (pci->fd < 0) {
1583                                         perror(pci->fname);
1584                                         continue;
1585                                 }
1586                         }
1587
1588                         printf("Input file %s added\n", pci->fname);
1589                         pdi->nfiles++;
1590                         pdi->nopenfiles++;
1591                 }
1592         }
1593
1594         /*
1595          * now loop over the files reading in the data
1596          */
1597         do {
1598                 unsigned long long youngest;
1599
1600                 events_added = 0;
1601                 last_allowed_time = -1ULL;
1602                 smallest_seq_read = -1U;
1603
1604                 for (i = 0; i < ndevices; i++) {
1605                         pdi = &devices[i];
1606
1607                         for (j = 0; j < pdi->nfiles; j++) {
1608
1609                                 pci = get_cpu_info(pdi, j);
1610
1611                                 if (pci->fd == -1)
1612                                         continue;
1613
1614                                 events = read_events(pci->fd, 1);
1615                                 if (!events) {
1616                                         close(pci->fd);
1617                                         pci->fd = -1;
1618                                         pdi->nopenfiles--;
1619                                         continue;
1620                                 }
1621
1622                                 if (pdi->last_read_time < last_allowed_time)
1623                                         last_allowed_time = pdi->last_read_time;
1624
1625                                 events_added += events;
1626                         }
1627                 }
1628
1629                 if (sort_entries(&youngest))
1630                         break;
1631
1632                 if (youngest > stopwatch_end)
1633                         break;
1634
1635                 show_entries_rb(0);
1636
1637         } while (events_added);
1638
1639         if (rb_sort_entries)
1640                 show_entries_rb(1);
1641
1642         return 0;
1643 }
1644
1645 static int do_stdin(void)
1646 {
1647         unsigned long long youngest;
1648         int fd, events;
1649
1650         last_allowed_time = -1ULL;
1651         fd = dup(STDIN_FILENO);
1652         if (fd == -1) {
1653                 perror("dup stdin");
1654                 return -1;
1655         }
1656
1657         while ((events = read_events(fd, 0)) != 0) {
1658         
1659                 smallest_seq_read = -1U;
1660
1661                 if (sort_entries(&youngest))
1662                         break;
1663
1664                 if (youngest > stopwatch_end)
1665                         break;
1666
1667                 show_entries_rb(0);
1668         }
1669
1670         if (rb_sort_entries)
1671                 show_entries_rb(1);
1672
1673         close(fd);
1674         return 0;
1675 }
1676
1677 static void show_stats(void)
1678 {
1679         if (!ofp)
1680                 return;
1681         if (stats_printed)
1682                 return;
1683
1684         stats_printed = 1;
1685
1686         if (per_process_stats)
1687                 show_process_stats();
1688
1689         if (per_device_and_cpu_stats)
1690                 show_device_and_cpu_stats();
1691
1692         fflush(ofp);
1693 }
1694
/*
 * SIGINT/SIGHUP/SIGTERM handler: flag the processing loops to stop
 * and print the final statistics.
 * NOTE(review): show_stats() uses fprintf/fflush, which are not
 * async-signal-safe; if the signal interrupts stdio this can corrupt
 * output or deadlock — consider only setting 'done' here and printing
 * from the main path.
 */
static void handle_sigint(__attribute__((__unused__)) int sig)
{
        done = 1;
        show_stats();
}
1700
1701 /*
1702  * Extract start and duration times from a string, allowing
1703  * us to specify a time interval of interest within a trace.
1704  * Format: "duration" (start is zero) or "start:duration".
1705  */
1706 static int find_stopwatch_interval(char *string)
1707 {
1708         double value;
1709         char *sp;
1710
1711         value = strtod(string, &sp);
1712         if (sp == string) {
1713                 fprintf(stderr,"Invalid stopwatch timer: %s\n", string);
1714                 return 1;
1715         }
1716         if (*sp == ':') {
1717                 stopwatch_start = DOUBLE_TO_NANO_ULL(value);
1718                 string = sp + 1;
1719                 value = strtod(string, &sp);
1720                 if (sp == string || *sp != '\0') {
1721                         fprintf(stderr,"Invalid stopwatch duration time: %s\n",
1722                                 string);
1723                         return 1;
1724                 }
1725         } else if (*sp != '\0') {
1726                 fprintf(stderr,"Invalid stopwatch start timer: %s\n", string);
1727                 return 1;
1728         }
1729         stopwatch_end = DOUBLE_TO_NANO_ULL(value);
1730         if (stopwatch_end <= stopwatch_start) {
1731                 fprintf(stderr, "Invalid stopwatch interval: %Lu -> %Lu\n",
1732                         stopwatch_start, stopwatch_end);
1733                 return 1;
1734         }
1735
1736         return 0;
1737 }
1738
/*
 * Option summary printed by usage().
 * NOTE(review): the text documents "-n" for hashing processes by name,
 * but the option switch in main() implements this under 'h'
 * (ppi_hash_by_pid = 0) — confirm which letter is intended.  The
 * synopsis line also omits -a/-A/-D/-b/-h, which main() accepts.
 */
static char usage_str[] = \
        "[ -i <input name> ] [-o <output name> [ -s ] [ -t ] [ -q ]\n" \
        "[ -w start:stop ] [ -f output format ] [ -F format spec ] [ -v] \n\n" \
        "\t-i Input file containing trace data, or '-' for stdin\n" \
        "\t-D Directory to prepend to input file names\n" \
        "\t-o Output file. If not given, output is stdout\n" \
        "\t-b stdin read batching\n" \
        "\t-s Show per-program io statistics\n" \
        "\t-n Hash processes by name, not pid\n" \
        "\t-t Track individual ios. Will tell you the time a request took\n" \
        "\t   to get queued, to get dispatched, and to get completed\n" \
        "\t-q Quiet. Don't display any stats at the end of the trace\n" \
        "\t-w Only parse data between the given time interval in seconds.\n" \
        "\t   If 'start' isn't given, blkparse defaults the start time to 0\n" \
        "\t-f Output format. Customize the output format. The format field\n" \
        "\t   identifies can be found in the documentation\n" \
        "\t-F Format specification. Can be found in the documentation\n" \
        "\t-v More verbose for marginal errors\n" \
        "\t-V Print program version info\n\n";
1758
/*
 * Print program name, version and the option summary to stderr.
 */
static void usage(char *prog)
{
        fprintf(stderr, "Usage: %s %s %s", prog, blkparse_version, usage_str);
}
1763
1764 int main(int argc, char *argv[])
1765 {
1766         char *ofp_buffer;
1767         int i, c, ret, mode;
1768         int act_mask_tmp = 0;
1769
1770         while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) {
1771                 switch (c) {
1772                 case 'a':
1773                         i = find_mask_map(optarg);
1774                         if (i < 0) {
1775                                 fprintf(stderr,"Invalid action mask %s\n",
1776                                         optarg);
1777                                 return 1;
1778                         }
1779                         act_mask_tmp |= i;
1780                         break;
1781
1782                 case 'A':
1783                         if ((sscanf(optarg, "%x", &i) != 1) || 
1784                                                         !valid_act_opt(i)) {
1785                                 fprintf(stderr,
1786                                         "Invalid set action mask %s/0x%x\n",
1787                                         optarg, i);
1788                                 return 1;
1789                         }
1790                         act_mask_tmp = i;
1791                         break;
1792                 case 'i':
1793                         if (!strcmp(optarg, "-") && !pipeline)
1794                                 pipeline = 1;
1795                         else if (resize_devices(optarg) != 0)
1796                                 return 1;
1797                         break;
1798                 case 'D':
1799                         input_dir = optarg;
1800                         break;
1801                 case 'o':
1802                         output_name = optarg;
1803                         break;
1804                 case 'b':
1805                         rb_batch = atoi(optarg);
1806                         if (rb_batch <= 0)
1807                                 rb_batch = RB_BATCH_DEFAULT;
1808                         break;
1809                 case 's':
1810                         per_process_stats = 1;
1811                         break;
1812                 case 't':
1813                         track_ios = 1;
1814                         break;
1815                 case 'q':
1816                         per_device_and_cpu_stats = 0;
1817                         break;
1818                 case 'w':
1819                         if (find_stopwatch_interval(optarg) != 0)
1820                                 return 1;
1821                         break;
1822                 case 'f':
1823                         set_all_format_specs(optarg);
1824                         break;
1825                 case 'F':
1826                         if (add_format_spec(optarg) != 0)
1827                                 return 1;
1828                         break;
1829                 case 'h':
1830                         ppi_hash_by_pid = 0;
1831                         break;
1832                 case 'v':
1833                         verbose++;
1834                         break;
1835                 case 'V':
1836                         printf("%s version %s\n", argv[0], blkparse_version);
1837                         return 0;
1838                 default:
1839                         usage(argv[0]);
1840                         return 1;
1841                 }
1842         }
1843
1844         while (optind < argc) {
1845                 if (!strcmp(argv[optind], "-") && !pipeline)
1846                         pipeline = 1;
1847                 else if (resize_devices(argv[optind]) != 0)
1848                         return 1;
1849                 optind++;
1850         }
1851
1852         if (!pipeline && !ndevices) {
1853                 usage(argv[0]);
1854                 return 1;
1855         }
1856
1857         if (act_mask_tmp != 0)
1858                 act_mask = act_mask_tmp;
1859
1860         memset(&rb_sort_root, 0, sizeof(rb_sort_root));
1861
1862         signal(SIGINT, handle_sigint);
1863         signal(SIGHUP, handle_sigint);
1864         signal(SIGTERM, handle_sigint);
1865
1866         setlocale(LC_NUMERIC, "en_US");
1867
1868         if (!output_name) {
1869                 ofp = fdopen(STDOUT_FILENO, "w");
1870                 mode = _IOLBF;
1871         } else {
1872                 char ofname[128];
1873
1874                 snprintf(ofname, sizeof(ofname) - 1, "%s", output_name);
1875                 ofp = fopen(ofname, "w");
1876                 mode = _IOFBF;
1877         }
1878
1879         if (!ofp) {
1880                 perror("fopen");
1881                 return 1;
1882         }
1883
1884         ofp_buffer = malloc(4096);      
1885         if (setvbuf(ofp, ofp_buffer, mode, 4096)) {
1886                 perror("setvbuf");
1887                 return 1;
1888         }
1889
1890         if (pipeline)
1891                 ret = do_stdin();
1892         else
1893                 ret = do_file();
1894
1895         show_stats();
1896         return ret;
1897 }