[PATCH] blktrace: we don't need ->closed anymore
blkparse.c (blktrace.git)
/*
 * block queue tracing parse application
 *
 * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <stdio.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <getopt.h>
#include <errno.h>
#include <signal.h>
#include <locale.h>
#include <libgen.h>

#include "blktrace.h"
#include "rbtree.h"
#include "jhash.h"

static char blkparse_version[] = "0.99";

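/*
 * everything we track per observed device: event ordering state,
 * aggregate io_stats, the rbtrees used for sorting and io tracking,
 * and the per-cpu input state
 */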
struct per_dev_info {
        dev_t dev;
        char *name;

        int backwards;
        unsigned long long events;
        unsigned long long first_reported_time;
        unsigned long long last_reported_time;
        unsigned long long last_read_time;
        struct io_stats io_stats;
        unsigned long last_sequence;
        unsigned long skips, nskips;
        unsigned long long seq_skips, seq_nskips;

        struct rb_root rb_last;
        unsigned long rb_last_entries;

        struct rb_root rb_track;

        int nfiles;
        int ncpus;

        unsigned long *cpu_map;
        unsigned int cpu_map_max;

        struct per_cpu_info *cpus;
};

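/*
 * per-process accounting, kept on a hash (by pid by default, see
 * ppi_hash_by_pid; the hash-by-name switch presumably flips this)
 * and on a flat list for the final report
 */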
struct per_process_info {
        char name[16];
        __u32 pid;
        struct io_stats io_stats;
        struct per_process_info *hash_next, *list_next;
        int more_than_one;

        /*
         * individual io stats
         */
        unsigned long long longest_allocation_wait[2];
        unsigned long long longest_dispatch_wait[2];
        unsigned long long longest_completion_wait[2];
};

#define PPI_HASH_SHIFT  (8)
#define PPI_HASH_SIZE   (1 << PPI_HASH_SHIFT)
#define PPI_HASH_MASK   (PPI_HASH_SIZE - 1)
static struct per_process_info *ppi_hash_table[PPI_HASH_SIZE];
static struct per_process_info *ppi_list;
static int ppi_list_entries;

#define S_OPTS  "a:A:i:o:b:stqw:f:F:vVhD:"
static struct option l_opts[] = {
        {
                .name = "act-mask",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'a'
        },
        {
                .name = "set-mask",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'A'
        },
        {
                .name = "input",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'i'
        },
        {
                .name = "output",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'o'
        },
        {
                .name = "batch",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'b'
        },
        {
                .name = "per-program-stats",
                .has_arg = no_argument,
                .flag = NULL,
                .val = 's'
        },
        {
                .name = "track-ios",
                .has_arg = no_argument,
                .flag = NULL,
                .val = 't'
        },
        {
                .name = "quiet",
                .has_arg = no_argument,
                .flag = NULL,
                .val = 'q'
        },
        {
                .name = "stopwatch",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'w'
        },
        {
                .name = "format",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'f'
        },
        {
                .name = "format-spec",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'F'
        },
        {
                .name = "hash-by-name",
                .has_arg = no_argument,
                .flag = NULL,
                .val = 'h'
        },
        {
                .name = "verbose",
                .has_arg = no_argument,
                .flag = NULL,
                .val = 'v'
        },
        {
                .name = "version",
                .has_arg = no_argument,
                .flag = NULL,
                .val = 'V'
        },
        {
                .name = "input-directory",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'D'
        },
        {
                .name = NULL,
        }
};

/*
 * for sorting the displayed output
 */
struct trace {
        struct blk_io_trace *bit;
        struct rb_node rb_node;
        struct trace *next;
};

static struct rb_root rb_sort_root;
static unsigned long rb_sort_entries;

static struct trace *trace_list;

/*
 * allocation cache
 */
static struct blk_io_trace *bit_alloc_list;
static struct trace *t_alloc_list;

/*
 * for tracking individual ios
 */
struct io_track {
        struct rb_node rb_node;

        __u64 sector;
        __u32 pid;
        char comm[16];
        unsigned long long allocation_time;
        unsigned long long queue_time;
        unsigned long long dispatch_time;
        unsigned long long completion_time;
};

static int ndevices;
static struct per_dev_info *devices;
static char *get_dev_name(struct per_dev_info *, char *, int);

FILE *ofp = NULL;
static char *output_name;
static char *input_dir;

static unsigned long long genesis_time;
static unsigned long long last_allowed_time;
static unsigned int smallest_seq_read;
static unsigned long long stopwatch_start;      /* start from zero by default */
static unsigned long long stopwatch_end = -1ULL;        /* "infinity" */

static int per_process_stats;
static int per_device_and_cpu_stats = 1;
static int track_ios;
static int ppi_hash_by_pid = 1;
static int verbose;
static unsigned int act_mask = -1U;
static int stats_printed;

static unsigned int t_alloc_cache;
static unsigned int bit_alloc_cache;

#define RB_BATCH_DEFAULT        (512)
static unsigned int rb_batch = RB_BATCH_DEFAULT;

static int pipeline;

#define is_done()       (*(volatile int *)(&done))
static volatile int done;

#define JHASH_RANDOM    (0x3af5f2ee)

#define CPUS_PER_LONG   (8 * sizeof(unsigned long))
#define CPU_IDX(cpu)    ((cpu) / CPUS_PER_LONG)
#define CPU_BIT(cpu)    ((cpu) & (CPUS_PER_LONG - 1))

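/*
 * each device carries a bitmap of the cpus we have trace files for.
 * cpu_mark_online() grows it on demand in CPUS_PER_LONG chunks, e.g.
 * seeing cpu 65 on a 64-bit host rounds cpu_map_max up to 128
 */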
static void cpu_mark_online(struct per_dev_info *pdi, unsigned int cpu)
{
        if (cpu >= pdi->cpu_map_max || !pdi->cpu_map) {
                int new_max = (cpu + CPUS_PER_LONG) & ~(CPUS_PER_LONG - 1);
                unsigned long *map = malloc(new_max / sizeof(long));

                memset(map, 0, new_max / sizeof(long));

                if (pdi->cpu_map) {
                        memcpy(map, pdi->cpu_map, pdi->cpu_map_max / sizeof(long));
                        free(pdi->cpu_map);
                }

                pdi->cpu_map = map;
                pdi->cpu_map_max = new_max;
        }

        pdi->cpu_map[CPU_IDX(cpu)] |= (1UL << CPU_BIT(cpu));
}

static inline void cpu_mark_offline(struct per_dev_info *pdi, int cpu)
{
        pdi->cpu_map[CPU_IDX(cpu)] &= ~(1UL << CPU_BIT(cpu));
}

static inline int cpu_is_online(struct per_dev_info *pdi, int cpu)
{
        return (pdi->cpu_map[CPU_IDX(cpu)] & (1UL << CPU_BIT(cpu))) != 0;
}

static inline int ppi_hash_pid(__u32 pid)
{
        return jhash_1word(pid, JHASH_RANDOM) & PPI_HASH_MASK;
}

static inline int ppi_hash_name(const char *name)
{
        return jhash(name, 16, JHASH_RANDOM) & PPI_HASH_MASK;
}

static inline int ppi_hash(struct per_process_info *ppi)
{
        if (ppi_hash_by_pid)
                return ppi_hash_pid(ppi->pid);

        return ppi_hash_name(ppi->name);
}

static inline void add_process_to_hash(struct per_process_info *ppi)
{
        const int hash_idx = ppi_hash(ppi);

        ppi->hash_next = ppi_hash_table[hash_idx];
        ppi_hash_table[hash_idx] = ppi;
}

static inline void add_process_to_list(struct per_process_info *ppi)
{
        ppi->list_next = ppi_list;
        ppi_list = ppi;
        ppi_list_entries++;
}

static struct per_process_info *find_process_by_name(char *name)
{
        const int hash_idx = ppi_hash_name(name);
        struct per_process_info *ppi;

        ppi = ppi_hash_table[hash_idx];
        while (ppi) {
                if (!strcmp(ppi->name, name))
                        return ppi;

                ppi = ppi->hash_next;
        }

        return NULL;
}

static struct per_process_info *find_process_by_pid(__u32 pid)
{
        const int hash_idx = ppi_hash_pid(pid);
        struct per_process_info *ppi;

        ppi = ppi_hash_table[hash_idx];
        while (ppi) {
                if (ppi->pid == pid)
                        return ppi;

                ppi = ppi->hash_next;
        }

        return NULL;
}

static struct per_process_info *find_process(__u32 pid, char *name)
{
        struct per_process_info *ppi;

        if (ppi_hash_by_pid)
                return find_process_by_pid(pid);

        ppi = find_process_by_name(name);
        if (ppi && ppi->pid != pid)
                ppi->more_than_one = 1;

        return ppi;
}

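/*
 * insert keyed by (time, device, sequence) when check_time is set;
 * note that lookups via trace_rb_find() go by (device, sequence)
 * only, which is what the "browse ahead" hack below compensates for
 */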
static inline int trace_rb_insert(struct trace *t, struct rb_root *root,
                                  int check_time)
{
        struct rb_node **p = &root->rb_node;
        struct rb_node *parent = NULL;
        struct trace *__t;

        while (*p) {
                parent = *p;

                __t = rb_entry(parent, struct trace, rb_node);

                if (check_time) {
                        if (t->bit->time < __t->bit->time) {
                                p = &(*p)->rb_left;
                                continue;
                        } else if (t->bit->time > __t->bit->time) {
                                p = &(*p)->rb_right;
                                continue;
                        }
                }
                if (t->bit->device < __t->bit->device)
                        p = &(*p)->rb_left;
                else if (t->bit->device > __t->bit->device)
                        p = &(*p)->rb_right;
                else if (t->bit->sequence < __t->bit->sequence)
                        p = &(*p)->rb_left;
                else    /* >= sequence */
                        p = &(*p)->rb_right;
        }

        rb_link_node(&t->rb_node, parent, p);
        rb_insert_color(&t->rb_node, root);
        return 0;
}

static inline int trace_rb_insert_sort(struct trace *t)
{
        if (!trace_rb_insert(t, &rb_sort_root, 1)) {
                rb_sort_entries++;
                return 0;
        }

        return 1;
}

static inline int trace_rb_insert_last(struct per_dev_info *pdi, struct trace *t)
{
        if (!trace_rb_insert(t, &pdi->rb_last, 1)) {
                pdi->rb_last_entries++;
                return 0;
        }

        return 1;
}

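/*
 * lookup by (device, sequence). in a time-ordered tree the descent
 * can stop slightly short of the match, so with 'order' set we also
 * scan a few entries past the stopping point
 */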
static struct trace *trace_rb_find(dev_t device, unsigned long sequence,
                                   struct rb_root *root, int order)
{
        struct rb_node *n = root->rb_node;
        struct rb_node *prev = NULL;
        struct trace *__t;

        while (n) {
                __t = rb_entry(n, struct trace, rb_node);
                prev = n;

                if (device < __t->bit->device)
                        n = n->rb_left;
                else if (device > __t->bit->device)
                        n = n->rb_right;
                else if (sequence < __t->bit->sequence)
                        n = n->rb_left;
                else if (sequence > __t->bit->sequence)
                        n = n->rb_right;
                else
                        return __t;
        }

        /*
         * hack - the list may not be sequence ordered because some
         * events don't have sequence and time matched. so we end up
         * being a little off in the rb lookup here, because we don't
         * know the time we are looking for. compensate by browsing
         * a little ahead from the last entry to find the match
         */
        if (order && prev) {
                int max = 5;

                while (((n = rb_next(prev)) != NULL) && max--) {
                        __t = rb_entry(n, struct trace, rb_node);

                        if (__t->bit->device == device &&
                            __t->bit->sequence == sequence)
                                return __t;

                        prev = n;
                }
        }

        return NULL;
}

static inline struct trace *trace_rb_find_sort(dev_t dev, unsigned long seq)
{
        return trace_rb_find(dev, seq, &rb_sort_root, 1);
}

static inline struct trace *trace_rb_find_last(struct per_dev_info *pdi,
                                               unsigned long seq)
{
        return trace_rb_find(pdi->dev, seq, &pdi->rb_last, 0);
}

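/*
 * io tracking keeps one io_track node per in-flight request, keyed
 * by start sector in a per-device rbtree, so that allocation, queue,
 * dispatch and completion times can be matched up
 */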
static inline int track_rb_insert(struct per_dev_info *pdi, struct io_track *iot)
{
        struct rb_node **p = &pdi->rb_track.rb_node;
        struct rb_node *parent = NULL;
        struct io_track *__iot;

        while (*p) {
                parent = *p;
                __iot = rb_entry(parent, struct io_track, rb_node);

                if (iot->sector < __iot->sector)
                        p = &(*p)->rb_left;
                else if (iot->sector > __iot->sector)
                        p = &(*p)->rb_right;
                else {
                        fprintf(stderr,
                                "sector alias (%Lu) on device %d,%d!\n",
                                (unsigned long long) iot->sector,
                                MAJOR(pdi->dev), MINOR(pdi->dev));
                        return 1;
                }
        }

        rb_link_node(&iot->rb_node, parent, p);
        rb_insert_color(&iot->rb_node, &pdi->rb_track);
        return 0;
}

static struct io_track *__find_track(struct per_dev_info *pdi, __u64 sector)
{
        struct rb_node *n = pdi->rb_track.rb_node;
        struct io_track *__iot;

        while (n) {
                __iot = rb_entry(n, struct io_track, rb_node);

                if (sector < __iot->sector)
                        n = n->rb_left;
                else if (sector > __iot->sector)
                        n = n->rb_right;
                else
                        return __iot;
        }

        return NULL;
}

static struct io_track *find_track(struct per_dev_info *pdi, __u32 pid,
                                   char *comm, __u64 sector)
{
        struct io_track *iot;

        iot = __find_track(pdi, sector);
        if (!iot) {
                iot = malloc(sizeof(*iot));
                /*
                 * zero-fill, the tracking code tests the timestamps
                 * against 0 to see which events have been seen
                 */
                memset(iot, 0, sizeof(*iot));
                iot->pid = pid;
                memcpy(iot->comm, comm, sizeof(iot->comm));
                iot->sector = sector;
                track_rb_insert(pdi, iot);
        }

        return iot;
}

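/*
 * a front merge extends an existing request downwards: re-key the
 * entry tracked at (t->sector + t_sec(t)) to its new start sector
 */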
static void log_track_frontmerge(struct per_dev_info *pdi,
                                 struct blk_io_trace *t)
{
        struct io_track *iot;

        if (!track_ios)
                return;

        iot = __find_track(pdi, t->sector + t_sec(t));
        if (!iot) {
                if (verbose)
                        fprintf(stderr, "merge not found for (%d,%d): %llu\n",
                                MAJOR(pdi->dev), MINOR(pdi->dev),
                                (unsigned long long) t->sector + t_sec(t));
                return;
        }

        rb_erase(&iot->rb_node, &pdi->rb_track);
        iot->sector -= t_sec(t);
        track_rb_insert(pdi, iot);
}

static void log_track_getrq(struct per_dev_info *pdi, struct blk_io_trace *t)
{
        struct io_track *iot;

        if (!track_ios)
                return;

        iot = find_track(pdi, t->pid, t->comm, t->sector);
        iot->allocation_time = t->time;
}

/*
 * return time between rq allocation and insertion
 */
static unsigned long long log_track_insert(struct per_dev_info *pdi,
                                           struct blk_io_trace *t)
{
        unsigned long long elapsed;
        struct io_track *iot;

        if (!track_ios)
                return -1;

        iot = find_track(pdi, t->pid, t->comm, t->sector);
        iot->queue_time = t->time;

        if (!iot->allocation_time)
                return -1;

        elapsed = iot->queue_time - iot->allocation_time;

        if (per_process_stats) {
                struct per_process_info *ppi = find_process(iot->pid, iot->comm);
                int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;

                if (ppi && elapsed > ppi->longest_allocation_wait[w])
                        ppi->longest_allocation_wait[w] = elapsed;
        }

        return elapsed;
}

/*
 * return time between queue and issue
 */
static unsigned long long log_track_issue(struct per_dev_info *pdi,
                                          struct blk_io_trace *t)
{
        unsigned long long elapsed;
        struct io_track *iot;

        if (!track_ios)
                return -1;
        if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
                return -1;

        iot = __find_track(pdi, t->sector);
        if (!iot) {
                if (verbose)
                        fprintf(stderr, "issue not found for (%d,%d): %llu\n",
                                MAJOR(pdi->dev), MINOR(pdi->dev),
                                (unsigned long long) t->sector);
                return -1;
        }

        iot->dispatch_time = t->time;
        elapsed = iot->dispatch_time - iot->queue_time;

        if (per_process_stats) {
                struct per_process_info *ppi = find_process(iot->pid, iot->comm);
                int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;

                if (ppi && elapsed > ppi->longest_dispatch_wait[w])
                        ppi->longest_dispatch_wait[w] = elapsed;
        }

        return elapsed;
}

/*
 * return time between dispatch and complete
 */
static unsigned long long log_track_complete(struct per_dev_info *pdi,
                                             struct blk_io_trace *t)
{
        unsigned long long elapsed;
        struct io_track *iot;

        if (!track_ios)
                return -1;
        if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
                return -1;

        iot = __find_track(pdi, t->sector);
        if (!iot) {
                if (verbose)
                        fprintf(stderr, "complete not found for (%d,%d): %llu\n",
                                MAJOR(pdi->dev), MINOR(pdi->dev),
                                (unsigned long long) t->sector);
                return -1;
        }

        iot->completion_time = t->time;
        elapsed = iot->completion_time - iot->dispatch_time;

        if (per_process_stats) {
                struct per_process_info *ppi = find_process(iot->pid, iot->comm);
                int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;

                if (ppi && elapsed > ppi->longest_completion_wait[w])
                        ppi->longest_completion_wait[w] = elapsed;
        }

        /*
         * kill the trace, we don't need it after completion
         */
        rb_erase(&iot->rb_node, &pdi->rb_track);
        free(iot);

        return elapsed;
}

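/*
 * lookup, or lazily create, the io_stats for a process. new entries
 * are added both to the hash and to the flat reporting list
 */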
static struct io_stats *find_process_io_stats(__u32 pid, char *name)
{
        struct per_process_info *ppi = find_process(pid, name);

        if (!ppi) {
                ppi = malloc(sizeof(*ppi));
                memset(ppi, 0, sizeof(*ppi));
                memcpy(ppi->name, name, 16);
                ppi->pid = pid;
                add_process_to_hash(ppi);
                add_process_to_list(ppi);
        }

        return &ppi->io_stats;
}

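/*
 * grow the per-cpu array so it covers 'cpu', zero-filling the newly
 * added slots; bails out of the program entirely if realloc fails
 */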
static void resize_cpu_info(struct per_dev_info *pdi, int cpu)
{
        struct per_cpu_info *cpus = pdi->cpus;
        int ncpus = pdi->ncpus;
        int new_count = cpu + 1;
        int new_space, size;
        char *new_start;

        size = new_count * sizeof(struct per_cpu_info);
        cpus = realloc(cpus, size);
        if (!cpus) {
                char name[20];
                fprintf(stderr, "Out of memory, CPU info for device %s (%d)\n",
                        get_dev_name(pdi, name, sizeof(name)), size);
                exit(1);
        }

        new_start = (char *)cpus + (ncpus * sizeof(struct per_cpu_info));
        new_space = (new_count - ncpus) * sizeof(struct per_cpu_info);
        memset(new_start, 0, new_space);

        pdi->ncpus = new_count;
        pdi->cpus = cpus;
}

static struct per_cpu_info *get_cpu_info(struct per_dev_info *pdi, int cpu)
{
        struct per_cpu_info *pci;

        if (cpu >= pdi->ncpus)
                resize_cpu_info(pdi, cpu);

        pci = &pdi->cpus[cpu];
        pci->cpu = cpu;
        return pci;
}

static int resize_devices(char *name)
{
        int size = (ndevices + 1) * sizeof(struct per_dev_info);

        devices = realloc(devices, size);
        if (!devices) {
                fprintf(stderr, "Out of memory, device %s (%d)\n", name, size);
                return 1;
        }
        memset(&devices[ndevices], 0, sizeof(struct per_dev_info));
        devices[ndevices].name = name;
        ndevices++;
        return 0;
}

static struct per_dev_info *get_dev_info(dev_t dev)
{
        struct per_dev_info *pdi;
        int i;

        for (i = 0; i < ndevices; i++) {
                if (!devices[i].dev)
                        devices[i].dev = dev;
                if (devices[i].dev == dev)
                        return &devices[i];
        }

        if (resize_devices(NULL))
                return NULL;

        pdi = &devices[ndevices - 1];
        pdi->dev = dev;
        pdi->first_reported_time = 0;
        pdi->last_sequence = -1;
        pdi->last_read_time = 0;
        memset(&pdi->rb_last, 0, sizeof(pdi->rb_last));
        pdi->rb_last_entries = 0;
        return pdi;
}

static char *get_dev_name(struct per_dev_info *pdi, char *buffer, int size)
{
        if (pdi->name)
                snprintf(buffer, size, "%s", pdi->name);
        else
                snprintf(buffer, size, "%d,%d", MAJOR(pdi->dev), MINOR(pdi->dev));
        return buffer;
}

static void check_time(struct per_dev_info *pdi, struct blk_io_trace *bit)
{
        unsigned long long this = bit->time;
        unsigned long long last = pdi->last_reported_time;

        pdi->backwards = (this < last) ? 'B' : ' ';
        pdi->last_reported_time = this;
}

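/*
 * the account_*() helpers below share one pattern: update the per-cpu
 * io_stats, and mirror the update into the owning process' stats when
 * per_process_stats is set
 */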
static inline void __account_m(struct io_stats *ios, struct blk_io_trace *t,
                               int rw)
{
        if (rw) {
                ios->mwrites++;
                ios->qwrite_kb += t_kb(t);
        } else {
                ios->mreads++;
                ios->qread_kb += t_kb(t);
        }
}

static inline void account_m(struct blk_io_trace *t, struct per_cpu_info *pci,
                             int rw)
{
        __account_m(&pci->io_stats, t, rw);

        if (per_process_stats) {
                struct io_stats *ios = find_process_io_stats(t->pid, t->comm);

                __account_m(ios, t, rw);
        }
}

static inline void __account_queue(struct io_stats *ios, struct blk_io_trace *t,
                                   int rw)
{
        if (rw) {
                ios->qwrites++;
                ios->qwrite_kb += t_kb(t);
        } else {
                ios->qreads++;
                ios->qread_kb += t_kb(t);
        }
}

static inline void account_queue(struct blk_io_trace *t,
                                 struct per_cpu_info *pci, int rw)
{
        __account_queue(&pci->io_stats, t, rw);

        if (per_process_stats) {
                struct io_stats *ios = find_process_io_stats(t->pid, t->comm);

                __account_queue(ios, t, rw);
        }
}

static inline void __account_c(struct io_stats *ios, int rw, int bytes)
{
        if (rw) {
                ios->cwrites++;
                ios->cwrite_kb += bytes >> 10;
        } else {
                ios->creads++;
                ios->cread_kb += bytes >> 10;
        }
}

static inline void account_c(struct blk_io_trace *t, struct per_cpu_info *pci,
                             int rw, int bytes)
{
        __account_c(&pci->io_stats, rw, bytes);

        if (per_process_stats) {
                struct io_stats *ios = find_process_io_stats(t->pid, t->comm);

                __account_c(ios, rw, bytes);
        }
}

static inline void __account_issue(struct io_stats *ios, int rw,
                                   unsigned int bytes)
{
        if (rw) {
                ios->iwrites++;
                ios->iwrite_kb += bytes >> 10;
        } else {
                ios->ireads++;
                ios->iread_kb += bytes >> 10;
        }
}

static inline void account_issue(struct blk_io_trace *t,
                                 struct per_cpu_info *pci, int rw)
{
        __account_issue(&pci->io_stats, rw, t->bytes);

        if (per_process_stats) {
                struct io_stats *ios = find_process_io_stats(t->pid, t->comm);

                __account_issue(ios, rw, t->bytes);
        }
}

static inline void __account_unplug(struct io_stats *ios, int timer)
{
        if (timer)
                ios->timer_unplugs++;
        else
                ios->io_unplugs++;
}

static inline void account_unplug(struct blk_io_trace *t,
                                  struct per_cpu_info *pci, int timer)
{
        __account_unplug(&pci->io_stats, timer);

        if (per_process_stats) {
                struct io_stats *ios = find_process_io_stats(t->pid, t->comm);

                __account_unplug(ios, timer);
        }
}

static inline void __account_requeue(struct io_stats *ios,
                                     struct blk_io_trace *t, int rw)
{
        if (rw) {
                ios->wrqueue++;
                ios->iwrite_kb -= t_kb(t);
        } else {
                ios->rrqueue++;
                ios->iread_kb -= t_kb(t);
        }
}

static inline void account_requeue(struct blk_io_trace *t,
                                   struct per_cpu_info *pci, int rw)
{
        __account_requeue(&pci->io_stats, t, rw);

        if (per_process_stats) {
                struct io_stats *ios = find_process_io_stats(t->pid, t->comm);

                __account_requeue(ios, t, rw);
        }
}

static void log_complete(struct per_dev_info *pdi, struct per_cpu_info *pci,
                         struct blk_io_trace *t, char *act)
{
        process_fmt(act, pci, t, log_track_complete(pdi, t), 0, NULL);
}

static void log_insert(struct per_dev_info *pdi, struct per_cpu_info *pci,
                       struct blk_io_trace *t, char *act)
{
        process_fmt(act, pci, t, log_track_insert(pdi, t), 0, NULL);
}

static void log_queue(struct per_cpu_info *pci, struct blk_io_trace *t,
                      char *act)
{
        process_fmt(act, pci, t, -1, 0, NULL);
}

static void log_issue(struct per_dev_info *pdi, struct per_cpu_info *pci,
                      struct blk_io_trace *t, char *act)
{
        process_fmt(act, pci, t, log_track_issue(pdi, t), 0, NULL);
}

static void log_merge(struct per_dev_info *pdi, struct per_cpu_info *pci,
                      struct blk_io_trace *t, char *act)
{
        if (act[0] == 'F')
                log_track_frontmerge(pdi, t);

        process_fmt(act, pci, t, -1ULL, 0, NULL);
}

static void log_action(struct per_cpu_info *pci, struct blk_io_trace *t,
                       char *act)
{
        process_fmt(act, pci, t, -1ULL, 0, NULL);
}

static void log_generic(struct per_cpu_info *pci, struct blk_io_trace *t,
                        char *act)
{
        process_fmt(act, pci, t, -1ULL, 0, NULL);
}

static void log_unplug(struct per_cpu_info *pci, struct blk_io_trace *t,
                       char *act)
{
        process_fmt(act, pci, t, -1ULL, 0, NULL);
}

static void log_split(struct per_cpu_info *pci, struct blk_io_trace *t,
                      char *act)
{
        process_fmt(act, pci, t, -1ULL, 0, NULL);
}

static void log_pc(struct per_cpu_info *pci, struct blk_io_trace *t, char *act)
{
        unsigned char *buf = (unsigned char *) t + sizeof(*t);

        process_fmt(act, pci, t, -1ULL, t->pdu_len, buf);
}

static void dump_trace_pc(struct blk_io_trace *t, struct per_cpu_info *pci)
{
        int act = t->action & 0xffff;

        switch (act) {
                case __BLK_TA_QUEUE:
                        log_generic(pci, t, "Q");
                        break;
                case __BLK_TA_GETRQ:
                        log_generic(pci, t, "G");
                        break;
                case __BLK_TA_SLEEPRQ:
                        log_generic(pci, t, "S");
                        break;
                case __BLK_TA_REQUEUE:
                        log_generic(pci, t, "R");
                        break;
                case __BLK_TA_ISSUE:
                        log_pc(pci, t, "D");
                        break;
                case __BLK_TA_COMPLETE:
                        log_pc(pci, t, "C");
                        break;
                case __BLK_TA_INSERT:
                        log_pc(pci, t, "I");
                        break;
                default:
                        fprintf(stderr, "Bad pc action %x\n", act);
                        break;
        }
}

static void dump_trace_fs(struct blk_io_trace *t, struct per_dev_info *pdi,
                          struct per_cpu_info *pci)
{
        int w = t->action & BLK_TC_ACT(BLK_TC_WRITE);
        int act = t->action & 0xffff;

        switch (act) {
                case __BLK_TA_QUEUE:
                        account_queue(t, pci, w);
                        log_queue(pci, t, "Q");
                        break;
                case __BLK_TA_INSERT:
                        log_insert(pdi, pci, t, "I");
                        break;
                case __BLK_TA_BACKMERGE:
                        account_m(t, pci, w);
                        log_merge(pdi, pci, t, "M");
                        break;
                case __BLK_TA_FRONTMERGE:
                        account_m(t, pci, w);
                        log_merge(pdi, pci, t, "F");
                        break;
                case __BLK_TA_GETRQ:
                        log_track_getrq(pdi, t);
                        log_generic(pci, t, "G");
                        break;
                case __BLK_TA_SLEEPRQ:
                        log_generic(pci, t, "S");
                        break;
                case __BLK_TA_REQUEUE:
                        account_requeue(t, pci, w);
                        log_queue(pci, t, "R");
                        break;
                case __BLK_TA_ISSUE:
                        account_issue(t, pci, w);
                        log_issue(pdi, pci, t, "D");
                        break;
                case __BLK_TA_COMPLETE:
                        account_c(t, pci, w, t->bytes);
                        log_complete(pdi, pci, t, "C");
                        break;
                case __BLK_TA_PLUG:
                        log_action(pci, t, "P");
                        break;
                case __BLK_TA_UNPLUG_IO:
                        account_unplug(t, pci, 0);
                        log_unplug(pci, t, "U");
                        break;
                case __BLK_TA_UNPLUG_TIMER:
                        account_unplug(t, pci, 1);
                        log_unplug(pci, t, "UT");
                        break;
                case __BLK_TA_SPLIT:
                        log_split(pci, t, "X");
                        break;
                case __BLK_TA_BOUNCE:
                        log_generic(pci, t, "B");
                        break;
                case __BLK_TA_REMAP:
                        log_generic(pci, t, "A");
                        break;
                default:
                        fprintf(stderr, "Bad fs action %x\n", t->action);
                        break;
        }
}

static void dump_trace(struct blk_io_trace *t, struct per_cpu_info *pci,
                       struct per_dev_info *pdi)
{
        if (t->action & BLK_TC_ACT(BLK_TC_PC))
                dump_trace_pc(t, pci);
        else
                dump_trace_fs(t, pdi, pci);

        if (!pdi->events)
                pdi->first_reported_time = t->time;

        pdi->events++;
}

/*
 * print in a proper way, not too small and not too big. if more than
 * 1,000,000 in the current unit, scale up to the next suffix, and so on
 */
static char *size_cnv(char *dst, unsigned long long num, int in_kb)
{
        char suff[] = { '\0', 'K', 'M', 'G', 'P' };
        unsigned int i = 0;

        if (in_kb)
                i++;

        while (num > 1000 * 1000ULL && (i < sizeof(suff) - 1)) {
                i++;
                num /= 1000;
        }

        sprintf(dst, "%'8Lu%c", num, suff[i]);
        return dst;
}
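/*
 * example: with a grouping locale, size_cnv(buf, 12345678, 0) scales
 * once and yields "  12,345K"; with in_kb set the suffix starts at
 * 'K', so plain KB counts stay e.g. "   2,048K"
 */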

static void dump_io_stats(struct io_stats *ios, char *msg)
{
        static char x[256], y[256];

        fprintf(ofp, "%s\n", msg);

        fprintf(ofp, " Reads Queued:    %s, %siB\t", size_cnv(x, ios->qreads, 0), size_cnv(y, ios->qread_kb, 1));
        fprintf(ofp, " Writes Queued:    %s, %siB\n", size_cnv(x, ios->qwrites, 0), size_cnv(y, ios->qwrite_kb, 1));

        fprintf(ofp, " Read Dispatches: %s, %siB\t", size_cnv(x, ios->ireads, 0), size_cnv(y, ios->iread_kb, 1));
        fprintf(ofp, " Write Dispatches: %s, %siB\n", size_cnv(x, ios->iwrites, 0), size_cnv(y, ios->iwrite_kb, 1));
        fprintf(ofp, " Reads Requeued:  %s\t\t", size_cnv(x, ios->rrqueue, 0));
        fprintf(ofp, " Writes Requeued:  %s\n", size_cnv(x, ios->wrqueue, 0));
        fprintf(ofp, " Reads Completed: %s, %siB\t", size_cnv(x, ios->creads, 0), size_cnv(y, ios->cread_kb, 1));
        fprintf(ofp, " Writes Completed: %s, %siB\n", size_cnv(x, ios->cwrites, 0), size_cnv(y, ios->cwrite_kb, 1));
        fprintf(ofp, " Read Merges:     %'8lu%8c\t", ios->mreads, ' ');
        fprintf(ofp, " Write Merges:     %'8lu\n", ios->mwrites);
        fprintf(ofp, " IO unplugs:      %'8lu%8c\t", ios->io_unplugs, ' ');
        fprintf(ofp, " Timer unplugs:    %'8lu\n", ios->timer_unplugs);
}

static void dump_wait_stats(struct per_process_info *ppi)
{
        unsigned long rawait = ppi->longest_allocation_wait[0] / 1000;
        unsigned long rdwait = ppi->longest_dispatch_wait[0] / 1000;
        unsigned long rcwait = ppi->longest_completion_wait[0] / 1000;
        unsigned long wawait = ppi->longest_allocation_wait[1] / 1000;
        unsigned long wdwait = ppi->longest_dispatch_wait[1] / 1000;
        unsigned long wcwait = ppi->longest_completion_wait[1] / 1000;

        fprintf(ofp, " Allocation wait: %'8lu%8c\t", rawait, ' ');
        fprintf(ofp, " Allocation wait:  %'8lu\n", wawait);
        fprintf(ofp, " Dispatch wait:   %'8lu%8c\t", rdwait, ' ');
        fprintf(ofp, " Dispatch wait:    %'8lu\n", wdwait);
        fprintf(ofp, " Completion wait: %'8lu%8c\t", rcwait, ' ');
        fprintf(ofp, " Completion wait:  %'8lu\n", wcwait);
}

static int ppi_name_compare(const void *p1, const void *p2)
{
        struct per_process_info *ppi1 = *((struct per_process_info **) p1);
        struct per_process_info *ppi2 = *((struct per_process_info **) p2);
        int res;

        res = strverscmp(ppi1->name, ppi2->name);
        if (!res)
                res = ppi1->pid > ppi2->pid;

        return res;
}

static void sort_process_list(void)
{
        struct per_process_info **ppis;
        struct per_process_info *ppi;
        int i = 0;

        ppis = malloc(ppi_list_entries * sizeof(struct per_process_info *));

        ppi = ppi_list;
        while (ppi) {
                ppis[i++] = ppi;
                ppi = ppi->list_next;
        }

        qsort(ppis, ppi_list_entries, sizeof(ppi), ppi_name_compare);

        i = ppi_list_entries - 1;
        ppi_list = NULL;
        while (i >= 0) {
                ppi = ppis[i];

                ppi->list_next = ppi_list;
                ppi_list = ppi;
                i--;
        }

        free(ppis);
}

static void show_process_stats(void)
{
        struct per_process_info *ppi;

        sort_process_list();

        ppi = ppi_list;
        while (ppi) {
                char name[64];

                if (ppi->more_than_one)
                        sprintf(name, "%s (%u, ...)", ppi->name, ppi->pid);
                else
                        sprintf(name, "%s (%u)", ppi->name, ppi->pid);

                dump_io_stats(&ppi->io_stats, name);
                dump_wait_stats(ppi);
                ppi = ppi->list_next;
        }

        fprintf(ofp, "\n");
}

static void show_device_and_cpu_stats(void)
{
        struct per_dev_info *pdi;
        struct per_cpu_info *pci;
        struct io_stats total, *ios;
        unsigned long long rrate, wrate, msec;
        int i, j, pci_events;
        char line[3 + 8/*cpu*/ + 2 + 32/*dev*/ + 3];
        char name[32];

        for (pdi = devices, i = 0; i < ndevices; i++, pdi++) {

                memset(&total, 0, sizeof(total));
                pci_events = 0;

                if (i > 0)
                        fprintf(ofp, "\n");

                for (pci = pdi->cpus, j = 0; j < pdi->ncpus; j++, pci++) {
                        if (!pci->nelems)
                                continue;

                        ios = &pci->io_stats;
                        total.qreads += ios->qreads;
                        total.qwrites += ios->qwrites;
                        total.creads += ios->creads;
                        total.cwrites += ios->cwrites;
                        total.mreads += ios->mreads;
                        total.mwrites += ios->mwrites;
                        total.ireads += ios->ireads;
                        total.iwrites += ios->iwrites;
                        total.rrqueue += ios->rrqueue;
                        total.wrqueue += ios->wrqueue;
                        total.qread_kb += ios->qread_kb;
                        total.qwrite_kb += ios->qwrite_kb;
                        total.cread_kb += ios->cread_kb;
                        total.cwrite_kb += ios->cwrite_kb;
                        total.iread_kb += ios->iread_kb;
                        total.iwrite_kb += ios->iwrite_kb;
                        total.timer_unplugs += ios->timer_unplugs;
                        total.io_unplugs += ios->io_unplugs;

                        snprintf(line, sizeof(line) - 1, "CPU%d (%s):",
                                 j, get_dev_name(pdi, name, sizeof(name)));
                        dump_io_stats(ios, line);
                        pci_events++;
                }

                if (pci_events > 1) {
                        fprintf(ofp, "\n");
                        snprintf(line, sizeof(line) - 1, "Total (%s):",
                                 get_dev_name(pdi, name, sizeof(name)));
                        dump_io_stats(&total, line);
                }

                wrate = rrate = 0;
                msec = (pdi->last_reported_time - pdi->first_reported_time) / 1000000;
                if (msec) {
                        rrate = 1000 * total.cread_kb / msec;
                        wrate = 1000 * total.cwrite_kb / msec;
                }

                fprintf(ofp, "\nThroughput (R/W): %'LuKiB/s / %'LuKiB/s\n",
                        rrate, wrate);
                fprintf(ofp, "Events (%s): %'Lu entries\n",
                        get_dev_name(pdi, line, sizeof(line)), pdi->events);
                fprintf(ofp, "Skips: %'lu forward (%'llu - %5.1lf%%) %'lu backward (%'llu - %5.1lf%%)\n",
                        pdi->skips, pdi->seq_skips,
                        100.0 * ((double)pdi->seq_skips /
                                (double)(pdi->events + pdi->seq_skips)),
                        pdi->nskips, pdi->seq_nskips,
                        100.0 * ((double)pdi->seq_nskips /
                                (double)(pdi->events + pdi->seq_skips)));
        }
}

/*
 * struct trace and blktrace allocation cache, we do potentially
 * millions of mallocs for these structures while only using at most
 * a few thousand at the time
 */
static inline void t_free(struct trace *t)
{
        if (t_alloc_cache < 1024) {
                t->next = t_alloc_list;
                t_alloc_list = t;
                t_alloc_cache++;
        } else
                free(t);
}

static inline struct trace *t_alloc(void)
{
        struct trace *t = t_alloc_list;

        if (t) {
                t_alloc_list = t->next;
                t_alloc_cache--;
                return t;
        }

        return malloc(sizeof(*t));
}

static inline void bit_free(struct blk_io_trace *bit)
{
        if (bit_alloc_cache < 1024 && !bit->pdu_len) {
                /*
                 * abuse a 64-bit field for a next pointer for the free item
                 */
                bit->time = (__u64) (unsigned long) bit_alloc_list;
                bit_alloc_list = (struct blk_io_trace *) bit;
                bit_alloc_cache++;
        } else
                free(bit);
}

static inline struct blk_io_trace *bit_alloc(void)
{
        struct blk_io_trace *bit = bit_alloc_list;

        if (bit) {
                bit_alloc_list = (struct blk_io_trace *) (unsigned long) bit->time;
                bit_alloc_cache--;
                return bit;
        }

        return malloc(sizeof(*bit));
}

static void find_genesis(void)
{
        struct trace *t = trace_list;

        genesis_time = -1ULL;
        while (t != NULL) {
                if (t->bit->time < genesis_time)
                        genesis_time = t->bit->time;

                t = t->next;
        }
}

static inline int check_stopwatch(struct blk_io_trace *bit)
{
        if (bit->time < stopwatch_end &&
            bit->time >= stopwatch_start)
                return 0;

        return 1;
}

/*
 * return youngest entry read
 */
static int sort_entries(unsigned long long *youngest)
{
        struct trace *t;

        if (!genesis_time)
                find_genesis();

        *youngest = 0;
        while ((t = trace_list) != NULL) {
                struct blk_io_trace *bit = t->bit;

                trace_list = t->next;

                bit->time -= genesis_time;

                if (bit->time < *youngest || !*youngest)
                        *youngest = bit->time;

                if (bit->sequence < smallest_seq_read)
                        smallest_seq_read = bit->sequence;

                if (check_stopwatch(bit)) {
                        bit_free(bit);
                        t_free(t);
                        continue;
                }

                if (trace_rb_insert_sort(t))
                        return -1;
        }

        return 0;
}

static inline void __put_trace_last(struct per_dev_info *pdi, struct trace *t)
{
        rb_erase(&t->rb_node, &pdi->rb_last);
        pdi->rb_last_entries--;

        bit_free(t->bit);
        t_free(t);
}

static void put_trace(struct per_dev_info *pdi, struct trace *t)
{
        rb_erase(&t->rb_node, &rb_sort_root);
        rb_sort_entries--;

        trace_rb_insert_last(pdi, t);

        if (pdi->rb_last_entries > rb_batch * pdi->nfiles) {
                struct rb_node *n = rb_first(&pdi->rb_last);

                t = rb_entry(n, struct trace, rb_node);
                __put_trace_last(pdi, t);
        }
}

1460  * to continue, we must have traces from all online cpus in the tree
1461  */
1462 static int check_cpu_map(struct per_dev_info *pdi)
1463 {
1464         unsigned long *cpu_map;
1465         struct rb_node *n;
1466         struct trace *__t;
1467         unsigned int i;
1468         int ret, cpu;
1469
1470         /*
1471          * create a map of the cpus we have traces for
1472          */
1473         cpu_map = malloc(pdi->cpu_map_max / sizeof(long));
1474         n = rb_first(&rb_sort_root);
1475         while (n) {
1476                 __t = rb_entry(n, struct trace, rb_node);
1477                 cpu = __t->bit->cpu;
1478
1479                 cpu_map[CPU_IDX(cpu)] |= (1UL << CPU_BIT(cpu));
1480                 n = rb_next(n);
1481         }
1482
1483         /*
1484          * we can't continue if pdi->cpu_map has entries set that we don't
1485          * have in the sort rbtree. the opposite is not a problem, though
1486          */
1487         ret = 0;
1488         for (i = 0; i < pdi->cpu_map_max / CPUS_PER_LONG; i++) {
1489                 if (pdi->cpu_map[i] & ~(cpu_map[i])) {
1490                         ret = 1;
1491                         break;
1492                 }
1493         }
1494
1495         free(cpu_map);
1496         return ret;
1497 }
1498
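/*
 * decide what to do about a sequence gap: pull the missing entry back
 * out of the rb_last tree, wait for more input, or (with force set)
 * account it as a forward/backward skip
 */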
static int check_sequence(struct per_dev_info *pdi, struct trace *t, int force)
{
        unsigned long expected_sequence = pdi->last_sequence + 1;
        struct blk_io_trace *bit = t->bit;
        struct trace *__t;

        if (!expected_sequence) {
                /*
                 * 1 should be the first entry, just allow it
                 */
                if (bit->sequence == 1)
                        return 0;

                return check_cpu_map(pdi);
        }

        if (bit->sequence == expected_sequence)
                return 0;

        /*
         * we may not have seen that sequence yet. if we are not doing
         * the final run, break and wait for more entries.
         */
        if (expected_sequence < smallest_seq_read) {
                __t = trace_rb_find_last(pdi, expected_sequence);
                if (!__t)
                        goto skip;

                __put_trace_last(pdi, __t);
                return 0;
        } else if (!force) {
                return 1;
        } else {
skip:
                if (verbose) {
                        fprintf(stderr, "(%d,%d): skipping %lu -> %u\n",
                                MAJOR(pdi->dev), MINOR(pdi->dev),
                                pdi->last_sequence, bit->sequence);
                }
                if (bit->sequence > pdi->last_sequence) {
                        pdi->skips++;
                        pdi->seq_skips += (bit->sequence - pdi->last_sequence);
                } else {
                        pdi->nskips++;
                        pdi->seq_nskips += (pdi->last_sequence - bit->sequence);
                }
                return 0;
        }
}

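/*
 * emit sorted traces until we hit last_allowed_time or a sequence
 * gap that a later read may still fill
 */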
static void show_entries_rb(int force)
{
        struct per_dev_info *pdi = NULL;
        struct per_cpu_info *pci = NULL;
        struct blk_io_trace *bit;
        struct rb_node *n;
        struct trace *t;

        while ((n = rb_first(&rb_sort_root)) != NULL) {
                if (is_done() && !force && !pipeline)
                        break;

                t = rb_entry(n, struct trace, rb_node);
                bit = t->bit;

                if (!pdi || pdi->dev != bit->device)
                        pdi = get_dev_info(bit->device);

                if (!pdi) {
                        fprintf(stderr, "Unknown device ID? (%d,%d)\n",
                                MAJOR(bit->device), MINOR(bit->device));
                        break;
                }

                if (check_sequence(pdi, t, force))
                        break;

                if (!force && bit->time > last_allowed_time)
                        break;

                pdi->last_sequence = bit->sequence;

                check_time(pdi, bit);

                if (!pci || pci->cpu != bit->cpu)
                        pci = get_cpu_info(pdi, bit->cpu);

                pci->nelems++;

                if (bit->action & (act_mask << BLK_TC_SHIFT))
                        dump_trace(bit, pci, pdi);

                put_trace(pdi, t);
        }
}

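/*
 * read 'bytes' bytes from fd, optionally non-blocking. returns 0 on
 * success, 1 on end of file, -1 on error (EAGAIN included, though it
 * isn't reported via perror)
 */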
1595 static int read_data(int fd, void *buffer, int bytes, int block)
1596 {
1597         int ret, bytes_left, fl;
1598         void *p;
1599
1600         fl = fcntl(fd, F_GETFL);
1601
1602         if (!block)
1603                 fcntl(fd, F_SETFL, fl | O_NONBLOCK);
1604         else
1605                 fcntl(fd, F_SETFL, fl & ~O_NONBLOCK);
1606
1607         bytes_left = bytes;
1608         p = buffer;
1609         while (bytes_left > 0) {
1610                 ret = read(fd, p, bytes_left);
1611                 if (!ret)
1612                         return 1;
1613                 else if (ret < 0) {
1614                         if (errno != EAGAIN)
1615                                 perror("read");
1616
1617                         return -1;
1618                 } else {
1619                         p += ret;
1620                         bytes_left -= ret;
1621                 }
1622         }
1623
1624         return 0;
1625 }
1626
1627 static int read_events(int fd, int always_block)
1628 {
1629         struct per_dev_info *pdi = NULL;
1630         unsigned int events = 0;
1631
1632         while (!is_done() && events < rb_batch) {
1633                 struct blk_io_trace *bit;
1634                 struct trace *t;
1635                 int pdu_len;
1636                 __u32 magic;
1637
1638                 bit = bit_alloc();
1639
1640                 if (read_data(fd, bit, sizeof(*bit), !events || always_block)) {
1641                         bit_free(bit);
1642                         break;
1643                 }
1644
1645                 magic = be32_to_cpu(bit->magic);
1646                 if ((magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
1647                         fprintf(stderr, "Bad magic %x\n", magic);
1648                         break;
1649                 }
1650
1651                 pdu_len = be16_to_cpu(bit->pdu_len);
1652                 if (pdu_len) {
1653                         void *ptr = realloc(bit, sizeof(*bit) + pdu_len);
1654
1655                         if (read_data(fd, ptr + sizeof(*bit), pdu_len, 1)) {
1656                                 bit_free(ptr);
1657                                 break;
1658                         }
1659
1660                         bit = ptr;
1661                 }
1662
1663                 trace_to_cpu(bit);
1664
1665                 if (verify_trace(bit)) {
1666                         bit_free(bit);
1667                         continue;
1668                 }
1669
1670                 t = t_alloc();
1671                 memset(t, 0, sizeof(*t));
1672                 t->bit = bit;
1673
1674                 t->next = trace_list;
1675                 trace_list = t;
1676
1677                 if (!pdi || pdi->dev != bit->device)
1678                         pdi = get_dev_info(bit->device);
1679
1680                 if (bit->time > pdi->last_read_time)
1681                         pdi->last_read_time = bit->time;
1682
1683                 events++;
1684         }
1685
1686         return events;
1687 }
1688
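     /*
      * File mode: open the per-cpu trace files for each device, then
      * batch-read from all of them, merging the events through the sort
      * rbtree. Output is bounded by last_allowed_time (the smallest
      * last-read timestamp across devices), so events are not displayed
      * before slower streams have caught up.
      */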
1689 static int do_file(void)
1690 {
1691         struct per_cpu_info *pci;
1692         struct per_dev_info *pdi;
1693         int i, j, events, events_added;
1694
1695         /*
1696          * first prepare all files for reading
1697          */
1698         for (i = 0; i < ndevices; i++) {
1699                 pdi = &devices[i];
1700                 pdi->nfiles = 0;
1701                 pdi->last_sequence = -1;
1702
1703                 for (j = 0;; j++) {
1704                         struct stat st;
1705                         int len = 0;
1706                         char *p, *dname;
1707
1708                         pci = get_cpu_info(pdi, j);
1709                         pci->cpu = j;
1710                         pci->fd = -1;
1711
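                             /*
                              * If the input name has a directory component,
                              * split it: the directory becomes input_dir and
                              * pdi->name is reduced to its basename. The
                              * first strdup() intentionally stays allocated,
                              * as input_dir may point into it.
                              */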
1712                         p = strdup(pdi->name);
1713                         dname = dirname(p);
1714                         if (strcmp(dname, ".")) {
1715                                 input_dir = dname;
1716                                 p = strdup(pdi->name);
1717                                 strcpy(pdi->name, basename(p));
1718                         }
1719                         free(p);
1720
1721                         if (input_dir)
1722                                 len = sprintf(pci->fname, "%s/", input_dir);
1723
1724                         snprintf(pci->fname + len, sizeof(pci->fname)-1-len,
1725                                  "%s.blktrace.%d", pdi->name, pci->cpu);
1726                         if (stat(pci->fname, &st) < 0)
1727                                 break;
1728                         if (st.st_size) {
1729                                 pci->fd = open(pci->fname, O_RDONLY);
1730                                 if (pci->fd < 0) {
1731                                         perror(pci->fname);
1732                                         continue;
1733                                 }
1734                         }
1735
1736                         printf("Input file %s added\n", pci->fname);
1737                         pdi->nfiles++;
1738                         cpu_mark_online(pdi, pci->cpu);
1739                 }
1740         }
1741
1742         /*
1743          * now loop over the files reading in the data
1744          */
1745         do {
1746                 unsigned long long youngest;
1747
1748                 events_added = 0;
1749                 last_allowed_time = -1ULL;
1750                 smallest_seq_read = -1U;
1751
1752                 for (i = 0; i < ndevices; i++) {
1753                         pdi = &devices[i];
1754
1755                         for (j = 0; j < pdi->nfiles; j++) {
1756
1757                                 pci = get_cpu_info(pdi, j);
1758
1759                                 if (pci->fd == -1)
1760                                         continue;
1761
1762                                 events = read_events(pci->fd, 1);
1763                                 if (!events) {
1764                                         cpu_mark_offline(pdi, pci->cpu);
1765                                         close(pci->fd);
1766                                         pci->fd = -1;
1767                                         continue;
1768                                 }
1769
1770                                 if (pdi->last_read_time < last_allowed_time)
1771                                         last_allowed_time = pdi->last_read_time;
1772
1773                                 events_added += events;
1774                         }
1775                 }
1776
1777                 if (sort_entries(&youngest))
1778                         break;
1779
1780                 if (youngest > stopwatch_end)
1781                         break;
1782
1783                 show_entries_rb(0);
1784
1785         } while (events_added);
1786
1787         if (rb_sort_entries)
1788                 show_entries_rb(1);
1789
1790         return 0;
1791 }
1792
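     /*
      * Pipe mode: read events from stdin in rb_batch chunks, sorting and
      * displaying after each chunk until EOF or the stopwatch end. stdin
      * is dup()'ed so the fd can be closed without touching the real
      * stdin.
      */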
1793 static int do_stdin(void)
1794 {
1795         unsigned long long youngest;
1796         int fd, events;
1797
1798         last_allowed_time = -1ULL;
1799         fd = dup(STDIN_FILENO);
1800         if (fd == -1) {
1801                 perror("dup stdin");
1802                 return -1;
1803         }
1804
1805         while ((events = read_events(fd, 0)) != 0) {
1806
1807                 smallest_seq_read = -1U;
1808
1809                 if (sort_entries(&youngest))
1810                         break;
1811
1812                 if (youngest > stopwatch_end)
1813                         break;
1814
1815                 show_entries_rb(0);
1816         }
1817
1818         if (rb_sort_entries)
1819                 show_entries_rb(1);
1820
1821         close(fd);
1822         return 0;
1823 }
1824
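     /*
      * Emit the end-of-run statistics exactly once; this can be reached
      * both from the normal exit path and from the signal handler.
      */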
1825 static void show_stats(void)
1826 {
1827         if (!ofp)
1828                 return;
1829         if (stats_printed)
1830                 return;
1831
1832         stats_printed = 1;
1833
1834         if (per_process_stats)
1835                 show_process_stats();
1836
1837         if (per_device_and_cpu_stats)
1838                 show_device_and_cpu_stats();
1839
1840         fflush(ofp);
1841 }
1842
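     /*
      * Note that show_stats() ends up in stdio, which is not
      * async-signal-safe; it works in practice here, but is technically
      * unsafe to call from a signal handler.
      */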
1843 static void handle_sigint(__attribute__((__unused__)) int sig)
1844 {
1845         done = 1;
1846         show_stats();
1847 }
1848
1849 /*
1850  * Extract start and end times from a string, allowing us to specify
1851  * a time interval of interest within a trace. Format: "end" (start
1852  * is zero) or "start:end", in seconds; e.g. "2.5:10" keeps 2.5s-10s.
1853  */
1854 static int find_stopwatch_interval(char *string)
1855 {
1856         double value;
1857         char *sp;
1858
1859         value = strtod(string, &sp);
1860         if (sp == string) {
1861                 fprintf(stderr,"Invalid stopwatch timer: %s\n", string);
1862                 return 1;
1863         }
1864         if (*sp == ':') {
1865                 stopwatch_start = DOUBLE_TO_NANO_ULL(value);
1866                 string = sp + 1;
1867                 value = strtod(string, &sp);
1868                 if (sp == string || *sp != '\0') {
1869                         fprintf(stderr,"Invalid stopwatch end time: %s\n",
1870                                 string);
1871                         return 1;
1872                 }
1873         } else if (*sp != '\0') {
1874                 fprintf(stderr,"Invalid stopwatch start time: %s\n", string);
1875                 return 1;
1876         }
1877         stopwatch_end = DOUBLE_TO_NANO_ULL(value);
1878         if (stopwatch_end <= stopwatch_start) {
1879                 fprintf(stderr, "Invalid stopwatch interval: %llu -> %llu\n",
1880                         stopwatch_start, stopwatch_end);
1881                 return 1;
1882         }
1883
1884         return 0;
1885 }
1886
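     /*
      * Keep this usage text in sync with S_OPTS and the l_opts table
      * near the top of the file.
      */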
1887 static char usage_str[] = \
1888         "[ -i <input name> ] [ -o <output name> ] [ -s ] [ -t ] [ -q ]\n" \
1889         "[ -w start:stop ] [ -f output format ] [ -F format spec ] [ -v ]\n\n" \
1890         "\t-i Input file containing trace data, or '-' for stdin\n" \
1891         "\t-D Directory to prepend to input file names\n" \
1892         "\t-o Output file. If not given, output is stdout\n" \
1893         "\t-b stdin read batching\n" \
1894         "\t-s Show per-program io statistics\n" \
1895         "\t-h Hash processes by name, not pid\n" \
1896         "\t-t Track individual ios. Will tell you the time a request took\n" \
1897         "\t   to get queued, to get dispatched, and to get completed\n" \
1898         "\t-q Quiet. Don't display any stats at the end of the trace\n" \
1899         "\t-w Only parse data between the given time interval in seconds.\n" \
1900         "\t   If 'start' isn't given, blkparse defaults the start time to 0\n" \
1901         "\t-f Output format. Customize the output format. The format field\n" \
1902         "\t   identifiers can be found in the documentation\n" \
1903         "\t-F Format specification. Can be found in the documentation\n" \
1904         "\t-v More verbose for marginal errors\n" \
1905         "\t-V Print program version info\n\n";
1906
1907 static void usage(char *prog)
1908 {
1909         fprintf(stderr, "Usage: %s %s %s", prog, blkparse_version, usage_str);
1910 }
1911
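     /*
      * Typical invocations (device and file names illustrative):
      *
      *     blkparse -i sda -o sda.parsed -s
      *     blktrace -d /dev/sda -o - | blkparse -i -
      *
      * The first parses stored sda.blktrace.N files, the second consumes
      * a live trace over a pipe.
      */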
1912 int main(int argc, char *argv[])
1913 {
1914         char *ofp_buffer;
1915         int i, c, ret, mode;
1916         int act_mask_tmp = 0;
1917
1918         while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) {
1919                 switch (c) {
1920                 case 'a':
1921                         i = find_mask_map(optarg);
1922                         if (i < 0) {
1923                                 fprintf(stderr,"Invalid action mask %s\n",
1924                                         optarg);
1925                                 return 1;
1926                         }
1927                         act_mask_tmp |= i;
1928                         break;
1929
1930                 case 'A':
1931                         if ((sscanf(optarg, "%x", &i) != 1) ||
1932                             !valid_act_opt(i)) {
1933                                 fprintf(stderr,
1934                                         "Invalid set action mask %s/0x%x\n",
1935                                         optarg, i);
1936                                 return 1;
1937                         }
1938                         act_mask_tmp = i;
1939                         break;
1940                 case 'i':
1941                         if (!strcmp(optarg, "-") && !pipeline)
1942                                 pipeline = 1;
1943                         else if (resize_devices(optarg) != 0)
1944                                 return 1;
1945                         break;
1946                 case 'D':
1947                         input_dir = optarg;
1948                         break;
1949                 case 'o':
1950                         output_name = optarg;
1951                         break;
1952                 case 'b':
1953                         rb_batch = atoi(optarg);
1954                         if (rb_batch <= 0)
1955                                 rb_batch = RB_BATCH_DEFAULT;
1956                         break;
1957                 case 's':
1958                         per_process_stats = 1;
1959                         break;
1960                 case 't':
1961                         track_ios = 1;
1962                         break;
1963                 case 'q':
1964                         per_device_and_cpu_stats = 0;
1965                         break;
1966                 case 'w':
1967                         if (find_stopwatch_interval(optarg) != 0)
1968                                 return 1;
1969                         break;
1970                 case 'f':
1971                         set_all_format_specs(optarg);
1972                         break;
1973                 case 'F':
1974                         if (add_format_spec(optarg) != 0)
1975                                 return 1;
1976                         break;
1977                 case 'h':
1978                         ppi_hash_by_pid = 0;
1979                         break;
1980                 case 'v':
1981                         verbose++;
1982                         break;
1983                 case 'V':
1984                         printf("%s version %s\n", argv[0], blkparse_version);
1985                         return 0;
1986                 default:
1987                         usage(argv[0]);
1988                         return 1;
1989                 }
1990         }
1991
1992         while (optind < argc) {
1993                 if (!strcmp(argv[optind], "-") && !pipeline)
1994                         pipeline = 1;
1995                 else if (resize_devices(argv[optind]) != 0)
1996                         return 1;
1997                 optind++;
1998         }
1999
2000         if (!pipeline && !ndevices) {
2001                 usage(argv[0]);
2002                 return 1;
2003         }
2004
2005         if (act_mask_tmp != 0)
2006                 act_mask = act_mask_tmp;
2007
2008         memset(&rb_sort_root, 0, sizeof(rb_sort_root));
2009
2010         signal(SIGINT, handle_sigint);
2011         signal(SIGHUP, handle_sigint);
2012         signal(SIGTERM, handle_sigint);
2013
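             /*
              * en_US LC_NUMERIC gives thousands separators in the
              * formatted event counts printed by the stats code.
              */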
2014         setlocale(LC_NUMERIC, "en_US");
2015
2016         if (!output_name) {
2017                 ofp = fdopen(STDOUT_FILENO, "w");
2018                 mode = _IOLBF;
2019         } else {
2020                 char ofname[128];
2021
2022                 snprintf(ofname, sizeof(ofname) - 1, "%s", output_name);
2023                 ofp = fopen(ofname, "w");
2024                 mode = _IOFBF;
2025         }
2026
2027         if (!ofp) {
2028                 perror("fopen");
2029                 return 1;
2030         }
2031
2032         ofp_buffer = malloc(4096);
             if (!ofp_buffer) {
                     perror("malloc");
                     return 1;
             }
2033         if (setvbuf(ofp, ofp_buffer, mode, 4096)) {
2034                 perror("setvbuf");
2035                 return 1;
2036         }
2037
2038         if (pipeline)
2039                 ret = do_stdin();
2040         else
2041                 ret = do_file();
2042
2043         show_stats();
2044         free(ofp_buffer);
2045         return ret;
2046 }