blkparse: split off the timestamp correction code in to a separate function
[blktrace.git] / blkparse.c
1 /*
2  * block queue tracing parse application
3  *
4  * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
5  * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
6  *
7  *  This program is free software; you can redistribute it and/or modify
8  *  it under the terms of the GNU General Public License as published by
9  *  the Free Software Foundation; either version 2 of the License, or
10  *  (at your option) any later version.
11  *
12  *  This program is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *  GNU General Public License for more details.
16  *
17  *  You should have received a copy of the GNU General Public License
18  *  along with this program; if not, write to the Free Software
19  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
20  *
21  */
22 #include <sys/types.h>
23 #include <sys/stat.h>
24 #include <unistd.h>
25 #include <stdio.h>
26 #include <fcntl.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <getopt.h>
30 #include <errno.h>
31 #include <signal.h>
32 #include <locale.h>
33 #include <libgen.h>
34
35 #include "blktrace.h"
36 #include "rbtree.h"
37 #include "jhash.h"
38
static char blkparse_version[] = "1.2.0";

/*
 * One contiguous range of missing trace sequence numbers (endpoints
 * inclusive), kept in a per-CPU doubly linked list.
 */
struct skip_info {
	unsigned long start, end;
	struct skip_info *prev, *next;
};
45
/*
 * Per-device state: event counts, aggregate I/O statistics, time
 * bookkeeping, sequence-skip accounting, and the per-CPU sub-state.
 */
struct per_dev_info {
	dev_t dev;
	char *name;

	int backwards;		/* NOTE(review): presumably flags time going backwards — confirm in parser */
	unsigned long long events;
	unsigned long long first_reported_time;
	unsigned long long last_reported_time;
	unsigned long long last_read_time;
	struct io_stats io_stats;
	unsigned long skips;		/* number of skip ranges, see collect_pdi_skips() */
	unsigned long long seq_skips;	/* total sequence numbers lost */
	unsigned int max_depth[2];
	unsigned int cur_depth[2];

	struct rb_root rb_track;	/* in-flight ios, keyed by start sector */

	int nfiles;
	int ncpus;

	unsigned long *cpu_map;		/* online-CPU bitmap, see cpu_mark_online() */
	unsigned int cpu_map_max;	/* bitmap capacity in CPUs, multiple of CPUS_PER_LONG */

	struct per_cpu_info *cpus;
};
71
/*
 * some duplicated effort here, we can unify this hash and the ppi hash later
 */
struct process_pid_map {
	pid_t pid;
	char comm[16];		/* command name, always NUL terminated (see add_ppm_hash) */
	struct process_pid_map *hash_next, *list_next;
};

/* pid -> command-name hash table */
#define PPM_HASH_SHIFT  (8)
#define PPM_HASH_SIZE   (1 << PPM_HASH_SHIFT)
#define PPM_HASH_MASK   (PPM_HASH_SIZE - 1)
static struct process_pid_map *ppm_hash_table[PPM_HASH_SIZE];
85
/*
 * Per-process accounting, hashed by pid or by command name depending
 * on ppi_hash_by_pid (see ppi_hash()).
 */
struct per_process_info {
	struct process_pid_map *ppm;
	struct io_stats io_stats;
	struct per_process_info *hash_next, *list_next;
	int more_than_one;	/* several pids share this comm when hashing by name */

	/*
	 * individual io stats
	 */
	unsigned long long longest_allocation_wait[2];
	unsigned long long longest_dispatch_wait[2];
	unsigned long long longest_completion_wait[2];
};

#define PPI_HASH_SHIFT  (8)
#define PPI_HASH_SIZE   (1 << PPI_HASH_SHIFT)
#define PPI_HASH_MASK   (PPI_HASH_SIZE - 1)
103
/* Sort keys for the per-program statistics report (-S option). */
enum {
	SORT_PROG_EVENT_N,   /* Program Name */
	SORT_PROG_EVENT_QKB, /* KB: Queued read and write */
	SORT_PROG_EVENT_RKB, /* KB: Queued Read */
	SORT_PROG_EVENT_WKB, /* KB: Queued Write */
	SORT_PROG_EVENT_CKB, /* KB: Complete */
	SORT_PROG_EVENT_QIO, /* IO: Queued read and write */
	SORT_PROG_EVENT_RIO, /* IO: Queued Read */
	SORT_PROG_EVENT_WIO, /* IO: Queued Write */
	SORT_PROG_EVENT_CIO, /* IO: Complete */
};

/* per_process_info hash table plus a flat list for the final report */
static struct per_process_info *ppi_hash_table[PPI_HASH_SIZE];
static struct per_process_info *ppi_list;
static int ppi_list_entries;
119
/*
 * Long command line options; each maps to the matching short option
 * letter handled by the getopt loop.  Fields are the standard
 * struct option order: name, has_arg, flag, val.
 */
static struct option l_opts[] = {
	{ "act-mask",		required_argument,	NULL,	'a' },
	{ "set-mask",		required_argument,	NULL,	'A' },
	{ "batch",		required_argument,	NULL,	'b' },
	{ "input-directory",	required_argument,	NULL,	'D' },
	{ "dump-binary",	required_argument,	NULL,	'd' },
	{ "format",		required_argument,	NULL,	'f' },
	{ "format-spec",	required_argument,	NULL,	'F' },
	{ "hash-by-name",	no_argument,		NULL,	'h' },
	{ "input",		required_argument,	NULL,	'i' },
	{ "no-msgs",		no_argument,		NULL,	'M' },
	{ "output",		required_argument,	NULL,	'o' },
	{ "no-text-output",	no_argument,		NULL,	'O' },
	{ "quiet",		no_argument,		NULL,	'q' },
	{ "per-program-stats",	no_argument,		NULL,	's' },
	{ "sort-program-stats",	required_argument,	NULL,	'S' },
	{ "track-ios",		no_argument,		NULL,	't' },
	{ "stopwatch",		required_argument,	NULL,	'w' },
	{ "verbose",		no_argument,		NULL,	'v' },
	{ "version",		no_argument,		NULL,	'V' },
	{ NULL, 0, NULL, 0 }
};
239
/*
 * for sorting the displayed output
 */
struct trace {
	struct blk_io_trace *bit;
	struct rb_node rb_node;
	struct trace *next;		/* free-list link for the t_alloc cache */
	unsigned long read_sequence;
};

/* global tree of pending traces, ordered by (time, device, sequence) */
static struct rb_root rb_sort_root;
static unsigned long rb_sort_entries;

static struct trace *trace_list;

/*
 * allocation cache
 */
static struct blk_io_trace *bit_alloc_list;
static struct trace *t_alloc_list;
260
/*
 * for tracking individual ios
 */
struct io_track {
	struct rb_node rb_node;		/* lives in per_dev_info.rb_track, keyed by sector */

	struct process_pid_map *ppm;
	__u64 sector;
	/* per-stage timestamps used to compute allocation/dispatch/completion latencies */
	unsigned long long allocation_time;
	unsigned long long queue_time;
	unsigned long long dispatch_time;
	unsigned long long completion_time;
};
274
static int ndevices;
static struct per_dev_info *devices;
static char *get_dev_name(struct per_dev_info *, char *, int);
static int trace_rb_insert_last(struct per_dev_info *, struct trace *);

FILE *ofp = NULL;		/* text output stream */
static char *output_name;
static char *input_dir;

static unsigned long long genesis_time;
static unsigned long long last_allowed_time;
static unsigned long long stopwatch_start;	/* start from zero by default */
static unsigned long long stopwatch_end = -1ULL;	/* "infinity" */
static unsigned long read_sequence;

static int per_process_stats;
static int per_process_stats_event = SORT_PROG_EVENT_N;
static int per_device_and_cpu_stats = 1;
static int track_ios;			/* -t: per-io latency tracking enabled */
static int ppi_hash_by_pid = 1;		/* presumably cleared by --hash-by-name; verify in main */
static int verbose;
static unsigned int act_mask = -1U;
static int stats_printed;
static int bin_output_msgs = 1;
int data_is_native = -1;		/* trace byte order matches this host */

static FILE *dump_fp;			/* -d: binary dump stream */
static char *dump_binary;

/* current occupancy of the t_alloc / bit_alloc free-list caches */
static unsigned int t_alloc_cache;
static unsigned int bit_alloc_cache;

#define RB_BATCH_DEFAULT        (512)
static unsigned int rb_batch = RB_BATCH_DEFAULT;

static int pipeline;
static char *pipename;

static int text_output = 1;

/* 'done' is set asynchronously; is_done() forces a real read each time */
#define is_done()       (*(volatile int *)(&done))
static volatile int done;

struct timespec         abs_start_time;	/* wall clock anchor, set by BLK_TN_TIMESTAMP */
static unsigned long long start_timestamp;

static int have_drv_data = 0;

#define JHASH_RANDOM    (0x3af5f2ee)

/* helpers for the per-device online-CPU bitmap (per_dev_info.cpu_map) */
#define CPUS_PER_LONG   (8 * sizeof(unsigned long))
#define CPU_IDX(cpu)    ((cpu) / CPUS_PER_LONG)
#define CPU_BIT(cpu)    ((cpu) & (CPUS_PER_LONG - 1))
328
329 static void output_binary(void *buf, int len)
330 {
331         if (dump_binary) {
332                 size_t n = fwrite(buf, len, 1, dump_fp);
333                 if (n != 1) {
334                         perror(dump_binary);
335                         fclose(dump_fp);
336                         dump_binary = NULL;
337                 }
338         }
339 }
340
/*
 * Grow pdi->cpus so it has a slot for 'cpu' (and every CPU below it).
 * Newly added slots are zero-filled, then given their "unused"
 * sentinel values.  Exits on allocation failure.
 */
static void resize_cpu_info(struct per_dev_info *pdi, int cpu)
{
	struct per_cpu_info *cpus = pdi->cpus;
	int ncpus = pdi->ncpus;
	int new_count = cpu + 1;
	int new_space, size;
	char *new_start;

	size = new_count * sizeof(struct per_cpu_info);
	cpus = realloc(cpus, size);
	if (!cpus) {
		char name[20];
		fprintf(stderr, "Out of memory, CPU info for device %s (%d)\n",
			get_dev_name(pdi, name, sizeof(name)), size);
		exit(1);
	}

	/* zero only the tail that realloc just added */
	new_start = (char *)cpus + (ncpus * sizeof(struct per_cpu_info));
	new_space = (new_count - ncpus) * sizeof(struct per_cpu_info);
	memset(new_start, 0, new_space);

	pdi->ncpus = new_count;
	pdi->cpus = cpus;

	/*
	 * give fresh slots their sentinels; fd == 0 identifies a slot
	 * that was never set up (assumes no pci is ever given fd 0 —
	 * appears to hold for all callers, verify if that changes)
	 */
	for (new_count = 0; new_count < pdi->ncpus; new_count++) {
		struct per_cpu_info *pci = &pdi->cpus[new_count];

		if (!pci->fd) {
			pci->fd = -1;
			memset(&pci->rb_last, 0, sizeof(pci->rb_last));
			pci->rb_last_entries = 0;
			pci->last_sequence = -1;
		}
	}
}
376
377 static struct per_cpu_info *get_cpu_info(struct per_dev_info *pdi, int cpu)
378 {
379         struct per_cpu_info *pci;
380
381         if (cpu >= pdi->ncpus)
382                 resize_cpu_info(pdi, cpu);
383
384         pci = &pdi->cpus[cpu];
385         pci->cpu = cpu;
386         return pci;
387 }
388
389
390 static int resize_devices(char *name)
391 {
392         int size = (ndevices + 1) * sizeof(struct per_dev_info);
393
394         devices = realloc(devices, size);
395         if (!devices) {
396                 fprintf(stderr, "Out of memory, device %s (%d)\n", name, size);
397                 return 1;
398         }
399         memset(&devices[ndevices], 0, sizeof(struct per_dev_info));
400         devices[ndevices].name = name;
401         ndevices++;
402         return 0;
403 }
404
/*
 * Look up (or create) the per-device state for 'dev'.  A slot created
 * from a name only (dev still 0, see resize_devices) is claimed by
 * the first device number that comes looking.  Returns NULL on OOM.
 */
static struct per_dev_info *get_dev_info(dev_t dev)
{
	struct per_dev_info *pdi;
	int i;

	for (i = 0; i < ndevices; i++) {
		/* claim a name-only slot that has no dev number yet */
		if (!devices[i].dev)
			devices[i].dev = dev;
		if (devices[i].dev == dev)
			return &devices[i];
	}

	if (resize_devices(NULL))
		return NULL;

	pdi = &devices[ndevices - 1];
	pdi->dev = dev;
	pdi->first_reported_time = 0;
	pdi->last_read_time = 0;

	return pdi;
}
427
428 static void insert_skip(struct per_cpu_info *pci, unsigned long start,
429                         unsigned long end)
430 {
431         struct skip_info *sip;
432
433         for (sip = pci->skips_tail; sip != NULL; sip = sip->prev) {
434                 if (end == (sip->start - 1)) {
435                         sip->start = start;
436                         return;
437                 } else if (start == (sip->end + 1)) {
438                         sip->end = end;
439                         return;
440                 }
441         }
442
443         sip = malloc(sizeof(struct skip_info));
444         sip->start = start;
445         sip->end = end;
446         sip->prev = sip->next = NULL;
447         if (pci->skips_tail == NULL)
448                 pci->skips_head = pci->skips_tail = sip;
449         else {
450                 sip->prev = pci->skips_tail;
451                 pci->skips_tail->next = sip;
452                 pci->skips_tail = sip;
453         }
454 }
455
456 static void remove_sip(struct per_cpu_info *pci, struct skip_info *sip)
457 {
458         if (sip->prev == NULL) {
459                 if (sip->next == NULL)
460                         pci->skips_head = pci->skips_tail = NULL;
461                 else {
462                         pci->skips_head = sip->next;
463                         sip->next->prev = NULL;
464                 }
465         } else if (sip->next == NULL) {
466                 pci->skips_tail = sip->prev;
467                 sip->prev->next = NULL;
468         } else {
469                 sip->prev->next = sip->next;
470                 sip->next->prev = sip->prev;
471         }
472
473         sip->prev = sip->next = NULL;
474         free(sip);
475 }
476
477 #define IN_SKIP(sip,seq) (((sip)->start <= (seq)) && ((seq) <= sip->end))
478 static int check_current_skips(struct per_cpu_info *pci, unsigned long seq)
479 {
480         struct skip_info *sip;
481
482         for (sip = pci->skips_tail; sip != NULL; sip = sip->prev) {
483                 if (IN_SKIP(sip, seq)) {
484                         if (sip->start == seq) {
485                                 if (sip->end == seq)
486                                         remove_sip(pci, sip);
487                                 else
488                                         sip->start += 1;
489                         } else if (sip->end == seq)
490                                 sip->end -= 1;
491                         else {
492                                 sip->end = seq - 1;
493                                 insert_skip(pci, seq + 1, sip->end);
494                         }
495                         return 1;
496                 }
497         }
498
499         return 0;
500 }
501
/*
 * Walk every CPU's skip list for this device and accumulate both the
 * number of skip ranges (skips) and the total count of missing
 * sequence numbers (seq_skips); print each range when verbose.
 */
static void collect_pdi_skips(struct per_dev_info *pdi)
{
	struct skip_info *sip;
	int cpu;

	pdi->skips = 0;
	pdi->seq_skips = 0;

	for (cpu = 0; cpu < pdi->ncpus; cpu++) {
		struct per_cpu_info *pci = &pdi->cpus[cpu];

		for (sip = pci->skips_head; sip != NULL; sip = sip->next) {
			pdi->skips++;
			pdi->seq_skips += (sip->end - sip->start + 1);
			if (verbose)
				fprintf(stderr,"(%d,%d): skipping %lu -> %lu\n",
					MAJOR(pdi->dev), MINOR(pdi->dev),
					sip->start, sip->end);
		}
	}
}
523
524 static void cpu_mark_online(struct per_dev_info *pdi, unsigned int cpu)
525 {
526         if (cpu >= pdi->cpu_map_max || !pdi->cpu_map) {
527                 int new_max = (cpu + CPUS_PER_LONG) & ~(CPUS_PER_LONG - 1);
528                 unsigned long *map = malloc(new_max / sizeof(long));
529
530                 memset(map, 0, new_max / sizeof(long));
531
532                 if (pdi->cpu_map) {
533                         memcpy(map, pdi->cpu_map, pdi->cpu_map_max / sizeof(long));
534                         free(pdi->cpu_map);
535                 }
536
537                 pdi->cpu_map = map;
538                 pdi->cpu_map_max = new_max;
539         }
540
541         pdi->cpu_map[CPU_IDX(cpu)] |= (1UL << CPU_BIT(cpu));
542 }
543
544 static inline void cpu_mark_offline(struct per_dev_info *pdi, int cpu)
545 {
546         pdi->cpu_map[CPU_IDX(cpu)] &= ~(1UL << CPU_BIT(cpu));
547 }
548
549 static inline int cpu_is_online(struct per_dev_info *pdi, int cpu)
550 {
551         return (pdi->cpu_map[CPU_IDX(cpu)] & (1UL << CPU_BIT(cpu))) != 0;
552 }
553
554 static inline int ppm_hash_pid(pid_t pid)
555 {
556         return jhash_1word(pid, JHASH_RANDOM) & PPM_HASH_MASK;
557 }
558
559 static struct process_pid_map *find_ppm(pid_t pid)
560 {
561         const int hash_idx = ppm_hash_pid(pid);
562         struct process_pid_map *ppm;
563
564         ppm = ppm_hash_table[hash_idx];
565         while (ppm) {
566                 if (ppm->pid == pid)
567                         return ppm;
568
569                 ppm = ppm->hash_next;
570         }
571
572         return NULL;
573 }
574
575 static struct process_pid_map *add_ppm_hash(pid_t pid, const char *name)
576 {
577         const int hash_idx = ppm_hash_pid(pid);
578         struct process_pid_map *ppm;
579
580         ppm = find_ppm(pid);
581         if (!ppm) {
582                 ppm = malloc(sizeof(*ppm));
583                 memset(ppm, 0, sizeof(*ppm));
584                 ppm->pid = pid;
585                 memset(ppm->comm, 0, sizeof(ppm->comm));
586                 strncpy(ppm->comm, name, sizeof(ppm->comm));
587                 ppm->comm[sizeof(ppm->comm) - 1] = '\0';
588                 ppm->hash_next = ppm_hash_table[hash_idx];
589                 ppm_hash_table[hash_idx] = ppm;
590         }
591
592         return ppm;
593 }
594
/*
 * Handle a notify (non-IO) trace event: process-name registration,
 * wall-clock timestamp anchoring, or an embedded text message.  The
 * variable-length payload follows the fixed blk_io_trace header.
 */
static void handle_notify(struct blk_io_trace *bit)
{
	void	*payload = (caddr_t) bit + sizeof(*bit);
	__u32	two32[2];

	switch (bit->action) {
	case BLK_TN_PROCESS:
		/* payload is the command name for bit->pid */
		add_ppm_hash(bit->pid, payload);
		break;

	case BLK_TN_TIMESTAMP:
		/* payload is two 32-bit words: wall-clock sec, nsec */
		if (bit->pdu_len != sizeof(two32))
			return;
		memcpy(two32, payload, sizeof(two32));
		if (!data_is_native) {
			two32[0] = be32_to_cpu(two32[0]);
			two32[1] = be32_to_cpu(two32[1]);
		}
		/* anchor trace time bit->time against absolute wall clock */
		start_timestamp = bit->time;
		abs_start_time.tv_sec  = two32[0];
		abs_start_time.tv_nsec = two32[1];
		if (abs_start_time.tv_nsec < 0) {
			/* normalize: borrow a second for a negative nsec */
			abs_start_time.tv_sec--;
			abs_start_time.tv_nsec += 1000000000;
		}

		break;

	case BLK_TN_MESSAGE:
		if (bit->pdu_len > 0) {
			char msg[bit->pdu_len+1];

			memcpy(msg, (char *)payload, bit->pdu_len);
			msg[bit->pdu_len] = '\0';

			/* emit in the same column layout as regular trace lines */
			fprintf(ofp,
				"%3d,%-3d %2d %8s %5d.%09lu %5u %2s %3s %s\n",
				MAJOR(bit->device), MINOR(bit->device),
				bit->cpu, "0", (int) SECONDS(bit->time),
				(unsigned long) NANO_SECONDS(bit->time),
				0, "m", "N", msg);
		}
		break;

	default:
		/* Ignore unknown notify events */
		;
	}
}
644
645 char *find_process_name(pid_t pid)
646 {
647         struct process_pid_map *ppm = find_ppm(pid);
648
649         if (ppm)
650                 return ppm->comm;
651
652         return NULL;
653 }
654
/* Hash a pid into a per_process_info table bucket index. */
static inline int ppi_hash_pid(pid_t pid)
{
	return jhash_1word(pid, JHASH_RANDOM) & PPI_HASH_MASK;
}

/* Hash a command name (fixed 16 bytes, the size of comm) likewise. */
static inline int ppi_hash_name(const char *name)
{
	return jhash(name, 16, JHASH_RANDOM) & PPI_HASH_MASK;
}

/*
 * Hash a ppi by pid or by command name, matching the lookup mode
 * selected via ppi_hash_by_pid.
 */
static inline int ppi_hash(struct per_process_info *ppi)
{
	struct process_pid_map *ppm = ppi->ppm;

	if (ppi_hash_by_pid)
		return ppi_hash_pid(ppm->pid);

	return ppi_hash_name(ppm->comm);
}
674
675 static inline void add_ppi_to_hash(struct per_process_info *ppi)
676 {
677         const int hash_idx = ppi_hash(ppi);
678
679         ppi->hash_next = ppi_hash_table[hash_idx];
680         ppi_hash_table[hash_idx] = ppi;
681 }
682
683 static inline void add_ppi_to_list(struct per_process_info *ppi)
684 {
685         ppi->list_next = ppi_list;
686         ppi_list = ppi;
687         ppi_list_entries++;
688 }
689
690 static struct per_process_info *find_ppi_by_name(char *name)
691 {
692         const int hash_idx = ppi_hash_name(name);
693         struct per_process_info *ppi;
694
695         ppi = ppi_hash_table[hash_idx];
696         while (ppi) {
697                 struct process_pid_map *ppm = ppi->ppm;
698
699                 if (!strcmp(ppm->comm, name))
700                         return ppi;
701
702                 ppi = ppi->hash_next;
703         }
704
705         return NULL;
706 }
707
708 static struct per_process_info *find_ppi_by_pid(pid_t pid)
709 {
710         const int hash_idx = ppi_hash_pid(pid);
711         struct per_process_info *ppi;
712
713         ppi = ppi_hash_table[hash_idx];
714         while (ppi) {
715                 struct process_pid_map *ppm = ppi->ppm;
716
717                 if (ppm->pid == pid)
718                         return ppi;
719
720                 ppi = ppi->hash_next;
721         }
722
723         return NULL;
724 }
725
726 static struct per_process_info *find_ppi(pid_t pid)
727 {
728         struct per_process_info *ppi;
729         char *name;
730
731         if (ppi_hash_by_pid)
732                 return find_ppi_by_pid(pid);
733
734         name = find_process_name(pid);
735         if (!name)
736                 return NULL;
737
738         ppi = find_ppi_by_name(name);
739         if (ppi && ppi->ppm->pid != pid)
740                 ppi->more_than_one = 1;
741
742         return ppi;
743 }
744
745 /*
746  * struct trace and blktrace allocation cache, we do potentially
747  * millions of mallocs for these structures while only using at most
748  * a few thousand at the time
749  */
750 static inline void t_free(struct trace *t)
751 {
752         if (t_alloc_cache < 1024) {
753                 t->next = t_alloc_list;
754                 t_alloc_list = t;
755                 t_alloc_cache++;
756         } else
757                 free(t);
758 }
759
760 static inline struct trace *t_alloc(void)
761 {
762         struct trace *t = t_alloc_list;
763
764         if (t) {
765                 t_alloc_list = t->next;
766                 t_alloc_cache--;
767                 return t;
768         }
769
770         return malloc(sizeof(*t));
771 }
772
/*
 * Recycle a blk_io_trace into the allocation cache.  Only fixed-size
 * events (no trailing pdu payload) are cached, since a cached entry
 * may be reused for any event.
 */
static inline void bit_free(struct blk_io_trace *bit)
{
	if (bit_alloc_cache < 1024 && !bit->pdu_len) {
		/*
		 * abuse a 64-bit field for a next pointer for the free item
		 */
		bit->time = (__u64) (unsigned long) bit_alloc_list;
		bit_alloc_list = (struct blk_io_trace *) bit;
		bit_alloc_cache++;
	} else
		free(bit);
}
785
/*
 * Pop a cached blk_io_trace — its 'time' field held the next pointer,
 * see bit_free() — or fall back to malloc.
 */
static inline struct blk_io_trace *bit_alloc(void)
{
	struct blk_io_trace *bit = bit_alloc_list;

	if (bit) {
		bit_alloc_list = (struct blk_io_trace *) (unsigned long) \
				 bit->time;
		bit_alloc_cache--;
		return bit;
	}

	return malloc(sizeof(*bit));
}
799
800 static inline void __put_trace_last(struct per_dev_info *pdi, struct trace *t)
801 {
802         struct per_cpu_info *pci = get_cpu_info(pdi, t->bit->cpu);
803
804         rb_erase(&t->rb_node, &pci->rb_last);
805         pci->rb_last_entries--;
806
807         bit_free(t->bit);
808         t_free(t);
809 }
810
811 static void put_trace(struct per_dev_info *pdi, struct trace *t)
812 {
813         rb_erase(&t->rb_node, &rb_sort_root);
814         rb_sort_entries--;
815
816         trace_rb_insert_last(pdi, t);
817 }
818
/*
 * Insert 't' into 'root', ordered by (time, device, sequence).  Equal
 * keys descend right, so duplicates keep insertion order.  Always
 * succeeds; returns 0 (callers treat non-zero as failure).
 */
static inline int trace_rb_insert(struct trace *t, struct rb_root *root)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct trace *__t;

	while (*p) {
		parent = *p;

		__t = rb_entry(parent, struct trace, rb_node);

		if (t->bit->time < __t->bit->time)
			p = &(*p)->rb_left;
		else if (t->bit->time > __t->bit->time)
			p = &(*p)->rb_right;
		else if (t->bit->device < __t->bit->device)
			p = &(*p)->rb_left;
		else if (t->bit->device > __t->bit->device)
			p = &(*p)->rb_right;
		else if (t->bit->sequence < __t->bit->sequence)
			p = &(*p)->rb_left;
		else	/* >= sequence */
			p = &(*p)->rb_right;
	}

	rb_link_node(&t->rb_node, parent, p);
	rb_insert_color(&t->rb_node, root);
	return 0;
}
848
849 static inline int trace_rb_insert_sort(struct trace *t)
850 {
851         if (!trace_rb_insert(t, &rb_sort_root)) {
852                 rb_sort_entries++;
853                 return 0;
854         }
855
856         return 1;
857 }
858
/*
 * Park 't' in its CPU's rb_last tree (used to check whether a given
 * sequence number was already emitted).  If the tree has grown past
 * rb_batch * nfiles entries, retire the oldest entry to bound memory.
 * Returns 0 on success, 1 if the insert failed.
 */
static int trace_rb_insert_last(struct per_dev_info *pdi, struct trace *t)
{
	struct per_cpu_info *pci = get_cpu_info(pdi, t->bit->cpu);

	if (trace_rb_insert(t, &pci->rb_last))
		return 1;

	pci->rb_last_entries++;

	if (pci->rb_last_entries > rb_batch * pdi->nfiles) {
		struct rb_node *n = rb_first(&pci->rb_last);

		t = rb_entry(n, struct trace, rb_node);
		__put_trace_last(pdi, t);
	}

	return 0;
}
877
/*
 * Look up a trace by (device, sequence) in 'root'.  Note the tree is
 * ordered with time as the primary key, so this descent is only
 * approximate; with 'order' set, compensate by scanning a few entries
 * past where the descent ended (see the comment below).
 */
static struct trace *trace_rb_find(dev_t device, unsigned long sequence,
				   struct rb_root *root, int order)
{
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct trace *__t;

	while (n) {
		__t = rb_entry(n, struct trace, rb_node);
		prev = n;

		if (device < __t->bit->device)
			n = n->rb_left;
		else if (device > __t->bit->device)
			n = n->rb_right;
		else if (sequence < __t->bit->sequence)
			n = n->rb_left;
		else if (sequence > __t->bit->sequence)
			n = n->rb_right;
		else
			return __t;
	}

	/*
	 * hack - the list may not be sequence ordered because some
	 * events don't have sequence and time matched. so we end up
	 * being a little off in the rb lookup here, because we don't
	 * know the time we are looking for. compensate by browsing
	 * a little ahead from the last entry to find the match
	 */
	if (order && prev) {
		int max = 5;

		while (((n = rb_next(prev)) != NULL) && max--) {
			__t = rb_entry(n, struct trace, rb_node);

			if (__t->bit->device == device &&
			    __t->bit->sequence == sequence)
				return __t;

			prev = n;
		}
	}

	return NULL;
}
924
925 static inline struct trace *trace_rb_find_last(struct per_dev_info *pdi,
926                                                struct per_cpu_info *pci,
927                                                unsigned long seq)
928 {
929         return trace_rb_find(pdi->dev, seq, &pci->rb_last, 0);
930 }
931
/*
 * Insert an in-flight io into the device's tracking tree, keyed by
 * start sector.  Returns 1 (without inserting) on a sector alias,
 * i.e. two tracked ios starting at the same sector.
 */
static inline int track_rb_insert(struct per_dev_info *pdi,struct io_track *iot)
{
	struct rb_node **p = &pdi->rb_track.rb_node;
	struct rb_node *parent = NULL;
	struct io_track *__iot;

	while (*p) {
		parent = *p;
		__iot = rb_entry(parent, struct io_track, rb_node);

		if (iot->sector < __iot->sector)
			p = &(*p)->rb_left;
		else if (iot->sector > __iot->sector)
			p = &(*p)->rb_right;
		else {
			fprintf(stderr,
				"sector alias (%Lu) on device %d,%d!\n",
				(unsigned long long) iot->sector,
				MAJOR(pdi->dev), MINOR(pdi->dev));
			return 1;
		}
	}

	rb_link_node(&iot->rb_node, parent, p);
	rb_insert_color(&iot->rb_node, &pdi->rb_track);
	return 0;
}
959
960 static struct io_track *__find_track(struct per_dev_info *pdi, __u64 sector)
961 {
962         struct rb_node *n = pdi->rb_track.rb_node;
963         struct io_track *__iot;
964
965         while (n) {
966                 __iot = rb_entry(n, struct io_track, rb_node);
967
968                 if (sector < __iot->sector)
969                         n = n->rb_left;
970                 else if (sector > __iot->sector)
971                         n = n->rb_right;
972                 else
973                         return __iot;
974         }
975
976         return NULL;
977 }
978
979 static struct io_track *find_track(struct per_dev_info *pdi, pid_t pid,
980                                    __u64 sector)
981 {
982         struct io_track *iot;
983
984         iot = __find_track(pdi, sector);
985         if (!iot) {
986                 iot = malloc(sizeof(*iot));
987                 iot->ppm = find_ppm(pid);
988                 if (!iot->ppm)
989                         iot->ppm = add_ppm_hash(pid, "unknown");
990                 iot->sector = sector;
991                 track_rb_insert(pdi, iot);
992         }
993
994         return iot;
995 }
996
997 static void log_track_frontmerge(struct per_dev_info *pdi,
998                                  struct blk_io_trace *t)
999 {
1000         struct io_track *iot;
1001
1002         if (!track_ios)
1003                 return;
1004
1005         iot = __find_track(pdi, t->sector + t_sec(t));
1006         if (!iot) {
1007                 if (verbose)
1008                         fprintf(stderr, "merge not found for (%d,%d): %llu\n",
1009                                 MAJOR(pdi->dev), MINOR(pdi->dev),
1010                                 (unsigned long long) t->sector + t_sec(t));
1011                 return;
1012         }
1013
1014         rb_erase(&iot->rb_node, &pdi->rb_track);
1015         iot->sector -= t_sec(t);
1016         track_rb_insert(pdi, iot);
1017 }
1018
1019 static void log_track_getrq(struct per_dev_info *pdi, struct blk_io_trace *t)
1020 {
1021         struct io_track *iot;
1022
1023         if (!track_ios)
1024                 return;
1025
1026         iot = find_track(pdi, t->pid, t->sector);
1027         iot->allocation_time = t->time;
1028 }
1029
1030 static inline int is_remapper(struct per_dev_info *pdi)
1031 {
1032         int major = MAJOR(pdi->dev);
1033
1034         return (major == 253 || major == 9);
1035 }
1036
1037 /*
1038  * for md/dm setups, the interesting cycle is Q -> C. So track queueing
1039  * time here, as dispatch time
1040  */
1041 static void log_track_queue(struct per_dev_info *pdi, struct blk_io_trace *t)
1042 {
1043         struct io_track *iot;
1044
1045         if (!track_ios)
1046                 return;
1047         if (!is_remapper(pdi))
1048                 return;
1049
1050         iot = find_track(pdi, t->pid, t->sector);
1051         iot->dispatch_time = t->time;
1052 }
1053
1054 /*
1055  * return time between rq allocation and insertion
1056  */
1057 static unsigned long long log_track_insert(struct per_dev_info *pdi,
1058                                            struct blk_io_trace *t)
1059 {
1060         unsigned long long elapsed;
1061         struct io_track *iot;
1062
1063         if (!track_ios)
1064                 return -1;
1065
1066         iot = find_track(pdi, t->pid, t->sector);
1067         iot->queue_time = t->time;
1068
1069         if (!iot->allocation_time)
1070                 return -1;
1071
1072         elapsed = iot->queue_time - iot->allocation_time;
1073
1074         if (per_process_stats) {
1075                 struct per_process_info *ppi = find_ppi(iot->ppm->pid);
1076                 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
1077
1078                 if (ppi && elapsed > ppi->longest_allocation_wait[w])
1079                         ppi->longest_allocation_wait[w] = elapsed;
1080         }
1081
1082         return elapsed;
1083 }
1084
1085 /*
1086  * return time between queue and issue
1087  */
1088 static unsigned long long log_track_issue(struct per_dev_info *pdi,
1089                                           struct blk_io_trace *t)
1090 {
1091         unsigned long long elapsed;
1092         struct io_track *iot;
1093
1094         if (!track_ios)
1095                 return -1;
1096         if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
1097                 return -1;
1098
1099         iot = __find_track(pdi, t->sector);
1100         if (!iot) {
1101                 if (verbose)
1102                         fprintf(stderr, "issue not found for (%d,%d): %llu\n",
1103                                 MAJOR(pdi->dev), MINOR(pdi->dev),
1104                                 (unsigned long long) t->sector);
1105                 return -1;
1106         }
1107
1108         iot->dispatch_time = t->time;
1109         elapsed = iot->dispatch_time - iot->queue_time;
1110
1111         if (per_process_stats) {
1112                 struct per_process_info *ppi = find_ppi(iot->ppm->pid);
1113                 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
1114
1115                 if (ppi && elapsed > ppi->longest_dispatch_wait[w])
1116                         ppi->longest_dispatch_wait[w] = elapsed;
1117         }
1118
1119         return elapsed;
1120 }
1121
1122 /*
1123  * return time between dispatch and complete
1124  */
1125 static unsigned long long log_track_complete(struct per_dev_info *pdi,
1126                                              struct blk_io_trace *t)
1127 {
1128         unsigned long long elapsed;
1129         struct io_track *iot;
1130
1131         if (!track_ios)
1132                 return -1;
1133
1134         iot = __find_track(pdi, t->sector);
1135         if (!iot) {
1136                 if (verbose)
1137                         fprintf(stderr,"complete not found for (%d,%d): %llu\n",
1138                                 MAJOR(pdi->dev), MINOR(pdi->dev),
1139                                 (unsigned long long) t->sector);
1140                 return -1;
1141         }
1142
1143         iot->completion_time = t->time;
1144         elapsed = iot->completion_time - iot->dispatch_time;
1145
1146         if (per_process_stats) {
1147                 struct per_process_info *ppi = find_ppi(iot->ppm->pid);
1148                 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
1149
1150                 if (ppi && elapsed > ppi->longest_completion_wait[w])
1151                         ppi->longest_completion_wait[w] = elapsed;
1152         }
1153
1154         /*
1155          * kill the trace, we don't need it after completion
1156          */
1157         rb_erase(&iot->rb_node, &pdi->rb_track);
1158         free(iot);
1159
1160         return elapsed;
1161 }
1162
1163
1164 static struct io_stats *find_process_io_stats(pid_t pid)
1165 {
1166         struct per_process_info *ppi = find_ppi(pid);
1167
1168         if (!ppi) {
1169                 ppi = malloc(sizeof(*ppi));
1170                 memset(ppi, 0, sizeof(*ppi));
1171                 ppi->ppm = find_ppm(pid);
1172                 if (!ppi->ppm)
1173                         ppi->ppm = add_ppm_hash(pid, "unknown");
1174                 add_ppi_to_hash(ppi);
1175                 add_ppi_to_list(ppi);
1176         }
1177
1178         return &ppi->io_stats;
1179 }
1180
1181 static char *get_dev_name(struct per_dev_info *pdi, char *buffer, int size)
1182 {
1183         if (pdi->name)
1184                 snprintf(buffer, size, "%s", pdi->name);
1185         else
1186                 snprintf(buffer, size, "%d,%d",MAJOR(pdi->dev),MINOR(pdi->dev));
1187         return buffer;
1188 }
1189
1190 static void check_time(struct per_dev_info *pdi, struct blk_io_trace *bit)
1191 {
1192         unsigned long long this = bit->time;
1193         unsigned long long last = pdi->last_reported_time;
1194
1195         pdi->backwards = (this < last) ? 'B' : ' ';
1196         pdi->last_reported_time = this;
1197 }
1198
1199 static inline void __account_m(struct io_stats *ios, struct blk_io_trace *t,
1200                                int rw)
1201 {
1202         if (rw) {
1203                 ios->mwrites++;
1204                 ios->mwrite_kb += t_kb(t);
1205                 ios->mwrite_b += t_b(t);
1206         } else {
1207                 ios->mreads++;
1208                 ios->mread_kb += t_kb(t);
1209                 ios->mread_b += t_b(t);
1210         }
1211 }
1212
1213 static inline void account_m(struct blk_io_trace *t, struct per_cpu_info *pci,
1214                              int rw)
1215 {
1216         __account_m(&pci->io_stats, t, rw);
1217
1218         if (per_process_stats) {
1219                 struct io_stats *ios = find_process_io_stats(t->pid);
1220
1221                 __account_m(ios, t, rw);
1222         }
1223 }
1224
1225 static inline void __account_pc_queue(struct io_stats *ios,
1226                                       struct blk_io_trace *t, int rw)
1227 {
1228         if (rw) {
1229                 ios->qwrites_pc++;
1230                 ios->qwrite_kb_pc += t_kb(t);
1231                 ios->qwrite_b_pc += t_b(t);
1232         } else {
1233                 ios->qreads_pc++;
1234                 ios->qread_kb += t_kb(t);
1235                 ios->qread_b_pc += t_b(t);
1236         }
1237 }
1238
1239 static inline void account_pc_queue(struct blk_io_trace *t,
1240                                     struct per_cpu_info *pci, int rw)
1241 {
1242         __account_pc_queue(&pci->io_stats, t, rw);
1243
1244         if (per_process_stats) {
1245                 struct io_stats *ios = find_process_io_stats(t->pid);
1246
1247                 __account_pc_queue(ios, t, rw);
1248         }
1249 }
1250
1251 static inline void __account_pc_issue(struct io_stats *ios, int rw,
1252                                       unsigned int bytes)
1253 {
1254         if (rw) {
1255                 ios->iwrites_pc++;
1256                 ios->iwrite_kb_pc += bytes >> 10;
1257                 ios->iwrite_b_pc += bytes & 1023;
1258         } else {
1259                 ios->ireads_pc++;
1260                 ios->iread_kb_pc += bytes >> 10;
1261                 ios->iread_b_pc += bytes & 1023;
1262         }
1263 }
1264
1265 static inline void account_pc_issue(struct blk_io_trace *t,
1266                                     struct per_cpu_info *pci, int rw)
1267 {
1268         __account_pc_issue(&pci->io_stats, rw, t->bytes);
1269
1270         if (per_process_stats) {
1271                 struct io_stats *ios = find_process_io_stats(t->pid);
1272
1273                 __account_pc_issue(ios, rw, t->bytes);
1274         }
1275 }
1276
1277 static inline void __account_pc_requeue(struct io_stats *ios,
1278                                         struct blk_io_trace *t, int rw)
1279 {
1280         if (rw) {
1281                 ios->wrqueue_pc++;
1282                 ios->iwrite_kb_pc -= t_kb(t);
1283                 ios->iwrite_b_pc -= t_b(t);
1284         } else {
1285                 ios->rrqueue_pc++;
1286                 ios->iread_kb_pc -= t_kb(t);
1287                 ios->iread_b_pc -= t_b(t);
1288         }
1289 }
1290
1291 static inline void account_pc_requeue(struct blk_io_trace *t,
1292                                       struct per_cpu_info *pci, int rw)
1293 {
1294         __account_pc_requeue(&pci->io_stats, t, rw);
1295
1296         if (per_process_stats) {
1297                 struct io_stats *ios = find_process_io_stats(t->pid);
1298
1299                 __account_pc_requeue(ios, t, rw);
1300         }
1301 }
1302
1303 static inline void __account_pc_c(struct io_stats *ios, int rw)
1304 {
1305         if (rw)
1306                 ios->cwrites_pc++;
1307         else
1308                 ios->creads_pc++;
1309 }
1310
1311 static inline void account_pc_c(struct blk_io_trace *t,
1312                                 struct per_cpu_info *pci, int rw)
1313 {
1314         __account_pc_c(&pci->io_stats, rw);
1315
1316         if (per_process_stats) {
1317                 struct io_stats *ios = find_process_io_stats(t->pid);
1318
1319                 __account_pc_c(ios, rw);
1320         }
1321 }
1322
1323 static inline void __account_queue(struct io_stats *ios, struct blk_io_trace *t,
1324                                    int rw)
1325 {
1326         if (rw) {
1327                 ios->qwrites++;
1328                 ios->qwrite_kb += t_kb(t);
1329                 ios->qwrite_b += t_b(t);
1330         } else {
1331                 ios->qreads++;
1332                 ios->qread_kb += t_kb(t);
1333                 ios->qread_b += t_b(t);
1334         }
1335 }
1336
1337 static inline void account_queue(struct blk_io_trace *t,
1338                                  struct per_cpu_info *pci, int rw)
1339 {
1340         __account_queue(&pci->io_stats, t, rw);
1341
1342         if (per_process_stats) {
1343                 struct io_stats *ios = find_process_io_stats(t->pid);
1344
1345                 __account_queue(ios, t, rw);
1346         }
1347 }
1348
1349 static inline void __account_c(struct io_stats *ios, int rw, int bytes)
1350 {
1351         if (rw) {
1352                 ios->cwrites++;
1353                 ios->cwrite_kb += bytes >> 10;
1354                 ios->cwrite_b += bytes & 1023;
1355         } else {
1356                 ios->creads++;
1357                 ios->cread_kb += bytes >> 10;
1358                 ios->cread_b += bytes & 1023;
1359         }
1360 }
1361
1362 static inline void account_c(struct blk_io_trace *t, struct per_cpu_info *pci,
1363                              int rw, int bytes)
1364 {
1365         __account_c(&pci->io_stats, rw, bytes);
1366
1367         if (per_process_stats) {
1368                 struct io_stats *ios = find_process_io_stats(t->pid);
1369
1370                 __account_c(ios, rw, bytes);
1371         }
1372 }
1373
1374 static inline void __account_issue(struct io_stats *ios, int rw,
1375                                    unsigned int bytes)
1376 {
1377         if (rw) {
1378                 ios->iwrites++;
1379                 ios->iwrite_kb += bytes >> 10;
1380                 ios->iwrite_b  += bytes & 1023;
1381         } else {
1382                 ios->ireads++;
1383                 ios->iread_kb += bytes >> 10;
1384                 ios->iread_b  += bytes & 1023;
1385         }
1386 }
1387
1388 static inline void account_issue(struct blk_io_trace *t,
1389                                  struct per_cpu_info *pci, int rw)
1390 {
1391         __account_issue(&pci->io_stats, rw, t->bytes);
1392
1393         if (per_process_stats) {
1394                 struct io_stats *ios = find_process_io_stats(t->pid);
1395
1396                 __account_issue(ios, rw, t->bytes);
1397         }
1398 }
1399
1400 static inline void __account_unplug(struct io_stats *ios, int timer)
1401 {
1402         if (timer)
1403                 ios->timer_unplugs++;
1404         else
1405                 ios->io_unplugs++;
1406 }
1407
1408 static inline void account_unplug(struct blk_io_trace *t,
1409                                   struct per_cpu_info *pci, int timer)
1410 {
1411         __account_unplug(&pci->io_stats, timer);
1412
1413         if (per_process_stats) {
1414                 struct io_stats *ios = find_process_io_stats(t->pid);
1415
1416                 __account_unplug(ios, timer);
1417         }
1418 }
1419
1420 static inline void __account_requeue(struct io_stats *ios,
1421                                      struct blk_io_trace *t, int rw)
1422 {
1423         if (rw) {
1424                 ios->wrqueue++;
1425                 ios->iwrite_kb -= t_kb(t);
1426                 ios->iwrite_b -= t_b(t);
1427         } else {
1428                 ios->rrqueue++;
1429                 ios->iread_kb -= t_kb(t);
1430                 ios->iread_b -= t_b(t);
1431         }
1432 }
1433
1434 static inline void account_requeue(struct blk_io_trace *t,
1435                                    struct per_cpu_info *pci, int rw)
1436 {
1437         __account_requeue(&pci->io_stats, t, rw);
1438
1439         if (per_process_stats) {
1440                 struct io_stats *ios = find_process_io_stats(t->pid);
1441
1442                 __account_requeue(ios, t, rw);
1443         }
1444 }
1445
1446 static void log_complete(struct per_dev_info *pdi, struct per_cpu_info *pci,
1447                          struct blk_io_trace *t, char *act)
1448 {
1449         process_fmt(act, pci, t, log_track_complete(pdi, t), 0, NULL);
1450 }
1451
1452 static void log_insert(struct per_dev_info *pdi, struct per_cpu_info *pci,
1453                        struct blk_io_trace *t, char *act)
1454 {
1455         process_fmt(act, pci, t, log_track_insert(pdi, t), 0, NULL);
1456 }
1457
/* Q/R event: format with no latency value (-1). */
static void log_queue(struct per_cpu_info *pci, struct blk_io_trace *t,
		      char *act)
{
	process_fmt(act, pci, t, -1, 0, NULL);
}
1463
1464 static void log_issue(struct per_dev_info *pdi, struct per_cpu_info *pci,
1465                       struct blk_io_trace *t, char *act)
1466 {
1467         process_fmt(act, pci, t, log_track_issue(pdi, t), 0, NULL);
1468 }
1469
1470 static void log_merge(struct per_dev_info *pdi, struct per_cpu_info *pci,
1471                       struct blk_io_trace *t, char *act)
1472 {
1473         if (act[0] == 'F')
1474                 log_track_frontmerge(pdi, t);
1475
1476         process_fmt(act, pci, t, -1ULL, 0, NULL);
1477 }
1478
/* P event: format with no latency value. */
static void log_action(struct per_cpu_info *pci, struct blk_io_trace *t,
			char *act)
{
	process_fmt(act, pci, t, -1ULL, 0, NULL);
}
1484
/* Generic event (G/S/B/A/...): format with no latency value. */
static void log_generic(struct per_cpu_info *pci, struct blk_io_trace *t,
			char *act)
{
	process_fmt(act, pci, t, -1ULL, 0, NULL);
}
1490
/* U/UT event: format with no latency value. */
static void log_unplug(struct per_cpu_info *pci, struct blk_io_trace *t,
		      char *act)
{
	process_fmt(act, pci, t, -1ULL, 0, NULL);
}
1496
/* X event: format with no latency value. */
static void log_split(struct per_cpu_info *pci, struct blk_io_trace *t,
		      char *act)
{
	process_fmt(act, pci, t, -1ULL, 0, NULL);
}
1502
1503 static void log_pc(struct per_cpu_info *pci, struct blk_io_trace *t, char *act)
1504 {
1505         unsigned char *buf = (unsigned char *) t + sizeof(*t);
1506
1507         process_fmt(act, pci, t, -1ULL, t->pdu_len, buf);
1508 }
1509
/*
 * Handle one pc (SCSI passthrough) trace: emit its text line and update
 * per-cpu/per-process stats and the device queue depth counters.
 * @w is the data direction bit from the action mask.
 */
static void dump_trace_pc(struct blk_io_trace *t, struct per_dev_info *pdi,
			  struct per_cpu_info *pci)
{
	int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
	int act = t->action & 0xffff;

	switch (act) {
		case __BLK_TA_QUEUE:
			log_generic(pci, t, "Q");
			account_pc_queue(t, pci, w);
			break;
		case __BLK_TA_GETRQ:
			log_generic(pci, t, "G");
			break;
		case __BLK_TA_SLEEPRQ:
			log_generic(pci, t, "S");
			break;
		case __BLK_TA_REQUEUE:
			/*
			 * can happen if we miss traces, don't let it go
			 * below zero
			 */
			if (pdi->cur_depth[w])
				pdi->cur_depth[w]--;
			account_pc_requeue(t, pci, w);
			log_generic(pci, t, "R");
			break;
		case __BLK_TA_ISSUE:
			account_pc_issue(t, pci, w);
			/* issue grows the in-driver depth; remember the peak */
			pdi->cur_depth[w]++;
			if (pdi->cur_depth[w] > pdi->max_depth[w])
				pdi->max_depth[w] = pdi->cur_depth[w];
			log_pc(pci, t, "D");
			break;
		case __BLK_TA_COMPLETE:
			if (pdi->cur_depth[w])
				pdi->cur_depth[w]--;
			log_pc(pci, t, "C");
			account_pc_c(t, pci, w);
			break;
		case __BLK_TA_INSERT:
			log_pc(pci, t, "I");
			break;
		default:
			fprintf(stderr, "Bad pc action %x\n", act);
			break;
	}
}
1558
/*
 * Handle one fs trace: emit its text line and update latency tracking,
 * per-cpu/per-process stats and the device queue depth counters.
 * @w is the data direction bit from the action mask.
 */
static void dump_trace_fs(struct blk_io_trace *t, struct per_dev_info *pdi,
			  struct per_cpu_info *pci)
{
	int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
	int act = t->action & 0xffff;

	switch (act) {
		case __BLK_TA_QUEUE:
			log_track_queue(pdi, t);
			account_queue(t, pci, w);
			log_queue(pci, t, "Q");
			break;
		case __BLK_TA_INSERT:
			log_insert(pdi, pci, t, "I");
			break;
		case __BLK_TA_BACKMERGE:
			account_m(t, pci, w);
			log_merge(pdi, pci, t, "M");
			break;
		case __BLK_TA_FRONTMERGE:
			account_m(t, pci, w);
			log_merge(pdi, pci, t, "F");
			break;
		case __BLK_TA_GETRQ:
			log_track_getrq(pdi, t);
			log_generic(pci, t, "G");
			break;
		case __BLK_TA_SLEEPRQ:
			log_generic(pci, t, "S");
			break;
		case __BLK_TA_REQUEUE:
			/*
			 * can happen if we miss traces, don't let it go
			 * below zero
			 */
			if (pdi->cur_depth[w])
				pdi->cur_depth[w]--;
			account_requeue(t, pci, w);
			log_queue(pci, t, "R");
			break;
		case __BLK_TA_ISSUE:
			account_issue(t, pci, w);
			/* issue grows the in-driver depth; remember the peak */
			pdi->cur_depth[w]++;
			if (pdi->cur_depth[w] > pdi->max_depth[w])
				pdi->max_depth[w] = pdi->cur_depth[w];
			log_issue(pdi, pci, t, "D");
			break;
		case __BLK_TA_COMPLETE:
			if (pdi->cur_depth[w])
				pdi->cur_depth[w]--;
			account_c(t, pci, w, t->bytes);
			log_complete(pdi, pci, t, "C");
			break;
		case __BLK_TA_PLUG:
			log_action(pci, t, "P");
			break;
		case __BLK_TA_UNPLUG_IO:
			account_unplug(t, pci, 0);
			log_unplug(pci, t, "U");
			break;
		case __BLK_TA_UNPLUG_TIMER:
			account_unplug(t, pci, 1);
			log_unplug(pci, t, "UT");
			break;
		case __BLK_TA_SPLIT:
			log_split(pci, t, "X");
			break;
		case __BLK_TA_BOUNCE:
			log_generic(pci, t, "B");
			break;
		case __BLK_TA_REMAP:
			log_generic(pci, t, "A");
			break;
		case __BLK_TA_DRV_DATA:
			have_drv_data = 1;
			/* dump to binary file only */
			break;
		default:
			/* NOTE(review): prints the full action mask, unlike
			 * dump_trace_pc() which prints the masked act */
			fprintf(stderr, "Bad fs action %x\n", t->action);
			break;
	}
}
1641
/*
 * Top-level per-trace dispatch: render the trace as text (when text
 * output is enabled), bump per-device bookkeeping, and mirror the raw
 * record to the binary output file.
 */
static void dump_trace(struct blk_io_trace *t, struct per_cpu_info *pci,
		       struct per_dev_info *pdi)
{
	if (text_output) {
		if (t->action == BLK_TN_MESSAGE)
			handle_notify(t);
		else if (t->action & BLK_TC_ACT(BLK_TC_PC))
			dump_trace_pc(t, pdi, pci);
		else
			dump_trace_fs(t, pdi, pci);
	}

	/* first event for this device establishes its reporting epoch */
	if (!pdi->events)
		pdi->first_reported_time = t->time;

	pdi->events++;

	/* message notifies are kept out of binary output unless requested */
	if (bin_output_msgs ||
			    !(t->action & BLK_TC_ACT(BLK_TC_NOTIFY) &&
			      t->action == BLK_TN_MESSAGE))
		output_binary(t, sizeof(*t) + t->pdu_len);
}
1664
/*
 * print in a proper way, not too small and not too big. if more than
 * 1000,000K, turn into M and so on
 *
 * Fixes: the suffix table skipped 'T', so terabyte-scale values were
 * mislabeled 'P'; and "%Lu" is a GNU-only length modifier, replaced by
 * the standard "%llu". @dst must hold at least ~16 bytes.
 */
static char *size_cnv(char *dst, unsigned long long num, int in_kb)
{
	char suff[] = { '\0', 'K', 'M', 'G', 'T', 'P' };
	unsigned int i = 0;

	/* input already in KiB: start one suffix up */
	if (in_kb)
		i++;

	while (num > 1000 * 1000ULL && (i < sizeof(suff) - 1)) {
		i++;
		num /= 1000;
	}

	/* ' requests locale-dependent thousands grouping (POSIX) */
	sprintf(dst, "%'8llu%c", num, suff[i]);
	return dst;
}
1685
/*
 * Print the aggregated IO statistics @ios under header @msg to the
 * output file. @pdi may be NULL (e.g. per-process or totals output),
 * in which case the queue depth lines are omitted.
 * NOTE: uses static scratch buffers, so not reentrant.
 */
static void dump_io_stats(struct per_dev_info *pdi, struct io_stats *ios,
			  char *msg)
{
	/* scratch for the two size_cnv() results per line */
	static char x[256], y[256];

	fprintf(ofp, "%s\n", msg);

	/* KiB totals fold the sub-KiB remainder counters back in */
	fprintf(ofp, " Reads Queued:    %s, %siB\t",
			size_cnv(x, ios->qreads, 0),
			size_cnv(y, ios->qread_kb + (ios->qread_b>>10), 1));
	fprintf(ofp, " Writes Queued:    %s, %siB\n",
			size_cnv(x, ios->qwrites, 0),
			size_cnv(y, ios->qwrite_kb + (ios->qwrite_b>>10), 1));
	fprintf(ofp, " Read Dispatches: %s, %siB\t",
			size_cnv(x, ios->ireads, 0),
			size_cnv(y, ios->iread_kb + (ios->iread_b>>10), 1));
	fprintf(ofp, " Write Dispatches: %s, %siB\n",
			size_cnv(x, ios->iwrites, 0),
			size_cnv(y, ios->iwrite_kb + (ios->iwrite_b>>10), 1));
	fprintf(ofp, " Reads Requeued:  %s\t\t", size_cnv(x, ios->rrqueue, 0));
	fprintf(ofp, " Writes Requeued:  %s\n", size_cnv(x, ios->wrqueue, 0));
	fprintf(ofp, " Reads Completed: %s, %siB\t",
			size_cnv(x, ios->creads, 0),
			size_cnv(y, ios->cread_kb + (ios->cread_b>>10), 1));
	fprintf(ofp, " Writes Completed: %s, %siB\n",
			size_cnv(x, ios->cwrites, 0),
			size_cnv(y, ios->cwrite_kb + (ios->cwrite_b>>10), 1));
	fprintf(ofp, " Read Merges:     %s, %siB\t",
			size_cnv(x, ios->mreads, 0),
			size_cnv(y, ios->mread_kb + (ios->mread_b>>10), 1));
	fprintf(ofp, " Write Merges:     %s, %siB\n",
			size_cnv(x, ios->mwrites, 0),
			size_cnv(y, ios->mwrite_kb + (ios->mwrite_b>>10), 1));
	if (pdi) {
		fprintf(ofp, " Read depth:      %'8u%8c\t", pdi->max_depth[0], ' ');
		fprintf(ofp, " Write depth:      %'8u\n", pdi->max_depth[1]);
	}
	/* pc (SCSI command) section only shown if any pc events were seen */
	if (ios->qreads_pc || ios->qwrites_pc || ios->ireads_pc || ios->iwrites_pc ||
	    ios->rrqueue_pc || ios->wrqueue_pc || ios->creads_pc || ios->cwrites_pc) {
		fprintf(ofp, " PC Reads Queued: %s, %siB\t",
			size_cnv(x, ios->qreads_pc, 0),
			size_cnv(y,
				ios->qread_kb_pc + (ios->qread_b_pc>>10), 1));
		fprintf(ofp, " PC Writes Queued: %s, %siB\n",
			size_cnv(x, ios->qwrites_pc, 0),
			size_cnv(y,
				ios->qwrite_kb_pc + (ios->qwrite_b_pc>>10), 1));
		fprintf(ofp, " PC Read Disp.:   %s, %siB\t",
			size_cnv(x, ios->ireads_pc, 0),
			size_cnv(y,
				ios->iread_kb_pc + (ios->iread_b_pc>>10), 1));
		fprintf(ofp, " PC Write Disp.:   %s, %siB\n",
			size_cnv(x, ios->iwrites_pc, 0),
			size_cnv(y,
				ios->iwrite_kb_pc + (ios->iwrite_b_pc>>10),
				1));
		fprintf(ofp, " PC Reads Req.:   %s\t\t", size_cnv(x, ios->rrqueue_pc, 0));
		fprintf(ofp, " PC Writes Req.:   %s\n", size_cnv(x, ios->wrqueue_pc, 0));
		fprintf(ofp, " PC Reads Compl.: %s\t\t", size_cnv(x, ios->creads_pc, 0));
		fprintf(ofp, " PC Writes Compl.: %s\n", size_cnv(x, ios->cwrites_pc, 0));
	}
	fprintf(ofp, " IO unplugs:      %'8lu%8c\t", ios->io_unplugs, ' ');
	fprintf(ofp, " Timer unplugs:    %'8lu\n", ios->timer_unplugs);
}
1750
1751 static void dump_wait_stats(struct per_process_info *ppi)
1752 {
1753         unsigned long rawait = ppi->longest_allocation_wait[0] / 1000;
1754         unsigned long rdwait = ppi->longest_dispatch_wait[0] / 1000;
1755         unsigned long rcwait = ppi->longest_completion_wait[0] / 1000;
1756         unsigned long wawait = ppi->longest_allocation_wait[1] / 1000;
1757         unsigned long wdwait = ppi->longest_dispatch_wait[1] / 1000;
1758         unsigned long wcwait = ppi->longest_completion_wait[1] / 1000;
1759
1760         fprintf(ofp, " Allocation wait: %'8lu%8c\t", rawait, ' ');
1761         fprintf(ofp, " Allocation wait:  %'8lu\n", wawait);
1762         fprintf(ofp, " Dispatch wait:   %'8lu%8c\t", rdwait, ' ');
1763         fprintf(ofp, " Dispatch wait:    %'8lu\n", wdwait);
1764         fprintf(ofp, " Completion wait: %'8lu%8c\t", rcwait, ' ');
1765         fprintf(ofp, " Completion wait:  %'8lu\n", wcwait);
1766 }
1767
1768 static int ppi_name_compare(const void *p1, const void *p2)
1769 {
1770         struct per_process_info *ppi1 = *((struct per_process_info **) p1);
1771         struct per_process_info *ppi2 = *((struct per_process_info **) p2);
1772         int res;
1773
1774         res = strverscmp(ppi1->ppm->comm, ppi2->ppm->comm);
1775         if (!res)
1776                 res = ppi1->ppm->pid > ppi2->ppm->pid;
1777
1778         return res;
1779 }
1780
/*
 * qsort comparator for the per-process list when an event-based sort
 * key was requested. Compares either a KiB total or an IO count,
 * depending on the selected per_process_stats_event; ascending order.
 */
static int ppi_event_compare(const void *p1, const void *p2)
{
	struct per_process_info *ppi1 = *((struct per_process_info **) p1);
	struct per_process_info *ppi2 = *((struct per_process_info **) p2);
	struct io_stats *ios1 = &ppi1->io_stats;
	struct io_stats *ios2 = &ppi2->io_stats;
	unsigned long io1, io2;
	unsigned long long kb1,kb2;
	int sort_by_kb = 1;

	/* pre-zero both key pairs so an unhandled key compares equal */
	io1 = io2 = 0;
	kb1 = kb2 = 0;

	switch (per_process_stats_event) {
	case SORT_PROG_EVENT_QKB: /* KB: Queued read and write */
		kb1 = ios1->qwrite_kb + (ios1->qwrite_b>>10) +
			ios1->qread_kb + (ios1->qread_b>>10);
		kb2 = ios2->qwrite_kb + (ios2->qwrite_b>>10) +
			ios2->qread_kb + (ios2->qread_b>>10);
		break;
	case SORT_PROG_EVENT_RKB: /* KB: Queued Read */
		kb1 = ios1->qread_kb + (ios1->qread_b>>10);
		kb2 = ios2->qread_kb + (ios2->qread_b>>10);
		break;
	case SORT_PROG_EVENT_WKB: /* KB: Queued Write */
		kb1 = ios1->qwrite_kb + (ios1->qwrite_b>>10);
		kb2 = ios2->qwrite_kb + (ios2->qwrite_b>>10);
		break;
	case SORT_PROG_EVENT_CKB: /* KB: Complete */
		kb1 = ios1->cwrite_kb + (ios1->cwrite_b>>10) +
			ios1->cread_kb + (ios1->cread_b>>10);
		kb2 = ios2->cwrite_kb + (ios2->cwrite_b>>10) +
			ios2->cread_kb + (ios2->cread_b>>10);
		break;
	case SORT_PROG_EVENT_QIO: /* IO: Queued read and write */
		sort_by_kb = 0;
		io1 = ios1->qreads + ios1->qwrites;
		io2 = ios2->qreads + ios2->qwrites;
		break;
	case SORT_PROG_EVENT_RIO: /* IO: Queued Read */
		sort_by_kb = 0;
		io1 = ios1->qreads;
		io2 = ios2->qreads;
		break;
	case SORT_PROG_EVENT_WIO: /* IO: Queued Write */
		sort_by_kb = 0;
		io1 = ios1->qwrites;
		io2 = ios2->qwrites;
		break;
	case SORT_PROG_EVENT_CIO: /* IO: Complete */
		sort_by_kb = 0;
		io1 = ios1->creads + ios1->cwrites;
		io2 = ios2->creads + ios2->cwrites;
		break;
	}


	/* compare kb */
	if (sort_by_kb) {
		if (kb1 > kb2)
			return 1;
		else if (kb1 == kb2)
			return 0;
		return -1;
	}

	/* compare io */
	if (io1 > io2)
		return 1;
	else if (io1 == io2)
		return 0;
	return -1;
}
1854
1855 static int ppi_compare(const void *p1, const void *p2)
1856 {
1857         if (per_process_stats_event == SORT_PROG_EVENT_N)
1858                 return ppi_name_compare(p1, p2);
1859
1860         return ppi_event_compare(p1, p2);
1861 }
1862
1863 static void sort_process_list(void)
1864 {
1865         struct per_process_info **ppis;
1866         struct per_process_info *ppi;
1867         int i = 0;
1868
1869         ppis = malloc(ppi_list_entries * sizeof(struct per_process_info *));
1870
1871         ppi = ppi_list;
1872         while (ppi) {
1873                 ppis[i++] = ppi;
1874                 ppi = ppi->list_next;
1875         }
1876
1877         qsort(ppis, ppi_list_entries, sizeof(ppi), ppi_compare);
1878
1879         i = ppi_list_entries - 1;
1880         ppi_list = NULL;
1881         while (i >= 0) {
1882                 ppi = ppis[i];
1883
1884                 ppi->list_next = ppi_list;
1885                 ppi_list = ppi;
1886                 i--;
1887         }
1888
1889         free(ppis);
1890 }
1891
1892 static void show_process_stats(void)
1893 {
1894         struct per_process_info *ppi;
1895
1896         sort_process_list();
1897
1898         ppi = ppi_list;
1899         while (ppi) {
1900                 struct process_pid_map *ppm = ppi->ppm;
1901                 char name[64];
1902
1903                 if (ppi->more_than_one)
1904                         sprintf(name, "%s (%u, ...)", ppm->comm, ppm->pid);
1905                 else
1906                         sprintf(name, "%s (%u)", ppm->comm, ppm->pid);
1907
1908                 dump_io_stats(NULL, &ppi->io_stats, name);
1909                 dump_wait_stats(ppi);
1910                 ppi = ppi->list_next;
1911         }
1912
1913         fprintf(ofp, "\n");
1914 }
1915
/*
 * Print per-cpu and per-device summary statistics for every traced
 * device: one dump_io_stats() section per cpu that saw events, a
 * summed "Total" section when several cpus contributed, followed by
 * throughput, event count and sequence-skip figures.
 */
static void show_device_and_cpu_stats(void)
{
        struct per_dev_info *pdi;
        struct per_cpu_info *pci;
        struct io_stats total, *ios;
        unsigned long long rrate, wrate, msec;
        int i, j, pci_events;
        char line[3 + 8/*cpu*/ + 2 + 32/*dev*/ + 3];
        char name[32];
        double ratio;

        for (pdi = devices, i = 0; i < ndevices; i++, pdi++) {

                memset(&total, 0, sizeof(total));
                pci_events = 0;

                if (i > 0)
                        fprintf(ofp, "\n");

                for (pci = pdi->cpus, j = 0; j < pdi->ncpus; j++, pci++) {
                        /* cpus that saw no events are omitted entirely */
                        if (!pci->nelems)
                                continue;

                        /* accumulate this cpu's counters into the device total */
                        ios = &pci->io_stats;
                        total.qreads += ios->qreads;
                        total.qwrites += ios->qwrites;
                        total.creads += ios->creads;
                        total.cwrites += ios->cwrites;
                        total.mreads += ios->mreads;
                        total.mwrites += ios->mwrites;
                        total.ireads += ios->ireads;
                        total.iwrites += ios->iwrites;
                        total.rrqueue += ios->rrqueue;
                        total.wrqueue += ios->wrqueue;
                        total.qread_kb += ios->qread_kb;
                        total.qwrite_kb += ios->qwrite_kb;
                        total.cread_kb += ios->cread_kb;
                        total.cwrite_kb += ios->cwrite_kb;
                        total.iread_kb += ios->iread_kb;
                        total.iwrite_kb += ios->iwrite_kb;
                        total.mread_kb += ios->mread_kb;
                        total.mwrite_kb += ios->mwrite_kb;
                        total.qread_b += ios->qread_b;
                        total.qwrite_b += ios->qwrite_b;
                        total.cread_b += ios->cread_b;
                        total.cwrite_b += ios->cwrite_b;
                        total.iread_b += ios->iread_b;
                        total.iwrite_b += ios->iwrite_b;
                        total.mread_b += ios->mread_b;
                        total.mwrite_b += ios->mwrite_b;

                        total.qreads_pc += ios->qreads_pc;
                        total.qwrites_pc += ios->qwrites_pc;
                        total.creads_pc += ios->creads_pc;
                        total.cwrites_pc += ios->cwrites_pc;
                        total.ireads_pc += ios->ireads_pc;
                        total.iwrites_pc += ios->iwrites_pc;
                        total.rrqueue_pc += ios->rrqueue_pc;
                        total.wrqueue_pc += ios->wrqueue_pc;
                        total.qread_kb_pc += ios->qread_kb_pc;
                        total.qwrite_kb_pc += ios->qwrite_kb_pc;
                        total.iread_kb_pc += ios->iread_kb_pc;
                        total.iwrite_kb_pc += ios->iwrite_kb_pc;
                        total.qread_b_pc += ios->qread_b_pc;
                        total.qwrite_b_pc += ios->qwrite_b_pc;
                        total.iread_b_pc += ios->iread_b_pc;
                        total.iwrite_b_pc += ios->iwrite_b_pc;

                        total.timer_unplugs += ios->timer_unplugs;
                        total.io_unplugs += ios->io_unplugs;

                        snprintf(line, sizeof(line) - 1, "CPU%d (%s):",
                                 j, get_dev_name(pdi, name, sizeof(name)));
                        dump_io_stats(pdi, ios, line);
                        pci_events++;
                }

                /* a total line only makes sense with more than one cpu */
                if (pci_events > 1) {
                        fprintf(ofp, "\n");
                        snprintf(line, sizeof(line) - 1, "Total (%s):",
                                 get_dev_name(pdi, name, sizeof(name)));
                        dump_io_stats(NULL, &total, line);
                }

                /* approximate completed-IO throughput in KiB/s over the
                 * reported time span (times are in nanoseconds) */
                wrate = rrate = 0;
                msec = (pdi->last_reported_time - pdi->first_reported_time) / 1000000;
                if (msec) {
                        rrate = ((1000 * total.cread_kb) + total.cread_b) /
                                                                        msec;
                        wrate = ((1000 * total.cwrite_kb) + total.cwrite_b) /
                                                                        msec;
                }

                fprintf(ofp, "\nThroughput (R/W): %'LuKiB/s / %'LuKiB/s\n",
                        rrate, wrate);
                fprintf(ofp, "Events (%s): %'Lu entries\n",
                        get_dev_name(pdi, line, sizeof(line)), pdi->events);

                /* fold any outstanding per-cpu skip records into pdi */
                collect_pdi_skips(pdi);
                if (!pdi->skips && !pdi->events)
                        ratio = 0.0;
                else
                        ratio = 100.0 * ((double)pdi->seq_skips /
                                        (double)(pdi->events + pdi->seq_skips));
                fprintf(ofp, "Skips: %'lu forward (%'llu - %5.1lf%%)\n",
                        pdi->skips, pdi->seq_skips, ratio);
        }
}
2024
2025 static void correct_abs_start_time(void)
2026 {
2027         long delta = genesis_time - start_timestamp;
2028
2029         abs_start_time.tv_sec  += SECONDS(delta);
2030         abs_start_time.tv_nsec += NANO_SECONDS(delta);
2031         if (abs_start_time.tv_nsec < 0) {
2032                 abs_start_time.tv_nsec += 1000000000;
2033                 abs_start_time.tv_sec -= 1;
2034         } else
2035         if (abs_start_time.tv_nsec > 1000000000) {
2036                 abs_start_time.tv_nsec -= 1000000000;
2037                 abs_start_time.tv_sec += 1;
2038         }
2039 }
2040
2041 static void find_genesis(void)
2042 {
2043         struct trace *t = trace_list;
2044
2045         genesis_time = -1ULL;
2046         while (t != NULL) {
2047                 if (t->bit->time < genesis_time)
2048                         genesis_time = t->bit->time;
2049
2050                 t = t->next;
2051         }
2052
2053         /* The time stamp record will usually be the first
2054          * record in the trace, but not always.
2055          */
2056         if (start_timestamp
2057          && start_timestamp != genesis_time) {
2058                 correct_abs_start_time();
2059         }
2060 }
2061
2062 static inline int check_stopwatch(struct blk_io_trace *bit)
2063 {
2064         if (bit->time < stopwatch_end &&
2065             bit->time >= stopwatch_start)
2066                 return 0;
2067
2068         return 1;
2069 }
2070
2071 /*
2072  * return youngest entry read
2073  */
2074 static int sort_entries(unsigned long long *youngest)
2075 {
2076         struct per_dev_info *pdi = NULL;
2077         struct per_cpu_info *pci = NULL;
2078         struct trace *t;
2079
2080         if (!genesis_time)
2081                 find_genesis();
2082
2083         *youngest = 0;
2084         while ((t = trace_list) != NULL) {
2085                 struct blk_io_trace *bit = t->bit;
2086
2087                 trace_list = t->next;
2088
2089                 bit->time -= genesis_time;
2090
2091                 if (bit->time < *youngest || !*youngest)
2092                         *youngest = bit->time;
2093
2094                 if (!pdi || pdi->dev != bit->device) {
2095                         pdi = get_dev_info(bit->device);
2096                         pci = NULL;
2097                 }
2098
2099                 if (!pci || pci->cpu != bit->cpu)
2100                         pci = get_cpu_info(pdi, bit->cpu);
2101
2102                 if (bit->sequence < pci->smallest_seq_read)
2103                         pci->smallest_seq_read = bit->sequence;
2104
2105                 if (check_stopwatch(bit)) {
2106                         bit_free(bit);
2107                         t_free(t);
2108                         continue;
2109                 }
2110
2111                 if (trace_rb_insert_sort(t))
2112                         return -1;
2113         }
2114
2115         return 0;
2116 }
2117
2118 /*
2119  * to continue, we must have traces from all online cpus in the tree
2120  */
2121 static int check_cpu_map(struct per_dev_info *pdi)
2122 {
2123         unsigned long *cpu_map;
2124         struct rb_node *n;
2125         struct trace *__t;
2126         unsigned int i;
2127         int ret, cpu;
2128
2129         /*
2130          * create a map of the cpus we have traces for
2131          */
2132         cpu_map = malloc(pdi->cpu_map_max / sizeof(long));
2133         memset(cpu_map, 0, sizeof(*cpu_map));
2134         n = rb_first(&rb_sort_root);
2135         while (n) {
2136                 __t = rb_entry(n, struct trace, rb_node);
2137                 cpu = __t->bit->cpu;
2138
2139                 cpu_map[CPU_IDX(cpu)] |= (1UL << CPU_BIT(cpu));
2140                 n = rb_next(n);
2141         }
2142
2143         /*
2144          * we can't continue if pdi->cpu_map has entries set that we don't
2145          * have in the sort rbtree. the opposite is not a problem, though
2146          */
2147         ret = 0;
2148         for (i = 0; i < pdi->cpu_map_max / CPUS_PER_LONG; i++) {
2149                 if (pdi->cpu_map[i] & ~(cpu_map[i])) {
2150                         ret = 1;
2151                         break;
2152                 }
2153         }
2154
2155         free(cpu_map);
2156         return ret;
2157 }
2158
/*
 * Decide whether trace @t may be emitted as the next event for its cpu.
 *
 * Returns 0 when it may be emitted (possibly after recording the gap as
 * a skip), non-zero when the caller should wait for more input in the
 * hope the missing sequence numbers still arrive.  @force marks the
 * final pass, where gaps are recorded instead of waited on.
 */
static int check_sequence(struct per_dev_info *pdi, struct trace *t, int force)
{
        struct blk_io_trace *bit = t->bit;
        unsigned long expected_sequence;
        struct per_cpu_info *pci;
        struct trace *__t;

        pci = get_cpu_info(pdi, bit->cpu);
        expected_sequence = pci->last_sequence + 1;

        /* expected_sequence == 0: nothing emitted for this cpu yet */
        if (!expected_sequence) {
                /*
                 * 1 should be the first entry, just allow it
                 */
                if (bit->sequence == 1)
                        return 0;
                if (bit->sequence == pci->smallest_seq_read)
                        return 0;

                return check_cpu_map(pdi);
        }

        if (bit->sequence == expected_sequence)
                return 0;

        /*
         * we may not have seen that sequence yet. if we are not doing
         * the final run, break and wait for more entries.
         */
        if (expected_sequence < pci->smallest_seq_read) {
                /* the expected entry may still sit in the held-back tree */
                __t = trace_rb_find_last(pdi, pci, expected_sequence);
                if (!__t)
                        goto skip;

                __put_trace_last(pdi, __t);
                return 0;
        } else if (!force) {
                return 1;
        } else {
skip:
                /* final pass: account the gap as a sequence skip */
                if (check_current_skips(pci, bit->sequence))
                        return 0;

                if (expected_sequence < bit->sequence)
                        insert_skip(pci, expected_sequence, bit->sequence - 1);
                return 0;
        }
}
2207
/*
 * Emit traces from the sort rbtree in time order.
 *
 * With @force clear, stops at sequence gaps and at last_allowed_time so
 * later reads can fill the holes; with @force set (final pass) it emits
 * everything, recording unresolved gaps as skips via check_sequence().
 */
static void show_entries_rb(int force)
{
        struct per_dev_info *pdi = NULL;
        struct per_cpu_info *pci = NULL;
        struct blk_io_trace *bit;
        struct rb_node *n;
        struct trace *t;

        while ((n = rb_first(&rb_sort_root)) != NULL) {
                if (is_done() && !force && !pipeline)
                        break;

                t = rb_entry(n, struct trace, rb_node);
                bit = t->bit;

                /* entry hasn't aged one full read pass yet; wait unless forced */
                if (read_sequence - t->read_sequence < 1 && !force)
                        break;

                if (!pdi || pdi->dev != bit->device) {
                        pdi = get_dev_info(bit->device);
                        pci = NULL;
                }

                if (!pdi) {
                        fprintf(stderr, "Unknown device ID? (%d,%d)\n",
                                MAJOR(bit->device), MINOR(bit->device));
                        break;
                }

                /* message traces carry no meaningful sequence number */
                if (!(bit->action == BLK_TN_MESSAGE) &&
                    check_sequence(pdi, t, force))
                        break;

                if (!force && bit->time > last_allowed_time)
                        break;

                check_time(pdi, bit);

                if (!pci || pci->cpu != bit->cpu)
                        pci = get_cpu_info(pdi, bit->cpu);

                if (!(bit->action == BLK_TN_MESSAGE))
                        pci->last_sequence = bit->sequence;

                pci->nelems++;

                /* honor the action filter mask (-a/-A options) */
                if (bit->action & (act_mask << BLK_TC_SHIFT))
                        dump_trace(bit, pci, pdi);

                put_trace(pdi, t);
        }
}
2260
/*
 * Read exactly @bytes bytes from @fd into @buffer.
 *
 * @block selects blocking mode; *@fdblock caches the mode currently set
 * on the fd so O_NONBLOCK is only toggled via fcntl() when it changes.
 *
 * Returns 0 on success, 1 on EOF (or when nothing was available in
 * non-blocking mode before any data arrived), -1 on a read error.
 * Partial reads are never returned: once data starts arriving, the
 * remainder is waited for.
 */
static int read_data(int fd, void *buffer, int bytes, int block, int *fdblock)
{
        int ret, bytes_left, fl;
        char *p;        /* char *: arithmetic on void * is a GNU extension */

        if (block != *fdblock) {
                fl = fcntl(fd, F_GETFL);

                if (!block) {
                        *fdblock = 0;
                        fcntl(fd, F_SETFL, fl | O_NONBLOCK);
                } else {
                        *fdblock = 1;
                        fcntl(fd, F_SETFL, fl & ~O_NONBLOCK);
                }
        }

        bytes_left = bytes;
        p = buffer;
        while (bytes_left > 0) {
                ret = read(fd, p, bytes_left);
                if (!ret)
                        return 1;
                else if (ret < 0) {
                        if (errno != EAGAIN) {
                                perror("read");
                                return -1;
                        }

                        /*
                         * never do partial reads. we can return if we
                         * didn't read anything and we should not block,
                         * otherwise wait for data
                         */
                        if ((bytes_left == bytes) && !block)
                                return 1;

                        usleep(10);
                        continue;
                } else {
                        p += ret;
                        bytes_left -= ret;
                }
        }

        return 0;
}
2308
2309 static inline __u16 get_pdulen(struct blk_io_trace *bit)
2310 {
2311         if (data_is_native)
2312                 return bit->pdu_len;
2313
2314         return __bswap_16(bit->pdu_len);
2315 }
2316
2317 static inline __u32 get_magic(struct blk_io_trace *bit)
2318 {
2319         if (data_is_native)
2320                 return bit->magic;
2321
2322         return __bswap_32(bit->magic);
2323 }
2324
2325 static int read_events(int fd, int always_block, int *fdblock)
2326 {
2327         struct per_dev_info *pdi = NULL;
2328         unsigned int events = 0;
2329
2330         while (!is_done() && events < rb_batch) {
2331                 struct blk_io_trace *bit;
2332                 struct trace *t;
2333                 int pdu_len, should_block, ret;
2334                 __u32 magic;
2335
2336                 bit = bit_alloc();
2337
2338                 should_block = !events || always_block;
2339
2340                 ret = read_data(fd, bit, sizeof(*bit), should_block, fdblock);
2341                 if (ret) {
2342                         bit_free(bit);
2343                         if (!events && ret < 0)
2344                                 events = ret;
2345                         break;
2346                 }
2347
2348                 /*
2349                  * look at first trace to check whether we need to convert
2350                  * data in the future
2351                  */
2352                 if (data_is_native == -1 && check_data_endianness(bit->magic))
2353                         break;
2354
2355                 magic = get_magic(bit);
2356                 if ((magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
2357                         fprintf(stderr, "Bad magic %x\n", magic);
2358                         break;
2359                 }
2360
2361                 pdu_len = get_pdulen(bit);
2362                 if (pdu_len) {
2363                         void *ptr = realloc(bit, sizeof(*bit) + pdu_len);
2364
2365                         if (read_data(fd, ptr + sizeof(*bit), pdu_len, 1, fdblock)) {
2366                                 bit_free(ptr);
2367                                 break;
2368                         }
2369
2370                         bit = ptr;
2371                 }
2372
2373                 trace_to_cpu(bit);
2374
2375                 if (verify_trace(bit)) {
2376                         bit_free(bit);
2377                         continue;
2378                 }
2379
2380                 /*
2381                  * not a real trace, so grab and handle it here
2382                  */
2383                 if (bit->action & BLK_TC_ACT(BLK_TC_NOTIFY) && bit->action != BLK_TN_MESSAGE) {
2384                         handle_notify(bit);
2385                         output_binary(bit, sizeof(*bit) + bit->pdu_len);
2386                         continue;
2387                 }
2388
2389                 t = t_alloc();
2390                 memset(t, 0, sizeof(*t));
2391                 t->bit = bit;
2392                 t->read_sequence = read_sequence;
2393
2394                 t->next = trace_list;
2395                 trace_list = t;
2396
2397                 if (!pdi || pdi->dev != bit->device)
2398                         pdi = get_dev_info(bit->device);
2399
2400                 if (bit->time > pdi->last_read_time)
2401                         pdi->last_read_time = bit->time;
2402
2403                 events++;
2404         }
2405
2406         return events;
2407 }
2408
2409 /*
2410  * Managing input streams
2411  */
2412
2413 struct ms_stream {
2414         struct ms_stream *next;
2415         struct trace *first, *last;
2416         struct per_dev_info *pdi;
2417         unsigned int cpu;
2418 };
2419
2420 #define MS_HASH(d, c) ((MAJOR(d) & 0xff) ^ (MINOR(d) & 0xff) ^ (cpu & 0xff))
2421
2422 struct ms_stream *ms_head;
2423 struct ms_stream *ms_hash[256];
2424
2425 static void ms_sort(struct ms_stream *msp);
2426 static int ms_prime(struct ms_stream *msp);
2427
2428 static inline struct trace *ms_peek(struct ms_stream *msp)
2429 {
2430         return (msp == NULL) ? NULL : msp->first;
2431 }
2432
2433 static inline __u64 ms_peek_time(struct ms_stream *msp)
2434 {
2435         return ms_peek(msp)->bit->time;
2436 }
2437
2438 static inline void ms_resort(struct ms_stream *msp)
2439 {
2440         if (msp->next && ms_peek_time(msp) > ms_peek_time(msp->next)) {
2441                 ms_head = msp->next;
2442                 msp->next = NULL;
2443                 ms_sort(msp);
2444         }
2445 }
2446
2447 static inline void ms_deq(struct ms_stream *msp)
2448 {
2449         msp->first = msp->first->next;
2450         if (!msp->first) {
2451                 msp->last = NULL;
2452                 if (!ms_prime(msp)) {
2453                         ms_head = msp->next;
2454                         msp->next = NULL;
2455                         return;
2456                 }
2457         }
2458
2459         ms_resort(msp);
2460 }
2461
2462 static void ms_sort(struct ms_stream *msp)
2463 {
2464         __u64 msp_t = ms_peek_time(msp);
2465         struct ms_stream *this_msp = ms_head;
2466
2467         if (this_msp == NULL)
2468                 ms_head = msp;
2469         else if (msp_t < ms_peek_time(this_msp)) {
2470                 msp->next = this_msp;
2471                 ms_head = msp;
2472         }
2473         else {
2474                 while (this_msp->next && ms_peek_time(this_msp->next) < msp_t)
2475                         this_msp = this_msp->next;
2476
2477                 msp->next = this_msp->next;
2478                 this_msp->next = msp;
2479         }
2480 }
2481
2482 static int ms_prime(struct ms_stream *msp)
2483 {
2484         __u32 magic;
2485         unsigned int i;
2486         struct trace *t;
2487         struct per_dev_info *pdi = msp->pdi;
2488         struct per_cpu_info *pci = get_cpu_info(pdi, msp->cpu);
2489         struct blk_io_trace *bit = NULL;
2490         int ret, pdu_len, ndone = 0;
2491
2492         for (i = 0; !is_done() && pci->fd >= 0 && i < rb_batch; i++) {
2493                 bit = bit_alloc();
2494                 ret = read_data(pci->fd, bit, sizeof(*bit), 1, &pci->fdblock);
2495                 if (ret)
2496                         goto err;
2497
2498                 if (data_is_native == -1 && check_data_endianness(bit->magic))
2499                         goto err;
2500
2501                 magic = get_magic(bit);
2502                 if ((magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
2503                         fprintf(stderr, "Bad magic %x\n", magic);
2504                         goto err;
2505
2506                 }
2507
2508                 pdu_len = get_pdulen(bit);
2509                 if (pdu_len) {
2510                         void *ptr = realloc(bit, sizeof(*bit) + pdu_len);
2511                         ret = read_data(pci->fd, ptr + sizeof(*bit), pdu_len,
2512                                                              1, &pci->fdblock);
2513                         if (ret) {
2514                                 free(ptr);
2515                                 bit = NULL;
2516                                 goto err;
2517                         }
2518
2519                         bit = ptr;
2520                 }
2521
2522                 trace_to_cpu(bit);
2523                 if (verify_trace(bit))
2524                         goto err;
2525
2526                 if (bit->cpu != pci->cpu) {
2527                         fprintf(stderr, "cpu %d trace info has error cpu %d\n",
2528                                 pci->cpu, bit->cpu);
2529                         continue;
2530                 }
2531
2532                 if (bit->action & BLK_TC_ACT(BLK_TC_NOTIFY) && bit->action != BLK_TN_MESSAGE) {
2533                         handle_notify(bit);
2534                         output_binary(bit, sizeof(*bit) + bit->pdu_len);
2535                         bit_free(bit);
2536
2537                         i -= 1;
2538                         continue;
2539                 }
2540
2541                 if (bit->time > pdi->last_read_time)
2542                         pdi->last_read_time = bit->time;
2543
2544                 t = t_alloc();
2545                 memset(t, 0, sizeof(*t));
2546                 t->bit = bit;
2547
2548                 if (msp->first == NULL)
2549                         msp->first = msp->last = t;
2550                 else {
2551                         msp->last->next = t;
2552                         msp->last = t;
2553                 }
2554
2555                 ndone++;
2556         }
2557
2558         return ndone;
2559
2560 err:
2561         if (bit) bit_free(bit);
2562
2563         cpu_mark_offline(pdi, pci->cpu);
2564         close(pci->fd);
2565         pci->fd = -1;
2566
2567         return ndone;
2568 }
2569
2570 static struct ms_stream *ms_alloc(struct per_dev_info *pdi, int cpu)
2571 {
2572         struct ms_stream *msp = malloc(sizeof(*msp));
2573
2574         msp->next = NULL;
2575         msp->first = msp->last = NULL;
2576         msp->pdi = pdi;
2577         msp->cpu = cpu;
2578
2579         if (ms_prime(msp))
2580                 ms_sort(msp);
2581
2582         return msp;
2583 }
2584
/*
 * Open the per-cpu input file "<name>.blktrace.<cpu>" for this device
 * and create its merge stream.  Returns 1 when the file exists (even
 * empty), 0 when it doesn't or can't be opened -- the caller uses 0 as
 * the end-of-cpus marker.
 */
static int setup_file(struct per_dev_info *pdi, int cpu)
{
        int len = 0;
        struct stat st;
        char *p, *dname;
        struct per_cpu_info *pci = get_cpu_info(pdi, cpu);

        pci->cpu = cpu;
        pci->fdblock = -1;

        /*
         * Split a path-qualified name into directory + basename.  The
         * first strdup'ed buffer is deliberately not freed when a
         * directory is present: dirname() may return a pointer into it
         * and input_dir keeps referencing it.
         */
        p = strdup(pdi->name);
        dname = dirname(p);
        if (strcmp(dname, ".")) {
                input_dir = dname;
                p = strdup(pdi->name);
                strcpy(pdi->name, basename(p));
        }
        free(p);

        if (input_dir)
                len = sprintf(pci->fname, "%s/", input_dir);

        snprintf(pci->fname + len, sizeof(pci->fname)-1-len,
                 "%s.blktrace.%d", pdi->name, pci->cpu);
        if (stat(pci->fname, &st) < 0)
                return 0;
        /* an existing but empty file still counts as "cpu present" */
        if (!st.st_size)
                return 1;

        pci->fd = open(pci->fname, O_RDONLY);
        if (pci->fd < 0) {
                perror(pci->fname);
                return 0;
        }

        printf("Input file %s added\n", pci->fname);
        cpu_mark_online(pdi, pci->cpu);

        pdi->nfiles++;
        ms_alloc(pdi, pci->cpu);

        return 1;
}
2628
2629 static int handle(struct ms_stream *msp)
2630 {
2631         struct trace *t;
2632         struct per_dev_info *pdi;
2633         struct per_cpu_info *pci;
2634         struct blk_io_trace *bit;
2635
2636         t = ms_peek(msp);
2637
2638         bit = t->bit;
2639         pdi = msp->pdi;
2640         pci = get_cpu_info(pdi, msp->cpu);
2641         pci->nelems++;
2642         bit->time -= genesis_time;
2643
2644         if (t->bit->time > stopwatch_end)
2645                 return 0;
2646
2647         pdi->last_reported_time = bit->time;
2648         if ((bit->action & (act_mask << BLK_TC_SHIFT))&&
2649             t->bit->time >= stopwatch_start)
2650                 dump_trace(bit, pci, pdi);
2651
2652         ms_deq(msp);
2653
2654         if (text_output)
2655                 trace_rb_insert_last(pdi, t);
2656         else {
2657                 bit_free(t->bit);
2658                 t_free(t);
2659         }
2660
2661         return 1;
2662 }
2663
2664 /*
2665  * Check if we need to sanitize the name. We allow 'foo', or if foo.blktrace.X
2666  * is given, then strip back down to 'foo' to avoid missing files.
2667  */
2668 static int name_fixup(char *name)
2669 {
2670         char *b;
2671
2672         if (!name)
2673                 return 1;
2674
2675         b = strstr(name, ".blktrace.");
2676         if (b)
2677                 *b = '\0';
2678
2679         return 0;
2680 }
2681
/*
 * File-based (non-pipe) processing: open every per-cpu trace file for
 * every named device, then merge-process the streams in timestamp
 * order.  Returns 0 on success, non-zero on setup failure.
 */
static int do_file(void)
{
        int i, cpu, ret;
        struct per_dev_info *pdi;

        /*
         * first prepare all files for reading
         */
        for (i = 0; i < ndevices; i++) {
                pdi = &devices[i];
                ret = name_fixup(pdi->name);
                if (ret)
                        return ret;

                /* probe cpu files in order until the first missing one */
                for (cpu = 0; setup_file(pdi, cpu); cpu++)
                        ;

                if (!cpu) {
                        fprintf(stderr,"No input files found for %s\n",
                                pdi->name);
                        return 1;
                }
        }

        /*
         * Get the initial time stamp
         */
        if (ms_head)
                genesis_time = ms_peek_time(ms_head);

        /*
         * Keep processing traces while any are left
         */
        while (!is_done() && ms_head && handle(ms_head))
                ;

        return 0;
}
2720
2721 static void do_pipe(int fd)
2722 {
2723         unsigned long long youngest;
2724         int events, fdblock;
2725
2726         last_allowed_time = -1ULL;
2727         fdblock = -1;
2728         while ((events = read_events(fd, 0, &fdblock)) > 0) {
2729                 read_sequence++;
2730         
2731 #if 0
2732                 smallest_seq_read = -1U;
2733 #endif
2734
2735                 if (sort_entries(&youngest))
2736                         break;
2737
2738                 if (youngest > stopwatch_end)
2739                         break;
2740
2741                 show_entries_rb(0);
2742         }
2743
2744         if (rb_sort_entries)
2745                 show_entries_rb(1);
2746 }
2747
2748 static int do_fifo(void)
2749 {
2750         int fd;
2751
2752         if (!strcmp(pipename, "-"))
2753                 fd = dup(STDIN_FILENO);
2754         else
2755                 fd = open(pipename, O_RDONLY);
2756
2757         if (fd == -1) {
2758                 perror("dup stdin");
2759                 return -1;
2760         }
2761
2762         do_pipe(fd);
2763         close(fd);
2764         return 0;
2765 }
2766
2767 static void show_stats(void)
2768 {
2769         if (!ofp)
2770                 return;
2771         if (stats_printed)
2772                 return;
2773
2774         stats_printed = 1;
2775
2776         if (per_process_stats)
2777                 show_process_stats();
2778
2779         if (per_device_and_cpu_stats)
2780                 show_device_and_cpu_stats();
2781
2782         fflush(ofp);
2783 }
2784
/*
 * Signal handler for SIGINT/SIGHUP/SIGTERM (installed in main):
 * request a graceful shutdown by setting the global 'done' flag,
 * which the processing loops check (see is_done() in do_file).
 */
static void handle_sigint(__attribute__((__unused__)) int sig)
{
        done = 1;
}
2789
2790 /*
2791  * Extract start and duration times from a string, allowing
2792  * us to specify a time interval of interest within a trace.
2793  * Format: "duration" (start is zero) or "start:duration".
2794  */
2795 static int find_stopwatch_interval(char *string)
2796 {
2797         double value;
2798         char *sp;
2799
2800         value = strtod(string, &sp);
2801         if (sp == string) {
2802                 fprintf(stderr,"Invalid stopwatch timer: %s\n", string);
2803                 return 1;
2804         }
2805         if (*sp == ':') {
2806                 stopwatch_start = DOUBLE_TO_NANO_ULL(value);
2807                 string = sp + 1;
2808                 value = strtod(string, &sp);
2809                 if (sp == string || *sp != '\0') {
2810                         fprintf(stderr,"Invalid stopwatch duration time: %s\n",
2811                                 string);
2812                         return 1;
2813                 }
2814         } else if (*sp != '\0') {
2815                 fprintf(stderr,"Invalid stopwatch start timer: %s\n", string);
2816                 return 1;
2817         }
2818         stopwatch_end = DOUBLE_TO_NANO_ULL(value);
2819         if (stopwatch_end <= stopwatch_start) {
2820                 fprintf(stderr, "Invalid stopwatch interval: %Lu -> %Lu\n",
2821                         stopwatch_start, stopwatch_end);
2822                 return 1;
2823         }
2824
2825         return 0;
2826 }
2827
/*
 * Decide whether an input name refers to piped input: either the
 * literal "-" (stdin) or an existing FIFO on the filesystem.
 */
static int is_pipe(const char *str)
{
	struct stat sb;

	if (strcmp(str, "-") == 0)
		return 1;

	return stat(str, &sb) == 0 && S_ISFIFO(sb.st_mode);
}
2839
2840 static int get_program_sort_event(const char *str)
2841 {
2842         char evt = str[0];
2843
2844         switch (evt) {
2845         case 'N':
2846                 per_process_stats_event = SORT_PROG_EVENT_N;
2847                 break;
2848         case 'Q':
2849                 per_process_stats_event = SORT_PROG_EVENT_QKB;
2850                 break;
2851         case 'q':
2852                 per_process_stats_event = SORT_PROG_EVENT_QIO;
2853                 break;
2854         case 'R':
2855                 per_process_stats_event = SORT_PROG_EVENT_RKB;
2856                 break;
2857         case 'r':
2858                 per_process_stats_event = SORT_PROG_EVENT_RIO;
2859                 break;
2860         case 'W':
2861                 per_process_stats_event = SORT_PROG_EVENT_WKB;
2862                 break;
2863         case 'w':
2864                 per_process_stats_event = SORT_PROG_EVENT_WIO;
2865                 break;
2866         case 'C':
2867                 per_process_stats_event = SORT_PROG_EVENT_CKB;
2868                 break;
2869         case 'c':
2870                 per_process_stats_event = SORT_PROG_EVENT_CIO;
2871                 break;
2872         default:
2873                 return 1;
2874         }
2875
2876         return 0;
2877 }
2878
/* short-option string and help text; keep in sync with l_opts */
#define S_OPTS  "a:A:b:D:d:f:F:hi:o:OqsS:tw:vVM"
static char usage_str[] =    "\n\n" \
	"-i <file>           | --input=<file>\n" \
	"[ -a <action field> | --act-mask=<action field> ]\n" \
	"[ -A <action mask>  | --set-mask=<action mask> ]\n" \
	"[ -b <traces>       | --batch=<traces> ]\n" \
	"[ -d <file>         | --dump-binary=<file> ]\n" \
	"[ -D <dir>          | --input-directory=<dir> ]\n" \
	"[ -f <format>       | --format=<format> ]\n" \
	"[ -F <spec>         | --format-spec=<spec> ]\n" \
	"[ -h                | --hash-by-name ]\n" \
	"[ -o <file>         | --output=<file> ]\n" \
	"[ -O                | --no-text-output ]\n" \
	"[ -q                | --quiet ]\n" \
	"[ -s                | --per-program-stats ]\n" \
	"[ -S <event>        | --sort-program-stats=<event> ]\n" \
	"[ -t                | --track-ios ]\n" \
	"[ -w <time>         | --stopwatch=<time> ]\n" \
	"[ -M                | --no-msgs ]\n" \
	"[ -v                | --verbose ]\n" \
	"[ -V                | --version ]\n\n" \
	"\t-a Only trace specified actions. See documentation\n" \
	"\t-A Give trace mask as a single value. See documentation\n" \
	"\t-b stdin read batching\n" \
	"\t-d Output file. If specified, binary data is written to file\n" \
	"\t-D Directory to prepend to input file names\n" \
	"\t-f Output format. Customize the output format. The format field\n" \
	"\t   identifies can be found in the documentation\n" \
	"\t-F Format specification. Can be found in the documentation\n" \
	"\t-h Hash processes by name, not pid\n" \
	"\t-i Input file containing trace data, or '-' for stdin\n" \
	"\t-o Output file. If not given, output is stdout\n" \
	"\t-O Do NOT output text data\n" \
	"\t-q Quiet. Don't display any stats at the end of the trace\n" \
	"\t-s Show per-program io statistics\n" \
	"\t-S Show per-program io statistics sorted by N/Q/q/R/r/W/w/C/c\n" \
	"\t   N:Name, Q/q:Queued(read & write), R/r:Queued Read, W/w:Queued Write, C/c:Complete.\n" \
	"\t   Sort programs by how much data(KB): Q,R,W,C.\n" \
	"\t   Sort programs by how many IO operations: q,r,w,c.\n" \
	"\t   if -S was used, the -s parameter will be ignored.\n" \
	"\t-t Track individual ios. Will tell you the time a request took\n" \
	"\t   to get queued, to get dispatched, and to get completed\n" \
	"\t-w Only parse data between the given time interval in seconds.\n" \
	"\t   If 'start' isn't given, blkparse defaults the start time to 0\n" \
	"\t-M Do not output messages to binary file\n" \
	"\t-v More verbose for marginal errors\n" \
	"\t-V Print program version info\n\n";
2926
/*
 * Print the command line usage/help text (usage_str) to stderr,
 * prefixed with the program name.
 */
static void usage(char *prog)
{
        fprintf(stderr, "Usage: %s %s", prog, usage_str);
}
2931
/*
 * Entry point: parse the command line, set up the text and binary
 * output streams, process the trace input either live from a pipe
 * (do_fifo) or from per-cpu files on disk (do_file), then print the
 * end-of-run statistics.
 */
int main(int argc, char *argv[])
{
	int i, c, ret, mode;
	int act_mask_tmp = 0;
	char *ofp_buffer = NULL;
	char *bin_ofp_buffer = NULL;

	while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) {
		switch (c) {
		case 'a':
			/* -a: accumulate named action masks (may repeat) */
			i = find_mask_map(optarg);
			if (i < 0) {
				fprintf(stderr,"Invalid action mask %s\n",
					optarg);
				return 1;
			}
			act_mask_tmp |= i;
			break;

		case 'A':
			/* -A: replace the whole mask with a hex value */
			if ((sscanf(optarg, "%x", &i) != 1) || 
							!valid_act_opt(i)) {
				fprintf(stderr,
					"Invalid set action mask %s/0x%x\n",
					optarg, i);
				return 1;
			}
			act_mask_tmp = i;
			break;
		case 'i':
			/* -i: "-" or a fifo selects pipe mode (once); else a device */
			if (is_pipe(optarg) && !pipeline) {
				pipeline = 1;
				pipename = strdup(optarg);
			} else if (resize_devices(optarg) != 0)
				return 1;
			break;
		case 'D':
			input_dir = optarg;
			break;
		case 'o':
			output_name = optarg;
			break;
		case 'O':
			text_output = 0;
			break;
		case 'b':
			rb_batch = atoi(optarg);
			if (rb_batch <= 0)
				rb_batch = RB_BATCH_DEFAULT;
			break;
		case 's':
			per_process_stats = 1;
			break;
		case 'S':
			/* -S implies -s, plus a sort key for the stats */
			per_process_stats = 1;
			if (get_program_sort_event(optarg))
				return 1;
			break;
		case 't':
			track_ios = 1;
			break;
		case 'q':
			per_device_and_cpu_stats = 0;
			break;
		case 'w':
			if (find_stopwatch_interval(optarg) != 0)
				return 1;
			break;
		case 'f':
			set_all_format_specs(optarg);
			break;
		case 'F':
			if (add_format_spec(optarg) != 0)
				return 1;
			break;
		case 'h':
			ppi_hash_by_pid = 0;
			break;
		case 'v':
			verbose++;
			break;
		case 'V':
			printf("%s version %s\n", argv[0], blkparse_version);
			return 0;
		case 'd':
			dump_binary = optarg;
			break;
		case 'M':
			bin_output_msgs = 0;
			break;
		default:
			usage(argv[0]);
			return 1;
		}
	}

	/* remaining non-option arguments are extra inputs, like -i */
	while (optind < argc) {
		if (is_pipe(argv[optind]) && !pipeline) {
			pipeline = 1;
			pipename = strdup(argv[optind]);
		} else if (resize_devices(argv[optind]) != 0)
			return 1;
		optind++;
	}

	if (!pipeline && !ndevices) {
		usage(argv[0]);
		return 1;
	}

	/* only override the default action mask if -a/-A was given */
	if (act_mask_tmp != 0)
		act_mask = act_mask_tmp;

	memset(&rb_sort_root, 0, sizeof(rb_sort_root));

	/* let the user interrupt cleanly; handler just sets 'done' */
	signal(SIGINT, handle_sigint);
	signal(SIGHUP, handle_sigint);
	signal(SIGTERM, handle_sigint);

	/* presumably for thousands-grouping in stats output -- TODO confirm */
	setlocale(LC_NUMERIC, "en_US");

	if (text_output) {
		/* line-buffer interactive stdout, fully buffer file output */
		if (!output_name) {
			ofp = fdopen(STDOUT_FILENO, "w");
			mode = _IOLBF;
		} else {
			char ofname[PATH_MAX];

			snprintf(ofname, sizeof(ofname) - 1, "%s", output_name);
			ofp = fopen(ofname, "w");
			mode = _IOFBF;
		}

		if (!ofp) {
			perror("fopen");
			return 1;
		}

		/* NULL buffer on malloc failure is fine: setvbuf allocates */
		ofp_buffer = malloc(4096);
		if (setvbuf(ofp, ofp_buffer, mode, 4096)) {
			perror("setvbuf");
			return 1;
		}
	}

	/* -d: binary dump target, "-" meaning stdout, with a big buffer */
	if (dump_binary) {
		if (!strcmp(dump_binary, "-"))
			dump_fp = stdout;
		else {
			dump_fp = fopen(dump_binary, "w");
			if (!dump_fp) {
				perror(dump_binary);
				dump_binary = NULL;
				return 1;
			}
		}
		bin_ofp_buffer = malloc(128 * 1024);
		if (setvbuf(dump_fp, bin_ofp_buffer, _IOFBF, 128 * 1024)) {
			perror("setvbuf binary");
			return 1;
		}
	}

	if (pipeline)
		ret = do_fifo();
	else
		ret = do_file();

	if (!ret)
		show_stats();

	if (have_drv_data && !dump_binary)
		printf("\ndiscarded traces containing low-level device driver "
		       "specific data (only available in binary output)\n");

	/* flush each stream before freeing the buffer we lent it */
	if (ofp_buffer) {
		fflush(ofp);
		free(ofp_buffer);
	}
	if (bin_ofp_buffer) {
		fflush(dump_fp);
		free(bin_ofp_buffer);
	}
	return ret;
}