Too small arrays for file names
[blktrace.git] / blkparse.c
1 /*
2  * block queue tracing parse application
3  *
4  * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
5  * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
6  *
7  *  This program is free software; you can redistribute it and/or modify
8  *  it under the terms of the GNU General Public License as published by
9  *  the Free Software Foundation; either version 2 of the License, or
10  *  (at your option) any later version.
11  *
12  *  This program is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *  GNU General Public License for more details.
16  *
17  *  You should have received a copy of the GNU General Public License
18  *  along with this program; if not, write to the Free Software
19  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
20  *
21  */
22 #include <sys/types.h>
23 #include <sys/stat.h>
24 #include <unistd.h>
25 #include <stdio.h>
26 #include <fcntl.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <getopt.h>
30 #include <errno.h>
31 #include <signal.h>
32 #include <locale.h>
33 #include <libgen.h>
34
35 #include "blktrace.h"
36 #include "rbtree.h"
37 #include "jhash.h"
38
39 static char blkparse_version[] = "1.0.4";
40
41 struct skip_info {
42         unsigned long start, end;
43         struct skip_info *prev, *next;
44 };
45
46 struct per_dev_info {
47         dev_t dev;
48         char *name;
49
50         int backwards;
51         unsigned long long events;
52         unsigned long long first_reported_time;
53         unsigned long long last_reported_time;
54         unsigned long long last_read_time;
55         struct io_stats io_stats;
56         unsigned long skips;
57         unsigned long long seq_skips;
58         unsigned int max_depth[2];
59         unsigned int cur_depth[2];
60
61         struct rb_root rb_track;
62
63         int nfiles;
64         int ncpus;
65
66         unsigned long *cpu_map;
67         unsigned int cpu_map_max;
68
69         struct per_cpu_info *cpus;
70 };
71
72 /*
73  * some duplicated effort here, we can unify this hash and the ppi hash later
74  */
75 struct process_pid_map {
76         pid_t pid;
77         char comm[16];
78         struct process_pid_map *hash_next, *list_next;
79 };
80
81 #define PPM_HASH_SHIFT  (8)
82 #define PPM_HASH_SIZE   (1 << PPM_HASH_SHIFT)
83 #define PPM_HASH_MASK   (PPM_HASH_SIZE - 1)
84 static struct process_pid_map *ppm_hash_table[PPM_HASH_SIZE];
85
86 struct per_process_info {
87         struct process_pid_map *ppm;
88         struct io_stats io_stats;
89         struct per_process_info *hash_next, *list_next;
90         int more_than_one;
91
92         /*
93          * individual io stats
94          */
95         unsigned long long longest_allocation_wait[2];
96         unsigned long long longest_dispatch_wait[2];
97         unsigned long long longest_completion_wait[2];
98 };
99
100 #define PPI_HASH_SHIFT  (8)
101 #define PPI_HASH_SIZE   (1 << PPI_HASH_SHIFT)
102 #define PPI_HASH_MASK   (PPI_HASH_SIZE - 1)
103 static struct per_process_info *ppi_hash_table[PPI_HASH_SIZE];
104 static struct per_process_info *ppi_list;
105 static int ppi_list_entries;
106
107 static struct option l_opts[] = {
108         {
109                 .name = "act-mask",
110                 .has_arg = required_argument,
111                 .flag = NULL,
112                 .val = 'a'
113         },
114         {
115                 .name = "set-mask",
116                 .has_arg = required_argument,
117                 .flag = NULL,
118                 .val = 'A'
119         },
120         {
121                 .name = "batch",
122                 .has_arg = required_argument,
123                 .flag = NULL,
124                 .val = 'b'
125         },
126         {
127                 .name = "input-directory",
128                 .has_arg = required_argument,
129                 .flag = NULL,
130                 .val = 'D'
131         },
132         {
133                 .name = "dump-binary",
134                 .has_arg = required_argument,
135                 .flag = NULL,
136                 .val = 'd'
137         },
138         {
139                 .name = "format",
140                 .has_arg = required_argument,
141                 .flag = NULL,
142                 .val = 'f'
143         },
144         {
145                 .name = "format-spec",
146                 .has_arg = required_argument,
147                 .flag = NULL,
148                 .val = 'F'
149         },
150         {
151                 .name = "hash-by-name",
152                 .has_arg = no_argument,
153                 .flag = NULL,
154                 .val = 'h'
155         },
156         {
157                 .name = "input",
158                 .has_arg = required_argument,
159                 .flag = NULL,
160                 .val = 'i'
161         },
162         {
163                 .name = "no-msgs",
164                 .has_arg = no_argument,
165                 .flag = NULL,
166                 .val = 'M'
167         },
168         {
169                 .name = "output",
170                 .has_arg = required_argument,
171                 .flag = NULL,
172                 .val = 'o'
173         },
174         {
175                 .name = "no-text-output",
176                 .has_arg = no_argument,
177                 .flag = NULL,
178                 .val = 'O'
179         },
180         {
181                 .name = "quiet",
182                 .has_arg = no_argument,
183                 .flag = NULL,
184                 .val = 'q'
185         },
186         {
187                 .name = "per-program-stats",
188                 .has_arg = no_argument,
189                 .flag = NULL,
190                 .val = 's'
191         },
192         {
193                 .name = "track-ios",
194                 .has_arg = no_argument,
195                 .flag = NULL,
196                 .val = 't'
197         },
198         {
199                 .name = "stopwatch",
200                 .has_arg = required_argument,
201                 .flag = NULL,
202                 .val = 'w'
203         },
204         {
205                 .name = "verbose",
206                 .has_arg = no_argument,
207                 .flag = NULL,
208                 .val = 'v'
209         },
210         {
211                 .name = "version",
212                 .has_arg = no_argument,
213                 .flag = NULL,
214                 .val = 'V'
215         },
216         {
217                 .name = NULL,
218         }
219 };
220
221 /*
222  * for sorting the displayed output
223  */
224 struct trace {
225         struct blk_io_trace *bit;
226         struct rb_node rb_node;
227         struct trace *next;
228         unsigned long read_sequence;
229 };
230
231 static struct rb_root rb_sort_root;
232 static unsigned long rb_sort_entries;
233
234 static struct trace *trace_list;
235
236 /*
237  * allocation cache
238  */
239 static struct blk_io_trace *bit_alloc_list;
240 static struct trace *t_alloc_list;
241
242 /*
243  * for tracking individual ios
244  */
245 struct io_track {
246         struct rb_node rb_node;
247
248         struct process_pid_map *ppm;
249         __u64 sector;
250         unsigned long long allocation_time;
251         unsigned long long queue_time;
252         unsigned long long dispatch_time;
253         unsigned long long completion_time;
254 };
255
256 static int ndevices;
257 static struct per_dev_info *devices;
258 static char *get_dev_name(struct per_dev_info *, char *, int);
259 static int trace_rb_insert_last(struct per_dev_info *, struct trace *);
260
261 FILE *ofp = NULL;
262 static char *output_name;
263 static char *input_dir;
264
265 static unsigned long long genesis_time;
266 static unsigned long long last_allowed_time;
267 static unsigned long long stopwatch_start;      /* start from zero by default */
268 static unsigned long long stopwatch_end = -1ULL;        /* "infinity" */
269 static unsigned long read_sequence;
270
271 static int per_process_stats;
272 static int per_device_and_cpu_stats = 1;
273 static int track_ios;
274 static int ppi_hash_by_pid = 1;
275 static int verbose;
276 static unsigned int act_mask = -1U;
277 static int stats_printed;
278 static int bin_output_msgs = 1;
279 int data_is_native = -1;
280
281 static FILE *dump_fp;
282 static char *dump_binary;
283
284 static unsigned int t_alloc_cache;
285 static unsigned int bit_alloc_cache;
286
287 #define RB_BATCH_DEFAULT        (512)
288 static unsigned int rb_batch = RB_BATCH_DEFAULT;
289
290 static int pipeline;
291 static char *pipename;
292
293 static int text_output = 1;
294
295 #define is_done()       (*(volatile int *)(&done))
296 static volatile int done;
297
298 struct timespec         abs_start_time;
299 static unsigned long long start_timestamp;
300
301 static int have_drv_data = 0;
302
303 #define JHASH_RANDOM    (0x3af5f2ee)
304
305 #define CPUS_PER_LONG   (8 * sizeof(unsigned long))
306 #define CPU_IDX(cpu)    ((cpu) / CPUS_PER_LONG)
307 #define CPU_BIT(cpu)    ((cpu) & (CPUS_PER_LONG - 1))
308
309 static void output_binary(void *buf, int len)
310 {
311         if (dump_binary) {
312                 size_t n = fwrite(buf, len, 1, dump_fp);
313                 if (n != 1) {
314                         perror(dump_binary);
315                         fclose(dump_fp);
316                         dump_binary = NULL;
317                 }
318         }
319 }
320
/*
 * Grow pdi->cpus so it can hold at least (cpu + 1) per-CPU slots.
 * Newly appended slots are zero-filled, and any slot not yet opened
 * (fd still 0 from the memset) is initialised to the "no file" state.
 * Exits the program on allocation failure.
 */
static void resize_cpu_info(struct per_dev_info *pdi, int cpu)
{
	struct per_cpu_info *cpus = pdi->cpus;
	int ncpus = pdi->ncpus;
	int new_count = cpu + 1;
	int new_space, size;
	char *new_start;

	size = new_count * sizeof(struct per_cpu_info);
	cpus = realloc(cpus, size);
	if (!cpus) {
		char name[20];
		fprintf(stderr, "Out of memory, CPU info for device %s (%d)\n",
			get_dev_name(pdi, name, sizeof(name)), size);
		exit(1);
	}

	/* zero only the region appended beyond the old ncpus entries */
	new_start = (char *)cpus + (ncpus * sizeof(struct per_cpu_info));
	new_space = (new_count - ncpus) * sizeof(struct per_cpu_info);
	memset(new_start, 0, new_space);

	pdi->ncpus = new_count;
	pdi->cpus = cpus;

	/* initialise every slot that has never had a file opened on it */
	for (new_count = 0; new_count < pdi->ncpus; new_count++) {
		struct per_cpu_info *pci = &pdi->cpus[new_count];

		if (!pci->fd) {
			pci->fd = -1;
			memset(&pci->rb_last, 0, sizeof(pci->rb_last));
			pci->rb_last_entries = 0;
			pci->last_sequence = -1;
		}
	}
}
356
357 static struct per_cpu_info *get_cpu_info(struct per_dev_info *pdi, int cpu)
358 {
359         struct per_cpu_info *pci;
360
361         if (cpu >= pdi->ncpus)
362                 resize_cpu_info(pdi, cpu);
363
364         pci = &pdi->cpus[cpu];
365         pci->cpu = cpu;
366         return pci;
367 }
368
369
370 static int resize_devices(char *name)
371 {
372         int size = (ndevices + 1) * sizeof(struct per_dev_info);
373
374         devices = realloc(devices, size);
375         if (!devices) {
376                 fprintf(stderr, "Out of memory, device %s (%d)\n", name, size);
377                 return 1;
378         }
379         memset(&devices[ndevices], 0, sizeof(struct per_dev_info));
380         devices[ndevices].name = name;
381         ndevices++;
382         return 0;
383 }
384
/*
 * Find the per-device slot for @dev, creating one on demand.
 *
 * Slots registered up-front by name have ->dev == 0 until the first
 * lookup; the first dev_t seen claims such a slot. Returns NULL only
 * if growing the device table fails.
 */
static struct per_dev_info *get_dev_info(dev_t dev)
{
	struct per_dev_info *pdi;
	int i;

	for (i = 0; i < ndevices; i++) {
		/* claim a name-only slot that has no dev_t assigned yet */
		if (!devices[i].dev)
			devices[i].dev = dev;
		if (devices[i].dev == dev)
			return &devices[i];
	}

	if (resize_devices(NULL))
		return NULL;

	pdi = &devices[ndevices - 1];
	pdi->dev = dev;
	pdi->first_reported_time = 0;
	pdi->last_read_time = 0;

	return pdi;
}
407
408 static void insert_skip(struct per_cpu_info *pci, unsigned long start,
409                         unsigned long end)
410 {
411         struct skip_info *sip;
412
413         for (sip = pci->skips_tail; sip != NULL; sip = sip->prev) {
414                 if (end == (sip->start - 1)) {
415                         sip->start = start;
416                         return;
417                 } else if (start == (sip->end + 1)) {
418                         sip->end = end;
419                         return;
420                 }
421         }
422
423         sip = malloc(sizeof(struct skip_info));
424         sip->start = start;
425         sip->end = end;
426         sip->prev = sip->next = NULL;
427         if (pci->skips_tail == NULL)
428                 pci->skips_head = pci->skips_tail = sip;
429         else {
430                 sip->prev = pci->skips_tail;
431                 pci->skips_tail->next = sip;
432                 pci->skips_tail = sip;
433         }
434 }
435
436 static void remove_sip(struct per_cpu_info *pci, struct skip_info *sip)
437 {
438         if (sip->prev == NULL) {
439                 if (sip->next == NULL)
440                         pci->skips_head = pci->skips_tail = NULL;
441                 else {
442                         pci->skips_head = sip->next;
443                         sip->next->prev = NULL;
444                 }
445         } else if (sip->next == NULL) {
446                 pci->skips_tail = sip->prev;
447                 sip->prev->next = NULL;
448         } else {
449                 sip->prev->next = sip->next;
450                 sip->next->prev = sip->prev;
451         }
452
453         sip->prev = sip->next = NULL;
454         free(sip);
455 }
456
457 #define IN_SKIP(sip,seq) (((sip)->start <= (seq)) && ((seq) <= sip->end))
458 static int check_current_skips(struct per_cpu_info *pci, unsigned long seq)
459 {
460         struct skip_info *sip;
461
462         for (sip = pci->skips_tail; sip != NULL; sip = sip->prev) {
463                 if (IN_SKIP(sip, seq)) {
464                         if (sip->start == seq) {
465                                 if (sip->end == seq)
466                                         remove_sip(pci, sip);
467                                 else
468                                         sip->start += 1;
469                         } else if (sip->end == seq)
470                                 sip->end -= 1;
471                         else {
472                                 sip->end = seq - 1;
473                                 insert_skip(pci, seq + 1, sip->end);
474                         }
475                         return 1;
476                 }
477         }
478
479         return 0;
480 }
481
482 static void collect_pdi_skips(struct per_dev_info *pdi)
483 {
484         struct skip_info *sip;
485         int cpu;
486
487         pdi->skips = 0;
488         pdi->seq_skips = 0;
489
490         for (cpu = 0; cpu < pdi->ncpus; cpu++) {
491                 struct per_cpu_info *pci = &pdi->cpus[cpu];
492
493                 for (sip = pci->skips_head; sip != NULL; sip = sip->next) {
494                         pdi->skips++;
495                         pdi->seq_skips += (sip->end - sip->start + 1);
496                         if (verbose)
497                                 fprintf(stderr,"(%d,%d): skipping %lu -> %lu\n",
498                                         MAJOR(pdi->dev), MINOR(pdi->dev),
499                                         sip->start, sip->end);
500                 }
501         }
502 }
503
504 static void cpu_mark_online(struct per_dev_info *pdi, unsigned int cpu)
505 {
506         if (cpu >= pdi->cpu_map_max || !pdi->cpu_map) {
507                 int new_max = (cpu + CPUS_PER_LONG) & ~(CPUS_PER_LONG - 1);
508                 unsigned long *map = malloc(new_max / sizeof(long));
509
510                 memset(map, 0, new_max / sizeof(long));
511
512                 if (pdi->cpu_map) {
513                         memcpy(map, pdi->cpu_map, pdi->cpu_map_max / sizeof(long));
514                         free(pdi->cpu_map);
515                 }
516
517                 pdi->cpu_map = map;
518                 pdi->cpu_map_max = new_max;
519         }
520
521         pdi->cpu_map[CPU_IDX(cpu)] |= (1UL << CPU_BIT(cpu));
522 }
523
/* clear the online bit for @cpu in the device's CPU bitmap */
static inline void cpu_mark_offline(struct per_dev_info *pdi, int cpu)
{
	pdi->cpu_map[CPU_IDX(cpu)] &= ~(1UL << CPU_BIT(cpu));
}
528
/* return non-zero if @cpu is marked online in the device's CPU bitmap */
static inline int cpu_is_online(struct per_dev_info *pdi, int cpu)
{
	return (pdi->cpu_map[CPU_IDX(cpu)] & (1UL << CPU_BIT(cpu))) != 0;
}
533
/* hash a pid into an index for the process_pid_map table */
static inline int ppm_hash_pid(pid_t pid)
{
	return jhash_1word(pid, JHASH_RANDOM) & PPM_HASH_MASK;
}
538
539 static struct process_pid_map *find_ppm(pid_t pid)
540 {
541         const int hash_idx = ppm_hash_pid(pid);
542         struct process_pid_map *ppm;
543
544         ppm = ppm_hash_table[hash_idx];
545         while (ppm) {
546                 if (ppm->pid == pid)
547                         return ppm;
548
549                 ppm = ppm->hash_next;
550         }
551
552         return NULL;
553 }
554
555 static struct process_pid_map *add_ppm_hash(pid_t pid, const char *name)
556 {
557         const int hash_idx = ppm_hash_pid(pid);
558         struct process_pid_map *ppm;
559
560         ppm = find_ppm(pid);
561         if (!ppm) {
562                 ppm = malloc(sizeof(*ppm));
563                 memset(ppm, 0, sizeof(*ppm));
564                 ppm->pid = pid;
565                 memset(ppm->comm, 0, sizeof(ppm->comm));
566                 strncpy(ppm->comm, name, sizeof(ppm->comm));
567                 ppm->comm[sizeof(ppm->comm) - 1] = '\0';
568                 ppm->hash_next = ppm_hash_table[hash_idx];
569                 ppm_hash_table[hash_idx] = ppm;
570         }
571
572         return ppm;
573 }
574
/*
 * Handle a notify trace (non-I/O event): process-name registration,
 * the absolute wall-clock timestamp anchor, or a free-form message.
 */
static void handle_notify(struct blk_io_trace *bit)
{
	void	*payload = (caddr_t) bit + sizeof(*bit);
	__u32	two32[2];

	switch (bit->action) {
	case BLK_TN_PROCESS:
		/* payload is the command name for bit->pid */
		add_ppm_hash(bit->pid, payload);
		break;

	case BLK_TN_TIMESTAMP:
		if (bit->pdu_len != sizeof(two32))
			return;
		memcpy(two32, payload, sizeof(two32));
		if (!data_is_native) {
			two32[0] = be32_to_cpu(two32[0]);
			two32[1] = be32_to_cpu(two32[1]);
		}
		/* pair trace time with wall clock taken at the same instant */
		start_timestamp = bit->time;
		abs_start_time.tv_sec  = two32[0];
		abs_start_time.tv_nsec = two32[1];
		/* normalise a negative nanosecond component */
		if (abs_start_time.tv_nsec < 0) {
			abs_start_time.tv_sec--;
			abs_start_time.tv_nsec += 1000000000;
		}

		break;

	case BLK_TN_MESSAGE:
		if (bit->pdu_len > 0) {
			/*
			 * NOTE(review): VLA sized by on-disk pdu_len —
			 * assumes the field was validated upstream.
			 */
			char msg[bit->pdu_len+1];

			memcpy(msg, (char *)payload, bit->pdu_len);
			msg[bit->pdu_len] = '\0';

			/* emit in the standard blkparse output format */
			fprintf(ofp,
				"%3d,%-3d %2d %8s %5d.%09lu %5u %2s %3s %s\n",
				MAJOR(bit->device), MINOR(bit->device),
				bit->cpu, "0", (int) SECONDS(bit->time),
				(unsigned long) NANO_SECONDS(bit->time),
				0, "m", "N", msg);
		}
		break;

	default:
		/* Ignore unknown notify events */
		;
	}
}
624
625 char *find_process_name(pid_t pid)
626 {
627         struct process_pid_map *ppm = find_ppm(pid);
628
629         if (ppm)
630                 return ppm->comm;
631
632         return NULL;
633 }
634
/* hash a pid into an index for the per-process-info table */
static inline int ppi_hash_pid(pid_t pid)
{
	return jhash_1word(pid, JHASH_RANDOM) & PPI_HASH_MASK;
}
639
/*
 * Hash a process name. Hashes a fixed 16 bytes — the size of
 * process_pid_map.comm — so @name must point at a buffer at least that
 * large (in practice always a ppm->comm array).
 */
static inline int ppi_hash_name(const char *name)
{
	return jhash(name, 16, JHASH_RANDOM) & PPI_HASH_MASK;
}
644
645 static inline int ppi_hash(struct per_process_info *ppi)
646 {
647         struct process_pid_map *ppm = ppi->ppm;
648
649         if (ppi_hash_by_pid)
650                 return ppi_hash_pid(ppm->pid);
651
652         return ppi_hash_name(ppm->comm);
653 }
654
/* push @ppi onto the front of its hash chain */
static inline void add_ppi_to_hash(struct per_process_info *ppi)
{
	const int hash_idx = ppi_hash(ppi);

	ppi->hash_next = ppi_hash_table[hash_idx];
	ppi_hash_table[hash_idx] = ppi;
}
662
/* push @ppi onto the global list used for final stats reporting */
static inline void add_ppi_to_list(struct per_process_info *ppi)
{
	ppi->list_next = ppi_list;
	ppi_list = ppi;
	ppi_list_entries++;
}
669
670 static struct per_process_info *find_ppi_by_name(char *name)
671 {
672         const int hash_idx = ppi_hash_name(name);
673         struct per_process_info *ppi;
674
675         ppi = ppi_hash_table[hash_idx];
676         while (ppi) {
677                 struct process_pid_map *ppm = ppi->ppm;
678
679                 if (!strcmp(ppm->comm, name))
680                         return ppi;
681
682                 ppi = ppi->hash_next;
683         }
684
685         return NULL;
686 }
687
688 static struct per_process_info *find_ppi_by_pid(pid_t pid)
689 {
690         const int hash_idx = ppi_hash_pid(pid);
691         struct per_process_info *ppi;
692
693         ppi = ppi_hash_table[hash_idx];
694         while (ppi) {
695                 struct process_pid_map *ppm = ppi->ppm;
696
697                 if (ppm->pid == pid)
698                         return ppi;
699
700                 ppi = ppi->hash_next;
701         }
702
703         return NULL;
704 }
705
/*
 * Find the per-process stats entry for @pid, honouring the global
 * hash-by-pid/hash-by-name mode. When hashing by name, an entry whose
 * recorded pid differs means several processes share that name, so
 * their stats are merged and flagged via more_than_one.
 */
static struct per_process_info *find_ppi(pid_t pid)
{
	struct per_process_info *ppi;
	char *name;

	if (ppi_hash_by_pid)
		return find_ppi_by_pid(pid);

	name = find_process_name(pid);
	if (!name)
		return NULL;

	ppi = find_ppi_by_name(name);
	if (ppi && ppi->ppm->pid != pid)
		ppi->more_than_one = 1;

	return ppi;
}
724
725 /*
726  * struct trace and blktrace allocation cache, we do potentially
727  * millions of mallocs for these structures while only using at most
728  * a few thousand at the time
729  */
730 static inline void t_free(struct trace *t)
731 {
732         if (t_alloc_cache < 1024) {
733                 t->next = t_alloc_list;
734                 t_alloc_list = t;
735                 t_alloc_cache++;
736         } else
737                 free(t);
738 }
739
740 static inline struct trace *t_alloc(void)
741 {
742         struct trace *t = t_alloc_list;
743
744         if (t) {
745                 t_alloc_list = t->next;
746                 t_alloc_cache--;
747                 return t;
748         }
749
750         return malloc(sizeof(*t));
751 }
752
/*
 * Return a blk_io_trace to the free cache. Only fixed-size entries
 * (no trailing pdu payload) are cached, and at most 1024 of them.
 */
static inline void bit_free(struct blk_io_trace *bit)
{
	if (bit_alloc_cache < 1024 && !bit->pdu_len) {
		/*
		 * abuse a 64-bit field for a next pointer for the free item
		 */
		bit->time = (__u64) (unsigned long) bit_alloc_list;
		bit_alloc_list = (struct blk_io_trace *) bit;
		bit_alloc_cache++;
	} else
		free(bit);
}
765
/*
 * Get a blk_io_trace, preferring the free cache over malloc. The cache
 * threads its next pointer through the (unused) ->time field — see
 * bit_free().
 */
static inline struct blk_io_trace *bit_alloc(void)
{
	struct blk_io_trace *bit = bit_alloc_list;

	if (bit) {
		bit_alloc_list = (struct blk_io_trace *) (unsigned long) \
				 bit->time;
		bit_alloc_cache--;
		return bit;
	}

	return malloc(sizeof(*bit));
}
779
/*
 * Drop a trace from its per-CPU "last seen" tree and recycle both the
 * trace wrapper and the raw event it carries.
 */
static inline void __put_trace_last(struct per_dev_info *pdi, struct trace *t)
{
	struct per_cpu_info *pci = get_cpu_info(pdi, t->bit->cpu);

	rb_erase(&t->rb_node, &pci->rb_last);
	pci->rb_last_entries--;

	bit_free(t->bit);
	t_free(t);
}
790
/*
 * Retire a trace from the global sort tree and park it in the per-CPU
 * "last seen" tree, where it is kept for sequence-gap lookups until
 * that tree overflows.
 */
static void put_trace(struct per_dev_info *pdi, struct trace *t)
{
	rb_erase(&t->rb_node, &rb_sort_root);
	rb_sort_entries--;

	trace_rb_insert_last(pdi, t);
}
798
/*
 * Insert @t into @root ordered by (time, device, sequence), in that
 * priority; full duplicates go to the right. Always succeeds and
 * returns 0 (callers test the return for symmetry with other inserts).
 */
static inline int trace_rb_insert(struct trace *t, struct rb_root *root)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct trace *__t;

	while (*p) {
		parent = *p;

		__t = rb_entry(parent, struct trace, rb_node);

		if (t->bit->time < __t->bit->time)
			p = &(*p)->rb_left;
		else if (t->bit->time > __t->bit->time)
			p = &(*p)->rb_right;
		else if (t->bit->device < __t->bit->device)
			p = &(*p)->rb_left;
		else if (t->bit->device > __t->bit->device)
			p = &(*p)->rb_right;
		else if (t->bit->sequence < __t->bit->sequence)
			p = &(*p)->rb_left;
		else	/* >= sequence */
			p = &(*p)->rb_right;
	}

	rb_link_node(&t->rb_node, parent, p);
	rb_insert_color(&t->rb_node, root);
	return 0;
}
828
829 static inline int trace_rb_insert_sort(struct trace *t)
830 {
831         if (!trace_rb_insert(t, &rb_sort_root)) {
832                 rb_sort_entries++;
833                 return 0;
834         }
835
836         return 1;
837 }
838
/*
 * Park @t in the per-CPU "last seen" tree. If that tree has grown past
 * rb_batch entries per input file, retire its oldest entry so memory
 * use stays bounded. Returns 0 on success, 1 if the insert failed.
 */
static int trace_rb_insert_last(struct per_dev_info *pdi, struct trace *t)
{
	struct per_cpu_info *pci = get_cpu_info(pdi, t->bit->cpu);

	if (trace_rb_insert(t, &pci->rb_last))
		return 1;

	pci->rb_last_entries++;

	if (pci->rb_last_entries > rb_batch * pdi->nfiles) {
		struct rb_node *n = rb_first(&pci->rb_last);

		t = rb_entry(n, struct trace, rb_node);
		__put_trace_last(pdi, t);
	}

	return 0;
}
857
/*
 * Look up a trace by (device, sequence) in @root. With @order set, also
 * scan a few entries past the structural descent point — see the
 * in-line comment for why the tree order can be slightly off.
 */
static struct trace *trace_rb_find(dev_t device, unsigned long sequence,
				   struct rb_root *root, int order)
{
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct trace *__t;

	while (n) {
		__t = rb_entry(n, struct trace, rb_node);
		prev = n;

		if (device < __t->bit->device)
			n = n->rb_left;
		else if (device > __t->bit->device)
			n = n->rb_right;
		else if (sequence < __t->bit->sequence)
			n = n->rb_left;
		else if (sequence > __t->bit->sequence)
			n = n->rb_right;
		else
			return __t;
	}

	/*
	 * hack - the list may not be sequence ordered because some
	 * events don't have sequence and time matched. so we end up
	 * being a little off in the rb lookup here, because we don't
	 * know the time we are looking for. compensate by browsing
	 * a little ahead from the last entry to find the match
	 */
	if (order && prev) {
		int max = 5;

		while (((n = rb_next(prev)) != NULL) && max--) {
			__t = rb_entry(n, struct trace, rb_node);

			if (__t->bit->device == device &&
			    __t->bit->sequence == sequence)
				return __t;

			prev = n;
		}
	}

	return NULL;
}
904
/* exact-match lookup of @seq in the per-CPU "last seen" tree */
static inline struct trace *trace_rb_find_last(struct per_dev_info *pdi,
					       struct per_cpu_info *pci,
					       unsigned long seq)
{
	return trace_rb_find(pdi->dev, seq, &pci->rb_last, 0);
}
911
/*
 * Insert @iot into the per-device io tracking tree, keyed by start
 * sector. Returns 0 on success; returns 1 without inserting if the
 * sector is already tracked (an alias, reported on stderr).
 */
static inline int track_rb_insert(struct per_dev_info *pdi,struct io_track *iot)
{
	struct rb_node **p = &pdi->rb_track.rb_node;
	struct rb_node *parent = NULL;
	struct io_track *__iot;

	while (*p) {
		parent = *p;
		__iot = rb_entry(parent, struct io_track, rb_node);

		if (iot->sector < __iot->sector)
			p = &(*p)->rb_left;
		else if (iot->sector > __iot->sector)
			p = &(*p)->rb_right;
		else {
			fprintf(stderr,
				"sector alias (%Lu) on device %d,%d!\n",
				(unsigned long long) iot->sector,
				MAJOR(pdi->dev), MINOR(pdi->dev));
			return 1;
		}
	}

	rb_link_node(&iot->rb_node, parent, p);
	rb_insert_color(&iot->rb_node, &pdi->rb_track);
	return 0;
}
939
940 static struct io_track *__find_track(struct per_dev_info *pdi, __u64 sector)
941 {
942         struct rb_node *n = pdi->rb_track.rb_node;
943         struct io_track *__iot;
944
945         while (n) {
946                 __iot = rb_entry(n, struct io_track, rb_node);
947
948                 if (sector < __iot->sector)
949                         n = n->rb_left;
950                 else if (sector > __iot->sector)
951                         n = n->rb_right;
952                 else
953                         return __iot;
954         }
955
956         return NULL;
957 }
958
959 static struct io_track *find_track(struct per_dev_info *pdi, pid_t pid,
960                                    __u64 sector)
961 {
962         struct io_track *iot;
963
964         iot = __find_track(pdi, sector);
965         if (!iot) {
966                 iot = malloc(sizeof(*iot));
967                 iot->ppm = find_ppm(pid);
968                 if (!iot->ppm)
969                         iot->ppm = add_ppm_hash(pid, "unknown");
970                 iot->sector = sector;
971                 track_rb_insert(pdi, iot);
972         }
973
974         return iot;
975 }
976
/*
 * A front merge extends a tracked request downwards: the io previously
 * tracked at (t->sector + size) now starts at t->sector. Re-key its
 * io_track accordingly (erase and re-insert, since the sector is the
 * tree key).
 */
static void log_track_frontmerge(struct per_dev_info *pdi,
				 struct blk_io_trace *t)
{
	struct io_track *iot;

	if (!track_ios)
		return;

	iot = __find_track(pdi, t->sector + t_sec(t));
	if (!iot) {
		if (verbose)
			fprintf(stderr, "merge not found for (%d,%d): %llu\n",
				MAJOR(pdi->dev), MINOR(pdi->dev),
				(unsigned long long) t->sector + t_sec(t));
		return;
	}

	rb_erase(&iot->rb_node, &pdi->rb_track);
	iot->sector -= t_sec(t);
	track_rb_insert(pdi, iot);
}
998
999 static void log_track_getrq(struct per_dev_info *pdi, struct blk_io_trace *t)
1000 {
1001         struct io_track *iot;
1002
1003         if (!track_ios)
1004                 return;
1005
1006         iot = find_track(pdi, t->pid, t->sector);
1007         iot->allocation_time = t->time;
1008 }
1009
1010 static inline int is_remapper(struct per_dev_info *pdi)
1011 {
1012         int major = MAJOR(pdi->dev);
1013
1014         return (major == 253 || major == 9);
1015 }
1016
1017 /*
1018  * for md/dm setups, the interesting cycle is Q -> C. So track queueing
1019  * time here, as dispatch time
1020  */
1021 static void log_track_queue(struct per_dev_info *pdi, struct blk_io_trace *t)
1022 {
1023         struct io_track *iot;
1024
1025         if (!track_ios)
1026                 return;
1027         if (!is_remapper(pdi))
1028                 return;
1029
1030         iot = find_track(pdi, t->pid, t->sector);
1031         iot->dispatch_time = t->time;
1032 }
1033
1034 /*
1035  * return time between rq allocation and insertion
1036  */
1037 static unsigned long long log_track_insert(struct per_dev_info *pdi,
1038                                            struct blk_io_trace *t)
1039 {
1040         unsigned long long elapsed;
1041         struct io_track *iot;
1042
1043         if (!track_ios)
1044                 return -1;
1045
1046         iot = find_track(pdi, t->pid, t->sector);
1047         iot->queue_time = t->time;
1048
1049         if (!iot->allocation_time)
1050                 return -1;
1051
1052         elapsed = iot->queue_time - iot->allocation_time;
1053
1054         if (per_process_stats) {
1055                 struct per_process_info *ppi = find_ppi(iot->ppm->pid);
1056                 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
1057
1058                 if (ppi && elapsed > ppi->longest_allocation_wait[w])
1059                         ppi->longest_allocation_wait[w] = elapsed;
1060         }
1061
1062         return elapsed;
1063 }
1064
1065 /*
1066  * return time between queue and issue
1067  */
1068 static unsigned long long log_track_issue(struct per_dev_info *pdi,
1069                                           struct blk_io_trace *t)
1070 {
1071         unsigned long long elapsed;
1072         struct io_track *iot;
1073
1074         if (!track_ios)
1075                 return -1;
1076         if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
1077                 return -1;
1078
1079         iot = __find_track(pdi, t->sector);
1080         if (!iot) {
1081                 if (verbose)
1082                         fprintf(stderr, "issue not found for (%d,%d): %llu\n",
1083                                 MAJOR(pdi->dev), MINOR(pdi->dev),
1084                                 (unsigned long long) t->sector);
1085                 return -1;
1086         }
1087
1088         iot->dispatch_time = t->time;
1089         elapsed = iot->dispatch_time - iot->queue_time;
1090
1091         if (per_process_stats) {
1092                 struct per_process_info *ppi = find_ppi(iot->ppm->pid);
1093                 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
1094
1095                 if (ppi && elapsed > ppi->longest_dispatch_wait[w])
1096                         ppi->longest_dispatch_wait[w] = elapsed;
1097         }
1098
1099         return elapsed;
1100 }
1101
1102 /*
1103  * return time between dispatch and complete
1104  */
1105 static unsigned long long log_track_complete(struct per_dev_info *pdi,
1106                                              struct blk_io_trace *t)
1107 {
1108         unsigned long long elapsed;
1109         struct io_track *iot;
1110
1111         if (!track_ios)
1112                 return -1;
1113
1114         iot = __find_track(pdi, t->sector);
1115         if (!iot) {
1116                 if (verbose)
1117                         fprintf(stderr,"complete not found for (%d,%d): %llu\n",
1118                                 MAJOR(pdi->dev), MINOR(pdi->dev),
1119                                 (unsigned long long) t->sector);
1120                 return -1;
1121         }
1122
1123         iot->completion_time = t->time;
1124         elapsed = iot->completion_time - iot->dispatch_time;
1125
1126         if (per_process_stats) {
1127                 struct per_process_info *ppi = find_ppi(iot->ppm->pid);
1128                 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
1129
1130                 if (ppi && elapsed > ppi->longest_completion_wait[w])
1131                         ppi->longest_completion_wait[w] = elapsed;
1132         }
1133
1134         /*
1135          * kill the trace, we don't need it after completion
1136          */
1137         rb_erase(&iot->rb_node, &pdi->rb_track);
1138         free(iot);
1139
1140         return elapsed;
1141 }
1142
1143
1144 static struct io_stats *find_process_io_stats(pid_t pid)
1145 {
1146         struct per_process_info *ppi = find_ppi(pid);
1147
1148         if (!ppi) {
1149                 ppi = malloc(sizeof(*ppi));
1150                 memset(ppi, 0, sizeof(*ppi));
1151                 ppi->ppm = find_ppm(pid);
1152                 if (!ppi->ppm)
1153                         ppi->ppm = add_ppm_hash(pid, "unknown");
1154                 add_ppi_to_hash(ppi);
1155                 add_ppi_to_list(ppi);
1156         }
1157
1158         return &ppi->io_stats;
1159 }
1160
1161 static char *get_dev_name(struct per_dev_info *pdi, char *buffer, int size)
1162 {
1163         if (pdi->name)
1164                 snprintf(buffer, size, "%s", pdi->name);
1165         else
1166                 snprintf(buffer, size, "%d,%d",MAJOR(pdi->dev),MINOR(pdi->dev));
1167         return buffer;
1168 }
1169
1170 static void check_time(struct per_dev_info *pdi, struct blk_io_trace *bit)
1171 {
1172         unsigned long long this = bit->time;
1173         unsigned long long last = pdi->last_reported_time;
1174
1175         pdi->backwards = (this < last) ? 'B' : ' ';
1176         pdi->last_reported_time = this;
1177 }
1178
1179 static inline void __account_m(struct io_stats *ios, struct blk_io_trace *t,
1180                                int rw)
1181 {
1182         if (rw) {
1183                 ios->mwrites++;
1184                 ios->mwrite_kb += t_kb(t);
1185         } else {
1186                 ios->mreads++;
1187                 ios->mread_kb += t_kb(t);
1188         }
1189 }
1190
1191 static inline void account_m(struct blk_io_trace *t, struct per_cpu_info *pci,
1192                              int rw)
1193 {
1194         __account_m(&pci->io_stats, t, rw);
1195
1196         if (per_process_stats) {
1197                 struct io_stats *ios = find_process_io_stats(t->pid);
1198
1199                 __account_m(ios, t, rw);
1200         }
1201 }
1202
1203 static inline void __account_pc_queue(struct io_stats *ios,
1204                                       struct blk_io_trace *t, int rw)
1205 {
1206         if (rw) {
1207                 ios->qwrites_pc++;
1208                 ios->qwrite_kb_pc += t_kb(t);
1209         } else {
1210                 ios->qreads_pc++;
1211                 ios->qread_kb += t_kb(t);
1212         }
1213 }
1214
1215 static inline void account_pc_queue(struct blk_io_trace *t,
1216                                     struct per_cpu_info *pci, int rw)
1217 {
1218         __account_pc_queue(&pci->io_stats, t, rw);
1219
1220         if (per_process_stats) {
1221                 struct io_stats *ios = find_process_io_stats(t->pid);
1222
1223                 __account_pc_queue(ios, t, rw);
1224         }
1225 }
1226
1227 static inline void __account_pc_issue(struct io_stats *ios, int rw,
1228                                       unsigned int bytes)
1229 {
1230         if (rw) {
1231                 ios->iwrites_pc++;
1232                 ios->iwrite_kb_pc += bytes >> 10;
1233         } else {
1234                 ios->ireads_pc++;
1235                 ios->iread_kb_pc += bytes >> 10;
1236         }
1237 }
1238
1239 static inline void account_pc_issue(struct blk_io_trace *t,
1240                                     struct per_cpu_info *pci, int rw)
1241 {
1242         __account_pc_issue(&pci->io_stats, rw, t->bytes);
1243
1244         if (per_process_stats) {
1245                 struct io_stats *ios = find_process_io_stats(t->pid);
1246
1247                 __account_pc_issue(ios, rw, t->bytes);
1248         }
1249 }
1250
1251 static inline void __account_pc_requeue(struct io_stats *ios,
1252                                         struct blk_io_trace *t, int rw)
1253 {
1254         if (rw) {
1255                 ios->wrqueue_pc++;
1256                 ios->iwrite_kb_pc -= t_kb(t);
1257         } else {
1258                 ios->rrqueue_pc++;
1259                 ios->iread_kb_pc -= t_kb(t);
1260         }
1261 }
1262
1263 static inline void account_pc_requeue(struct blk_io_trace *t,
1264                                       struct per_cpu_info *pci, int rw)
1265 {
1266         __account_pc_requeue(&pci->io_stats, t, rw);
1267
1268         if (per_process_stats) {
1269                 struct io_stats *ios = find_process_io_stats(t->pid);
1270
1271                 __account_pc_requeue(ios, t, rw);
1272         }
1273 }
1274
1275 static inline void __account_pc_c(struct io_stats *ios, int rw)
1276 {
1277         if (rw)
1278                 ios->cwrites_pc++;
1279         else
1280                 ios->creads_pc++;
1281 }
1282
1283 static inline void account_pc_c(struct blk_io_trace *t,
1284                                 struct per_cpu_info *pci, int rw)
1285 {
1286         __account_pc_c(&pci->io_stats, rw);
1287
1288         if (per_process_stats) {
1289                 struct io_stats *ios = find_process_io_stats(t->pid);
1290
1291                 __account_pc_c(ios, rw);
1292         }
1293 }
1294
1295 static inline void __account_queue(struct io_stats *ios, struct blk_io_trace *t,
1296                                    int rw)
1297 {
1298         if (rw) {
1299                 ios->qwrites++;
1300                 ios->qwrite_kb += t_kb(t);
1301         } else {
1302                 ios->qreads++;
1303                 ios->qread_kb += t_kb(t);
1304         }
1305 }
1306
1307 static inline void account_queue(struct blk_io_trace *t,
1308                                  struct per_cpu_info *pci, int rw)
1309 {
1310         __account_queue(&pci->io_stats, t, rw);
1311
1312         if (per_process_stats) {
1313                 struct io_stats *ios = find_process_io_stats(t->pid);
1314
1315                 __account_queue(ios, t, rw);
1316         }
1317 }
1318
1319 static inline void __account_c(struct io_stats *ios, int rw, int bytes)
1320 {
1321         if (rw) {
1322                 ios->cwrites++;
1323                 ios->cwrite_kb += bytes >> 10;
1324         } else {
1325                 ios->creads++;
1326                 ios->cread_kb += bytes >> 10;
1327         }
1328 }
1329
1330 static inline void account_c(struct blk_io_trace *t, struct per_cpu_info *pci,
1331                              int rw, int bytes)
1332 {
1333         __account_c(&pci->io_stats, rw, bytes);
1334
1335         if (per_process_stats) {
1336                 struct io_stats *ios = find_process_io_stats(t->pid);
1337
1338                 __account_c(ios, rw, bytes);
1339         }
1340 }
1341
1342 static inline void __account_issue(struct io_stats *ios, int rw,
1343                                    unsigned int bytes)
1344 {
1345         if (rw) {
1346                 ios->iwrites++;
1347                 ios->iwrite_kb += bytes >> 10;
1348         } else {
1349                 ios->ireads++;
1350                 ios->iread_kb += bytes >> 10;
1351         }
1352 }
1353
1354 static inline void account_issue(struct blk_io_trace *t,
1355                                  struct per_cpu_info *pci, int rw)
1356 {
1357         __account_issue(&pci->io_stats, rw, t->bytes);
1358
1359         if (per_process_stats) {
1360                 struct io_stats *ios = find_process_io_stats(t->pid);
1361
1362                 __account_issue(ios, rw, t->bytes);
1363         }
1364 }
1365
1366 static inline void __account_unplug(struct io_stats *ios, int timer)
1367 {
1368         if (timer)
1369                 ios->timer_unplugs++;
1370         else
1371                 ios->io_unplugs++;
1372 }
1373
1374 static inline void account_unplug(struct blk_io_trace *t,
1375                                   struct per_cpu_info *pci, int timer)
1376 {
1377         __account_unplug(&pci->io_stats, timer);
1378
1379         if (per_process_stats) {
1380                 struct io_stats *ios = find_process_io_stats(t->pid);
1381
1382                 __account_unplug(ios, timer);
1383         }
1384 }
1385
1386 static inline void __account_requeue(struct io_stats *ios,
1387                                      struct blk_io_trace *t, int rw)
1388 {
1389         if (rw) {
1390                 ios->wrqueue++;
1391                 ios->iwrite_kb -= t_kb(t);
1392         } else {
1393                 ios->rrqueue++;
1394                 ios->iread_kb -= t_kb(t);
1395         }
1396 }
1397
1398 static inline void account_requeue(struct blk_io_trace *t,
1399                                    struct per_cpu_info *pci, int rw)
1400 {
1401         __account_requeue(&pci->io_stats, t, rw);
1402
1403         if (per_process_stats) {
1404                 struct io_stats *ios = find_process_io_stats(t->pid);
1405
1406                 __account_requeue(ios, t, rw);
1407         }
1408 }
1409
/* Emit a completion (C) event, with the dispatch->complete latency. */
static void log_complete(struct per_dev_info *pdi, struct per_cpu_info *pci,
			 struct blk_io_trace *t, char *act)
{
	process_fmt(act, pci, t, log_track_complete(pdi, t), 0, NULL);
}
1415
/* Emit an insert (I) event, with the allocation->insert latency. */
static void log_insert(struct per_dev_info *pdi, struct per_cpu_info *pci,
		       struct blk_io_trace *t, char *act)
{
	process_fmt(act, pci, t, log_track_insert(pdi, t), 0, NULL);
}
1421
/* Emit a queue-type event; -1 means no latency value to show. */
static void log_queue(struct per_cpu_info *pci, struct blk_io_trace *t,
		      char *act)
{
	process_fmt(act, pci, t, -1, 0, NULL);
}
1427
/* Emit an issue (D) event, with the queue->issue latency. */
static void log_issue(struct per_dev_info *pdi, struct per_cpu_info *pci,
		      struct blk_io_trace *t, char *act)
{
	process_fmt(act, pci, t, log_track_issue(pdi, t), 0, NULL);
}
1433
/* Emit a merge event; a front merge ("F") also re-keys the tracked I/O. */
static void log_merge(struct per_dev_info *pdi, struct per_cpu_info *pci,
		      struct blk_io_trace *t, char *act)
{
	if (act[0] == 'F')
		log_track_frontmerge(pdi, t);

	process_fmt(act, pci, t, -1ULL, 0, NULL);
}
1442
/* Emit a plain action event with no latency or payload. */
static void log_action(struct per_cpu_info *pci, struct blk_io_trace *t,
			char *act)
{
	process_fmt(act, pci, t, -1ULL, 0, NULL);
}
1448
/* Emit a generic event with no latency or payload. */
static void log_generic(struct per_cpu_info *pci, struct blk_io_trace *t,
			char *act)
{
	process_fmt(act, pci, t, -1ULL, 0, NULL);
}
1454
/* Emit an unplug (U/UT) event with no latency or payload. */
static void log_unplug(struct per_cpu_info *pci, struct blk_io_trace *t,
		      char *act)
{
	process_fmt(act, pci, t, -1ULL, 0, NULL);
}
1460
/* Emit a split (X) event with no latency or payload. */
static void log_split(struct per_cpu_info *pci, struct blk_io_trace *t,
		      char *act)
{
	process_fmt(act, pci, t, -1ULL, 0, NULL);
}
1466
/* Emit a pc (passthrough) event, passing the PDU that follows the trace. */
static void log_pc(struct per_cpu_info *pci, struct blk_io_trace *t, char *act)
{
	/* the command PDU is laid out directly after the trace header */
	unsigned char *buf = (unsigned char *) t + sizeof(*t);

	process_fmt(act, pci, t, -1ULL, t->pdu_len, buf);
}
1473
/*
 * Decode and print one pc (passthrough/SCSI command) trace event, and
 * update the per-CPU/per-process pc accounting plus the device's queue
 * depth tracking.
 */
static void dump_trace_pc(struct blk_io_trace *t, struct per_dev_info *pdi,
                          struct per_cpu_info *pci)
{
	/* w: write direction flag; act: low 16 bits carry the action code */
	int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
	int act = t->action & 0xffff;

	switch (act) {
		case __BLK_TA_QUEUE:
			log_generic(pci, t, "Q");
			account_pc_queue(t, pci, w);
			break;
		case __BLK_TA_GETRQ:
			log_generic(pci, t, "G");
			break;
		case __BLK_TA_SLEEPRQ:
			log_generic(pci, t, "S");
			break;
		case __BLK_TA_REQUEUE:
			/*
			 * can happen if we miss traces, don't let it go
			 * below zero
			 */
			if (pdi->cur_depth[w])
				pdi->cur_depth[w]--;
			account_pc_requeue(t, pci, w);
			log_generic(pci, t, "R");
			break;
		case __BLK_TA_ISSUE:
			account_pc_issue(t, pci, w);
			/* one more request in flight; remember the high-water mark */
			pdi->cur_depth[w]++;
			if (pdi->cur_depth[w] > pdi->max_depth[w])
				pdi->max_depth[w] = pdi->cur_depth[w];
			log_pc(pci, t, "D");
			break;
		case __BLK_TA_COMPLETE:
			if (pdi->cur_depth[w])
				pdi->cur_depth[w]--;
			log_pc(pci, t, "C");
			account_pc_c(t, pci, w);
			break;
		case __BLK_TA_INSERT:
			log_pc(pci, t, "I");
			break;
		default:
			fprintf(stderr, "Bad pc action %x\n", act);
			break;
	}
}
1522
/*
 * Decode and print one fs (regular block I/O) trace event, updating
 * the accounting counters, I/O tracking and queue depth bookkeeping.
 */
static void dump_trace_fs(struct blk_io_trace *t, struct per_dev_info *pdi,
                          struct per_cpu_info *pci)
{
	/* w: write direction flag; act: low 16 bits carry the action code */
	int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
	int act = t->action & 0xffff;

	switch (act) {
		case __BLK_TA_QUEUE:
			log_track_queue(pdi, t);
			account_queue(t, pci, w);
			log_queue(pci, t, "Q");
			break;
		case __BLK_TA_INSERT:
			log_insert(pdi, pci, t, "I");
			break;
		case __BLK_TA_BACKMERGE:
			account_m(t, pci, w);
			log_merge(pdi, pci, t, "M");
			break;
		case __BLK_TA_FRONTMERGE:
			account_m(t, pci, w);
			log_merge(pdi, pci, t, "F");
			break;
		case __BLK_TA_GETRQ:
			log_track_getrq(pdi, t);
			log_generic(pci, t, "G");
			break;
		case __BLK_TA_SLEEPRQ:
			log_generic(pci, t, "S");
			break;
		case __BLK_TA_REQUEUE:
			/*
			 * can happen if we miss traces, don't let it go
			 * below zero
			 */
			if (pdi->cur_depth[w])
				pdi->cur_depth[w]--;
			account_requeue(t, pci, w);
			log_queue(pci, t, "R");
			break;
		case __BLK_TA_ISSUE:
			account_issue(t, pci, w);
			/* one more request in flight; remember the high-water mark */
			pdi->cur_depth[w]++;
			if (pdi->cur_depth[w] > pdi->max_depth[w])
				pdi->max_depth[w] = pdi->cur_depth[w];
			log_issue(pdi, pci, t, "D");
			break;
		case __BLK_TA_COMPLETE:
			if (pdi->cur_depth[w])
				pdi->cur_depth[w]--;
			account_c(t, pci, w, t->bytes);
			log_complete(pdi, pci, t, "C");
			break;
		case __BLK_TA_PLUG:
			log_action(pci, t, "P");
			break;
		case __BLK_TA_UNPLUG_IO:
			account_unplug(t, pci, 0);
			log_unplug(pci, t, "U");
			break;
		case __BLK_TA_UNPLUG_TIMER:
			account_unplug(t, pci, 1);
			log_unplug(pci, t, "UT");
			break;
		case __BLK_TA_SPLIT:
			log_split(pci, t, "X");
			break;
		case __BLK_TA_BOUNCE:
			log_generic(pci, t, "B");
			break;
		case __BLK_TA_REMAP:
			log_generic(pci, t, "A");
			break;
		case __BLK_TA_DRV_DATA:
			have_drv_data = 1;
			/* dump to binary file only */
			break;
		default:
			/* NOTE(review): prints the full t->action here while
			 * dump_trace_pc prints the masked act — confirm whether
			 * the unmasked value is intentional */
			fprintf(stderr, "Bad fs action %x\n", t->action);
			break;
	}
}
1605
/*
 * Main per-event dispatch: text-format the event (notify, pc or fs
 * flavour), update the device's event bookkeeping, and optionally copy
 * the raw event to the binary output file.
 */
static void dump_trace(struct blk_io_trace *t, struct per_cpu_info *pci,
                       struct per_dev_info *pdi)
{
	if (text_output) {
		if (t->action == BLK_TN_MESSAGE)
			handle_notify(t);
		else if (t->action & BLK_TC_ACT(BLK_TC_PC))
			dump_trace_pc(t, pdi, pci);
		else
			dump_trace_fs(t, pdi, pci);
	}

	/* first event for this device defines its reporting epoch */
	if (!pdi->events)
		pdi->first_reported_time = t->time;

	pdi->events++;

	/* notify messages are skipped in binary output unless requested */
	if (bin_output_msgs ||
			    !(t->action & BLK_TC_ACT(BLK_TC_NOTIFY) &&
			      t->action == BLK_TN_MESSAGE))
		output_binary(t, sizeof(*t) + t->pdu_len);
}
1628
1629 /*
1630  * print in a proper way, not too small and not too big. if more than
1631  * 1000,000K, turn into M and so on
1632  */
1633 static char *size_cnv(char *dst, unsigned long long num, int in_kb)
1634 {
1635         char suff[] = { '\0', 'K', 'M', 'G', 'P' };
1636         unsigned int i = 0;
1637
1638         if (in_kb)
1639                 i++;
1640
1641         while (num > 1000 * 1000ULL && (i < sizeof(suff) - 1)) {
1642                 i++;
1643                 num /= 1000;
1644         }
1645
1646         sprintf(dst, "%'8Lu%c", num, suff[i]);
1647         return dst;
1648 }
1649
/*
 * Print one io_stats summary under the given heading @msg: queued,
 * dispatched, requeued, completed and merged totals (reads left
 * column, writes right), the pc-request block when any pc activity was
 * seen, and the unplug counts. @pdi may be NULL (e.g. for per-process
 * or aggregate totals), in which case the depth lines are omitted.
 */
static void dump_io_stats(struct per_dev_info *pdi, struct io_stats *ios,
			  char *msg)
{
	/* static scratch buffers for size_cnv() formatting; blkparse is
	 * single-threaded so this is safe */
	static char x[256], y[256];

	fprintf(ofp, "%s\n", msg);

	fprintf(ofp, " Reads Queued:    %s, %siB\t", size_cnv(x, ios->qreads, 0), size_cnv(y, ios->qread_kb, 1));
	fprintf(ofp, " Writes Queued:    %s, %siB\n", size_cnv(x, ios->qwrites, 0), size_cnv(y, ios->qwrite_kb, 1));
	fprintf(ofp, " Read Dispatches: %s, %siB\t", size_cnv(x, ios->ireads, 0), size_cnv(y, ios->iread_kb, 1));
	fprintf(ofp, " Write Dispatches: %s, %siB\n", size_cnv(x, ios->iwrites, 0), size_cnv(y, ios->iwrite_kb, 1));
	fprintf(ofp, " Reads Requeued:  %s\t\t", size_cnv(x, ios->rrqueue, 0));
	fprintf(ofp, " Writes Requeued:  %s\n", size_cnv(x, ios->wrqueue, 0));
	fprintf(ofp, " Reads Completed: %s, %siB\t", size_cnv(x, ios->creads, 0), size_cnv(y, ios->cread_kb, 1));
	fprintf(ofp, " Writes Completed: %s, %siB\n", size_cnv(x, ios->cwrites, 0), size_cnv(y, ios->cwrite_kb, 1));
	fprintf(ofp, " Read Merges:     %s, %siB\t", size_cnv(x, ios->mreads, 0), size_cnv(y, ios->mread_kb, 1));
	fprintf(ofp, " Write Merges:     %s, %siB\n", size_cnv(x, ios->mwrites, 0), size_cnv(y, ios->mwrite_kb, 1));
	if (pdi) {
		fprintf(ofp, " Read depth:      %'8u%8c\t", pdi->max_depth[0], ' ');
		fprintf(ofp, " Write depth:      %'8u\n", pdi->max_depth[1]);
	}
	/* only show the pc (passthrough) block if there was any pc activity */
	if (ios->qreads_pc || ios->qwrites_pc || ios->ireads_pc || ios->iwrites_pc ||
	    ios->rrqueue_pc || ios->wrqueue_pc || ios->creads_pc || ios->cwrites_pc) {
		fprintf(ofp, " PC Reads Queued: %s, %siB\t", size_cnv(x, ios->qreads_pc, 0), size_cnv(y, ios->qread_kb_pc, 1));
		fprintf(ofp, " PC Writes Queued: %s, %siB\n", size_cnv(x, ios->qwrites_pc, 0), size_cnv(y, ios->qwrite_kb_pc, 1));
		fprintf(ofp, " PC Read Disp.:   %s, %siB\t", size_cnv(x, ios->ireads_pc, 0), size_cnv(y, ios->iread_kb_pc, 1));
		fprintf(ofp, " PC Write Disp.:   %s, %siB\n", size_cnv(x, ios->iwrites_pc, 0), size_cnv(y, ios->iwrite_kb_pc, 1));
		fprintf(ofp, " PC Reads Req.:   %s\t\t", size_cnv(x, ios->rrqueue_pc, 0));
		fprintf(ofp, " PC Writes Req.:   %s\n", size_cnv(x, ios->wrqueue_pc, 0));
		fprintf(ofp, " PC Reads Compl.: %s\t\t", size_cnv(x, ios->creads_pc, 0));
		fprintf(ofp, " PC Writes Compl.: %s\n", size_cnv(x, ios->cwrites_pc, 0));
	}
	fprintf(ofp, " IO unplugs:      %'8lu%8c\t", ios->io_unplugs, ' ');
	fprintf(ofp, " Timer unplugs:    %'8lu\n", ios->timer_unplugs);
}
1685
1686 static void dump_wait_stats(struct per_process_info *ppi)
1687 {
1688         unsigned long rawait = ppi->longest_allocation_wait[0] / 1000;
1689         unsigned long rdwait = ppi->longest_dispatch_wait[0] / 1000;
1690         unsigned long rcwait = ppi->longest_completion_wait[0] / 1000;
1691         unsigned long wawait = ppi->longest_allocation_wait[1] / 1000;
1692         unsigned long wdwait = ppi->longest_dispatch_wait[1] / 1000;
1693         unsigned long wcwait = ppi->longest_completion_wait[1] / 1000;
1694
1695         fprintf(ofp, " Allocation wait: %'8lu%8c\t", rawait, ' ');
1696         fprintf(ofp, " Allocation wait:  %'8lu\n", wawait);
1697         fprintf(ofp, " Dispatch wait:   %'8lu%8c\t", rdwait, ' ');
1698         fprintf(ofp, " Dispatch wait:    %'8lu\n", wdwait);
1699         fprintf(ofp, " Completion wait: %'8lu%8c\t", rcwait, ' ');
1700         fprintf(ofp, " Completion wait:  %'8lu\n", wcwait);
1701 }
1702
1703 static int ppi_name_compare(const void *p1, const void *p2)
1704 {
1705         struct per_process_info *ppi1 = *((struct per_process_info **) p1);
1706         struct per_process_info *ppi2 = *((struct per_process_info **) p2);
1707         int res;
1708
1709         res = strverscmp(ppi1->ppm->comm, ppi2->ppm->comm);
1710         if (!res)
1711                 res = ppi1->ppm->pid > ppi2->ppm->pid;
1712
1713         return res;
1714 }
1715
1716 static void sort_process_list(void)
1717 {
1718         struct per_process_info **ppis;
1719         struct per_process_info *ppi;
1720         int i = 0;
1721
1722         ppis = malloc(ppi_list_entries * sizeof(struct per_process_info *));
1723
1724         ppi = ppi_list;
1725         while (ppi) {
1726                 ppis[i++] = ppi;
1727                 ppi = ppi->list_next;
1728         }
1729
1730         qsort(ppis, ppi_list_entries, sizeof(ppi), ppi_name_compare);
1731
1732         i = ppi_list_entries - 1;
1733         ppi_list = NULL;
1734         while (i >= 0) {
1735                 ppi = ppis[i];
1736
1737                 ppi->list_next = ppi_list;
1738                 ppi_list = ppi;
1739                 i--;
1740         }
1741
1742         free(ppis);
1743 }
1744
1745 static void show_process_stats(void)
1746 {
1747         struct per_process_info *ppi;
1748
1749         sort_process_list();
1750
1751         ppi = ppi_list;
1752         while (ppi) {
1753                 struct process_pid_map *ppm = ppi->ppm;
1754                 char name[64];
1755
1756                 if (ppi->more_than_one)
1757                         sprintf(name, "%s (%u, ...)", ppm->comm, ppm->pid);
1758                 else
1759                         sprintf(name, "%s (%u)", ppm->comm, ppm->pid);
1760
1761                 dump_io_stats(NULL, &ppi->io_stats, name);
1762                 dump_wait_stats(ppi);
1763                 ppi = ppi->list_next;
1764         }
1765
1766         fprintf(ofp, "\n");
1767 }
1768
1769 static void show_device_and_cpu_stats(void)
1770 {
1771         struct per_dev_info *pdi;
1772         struct per_cpu_info *pci;
1773         struct io_stats total, *ios;
1774         unsigned long long rrate, wrate, msec;
1775         int i, j, pci_events;
1776         char line[3 + 8/*cpu*/ + 2 + 32/*dev*/ + 3];
1777         char name[32];
1778         double ratio;
1779
1780         for (pdi = devices, i = 0; i < ndevices; i++, pdi++) {
1781
1782                 memset(&total, 0, sizeof(total));
1783                 pci_events = 0;
1784
1785                 if (i > 0)
1786                         fprintf(ofp, "\n");
1787
1788                 for (pci = pdi->cpus, j = 0; j < pdi->ncpus; j++, pci++) {
1789                         if (!pci->nelems)
1790                                 continue;
1791
1792                         ios = &pci->io_stats;
1793                         total.qreads += ios->qreads;
1794                         total.qwrites += ios->qwrites;
1795                         total.creads += ios->creads;
1796                         total.cwrites += ios->cwrites;
1797                         total.mreads += ios->mreads;
1798                         total.mwrites += ios->mwrites;
1799                         total.ireads += ios->ireads;
1800                         total.iwrites += ios->iwrites;
1801                         total.rrqueue += ios->rrqueue;
1802                         total.wrqueue += ios->wrqueue;
1803                         total.qread_kb += ios->qread_kb;
1804                         total.qwrite_kb += ios->qwrite_kb;
1805                         total.cread_kb += ios->cread_kb;
1806                         total.cwrite_kb += ios->cwrite_kb;
1807                         total.iread_kb += ios->iread_kb;
1808                         total.iwrite_kb += ios->iwrite_kb;
1809                         total.mread_kb += ios->mread_kb;
1810                         total.mwrite_kb += ios->mwrite_kb;
1811
1812                         total.qreads_pc += ios->qreads_pc;
1813                         total.qwrites_pc += ios->qwrites_pc;
1814                         total.creads_pc += ios->creads_pc;
1815                         total.cwrites_pc += ios->cwrites_pc;
1816                         total.ireads_pc += ios->ireads_pc;
1817                         total.iwrites_pc += ios->iwrites_pc;
1818                         total.rrqueue_pc += ios->rrqueue_pc;
1819                         total.wrqueue_pc += ios->wrqueue_pc;
1820                         total.qread_kb_pc += ios->qread_kb_pc;
1821                         total.qwrite_kb_pc += ios->qwrite_kb_pc;
1822                         total.iread_kb_pc += ios->iread_kb_pc;
1823                         total.iwrite_kb_pc += ios->iwrite_kb_pc;
1824
1825                         total.timer_unplugs += ios->timer_unplugs;
1826                         total.io_unplugs += ios->io_unplugs;
1827
1828                         snprintf(line, sizeof(line) - 1, "CPU%d (%s):",
1829                                  j, get_dev_name(pdi, name, sizeof(name)));
1830                         dump_io_stats(pdi, ios, line);
1831                         pci_events++;
1832                 }
1833
1834                 if (pci_events > 1) {
1835                         fprintf(ofp, "\n");
1836                         snprintf(line, sizeof(line) - 1, "Total (%s):",
1837                                  get_dev_name(pdi, name, sizeof(name)));
1838                         dump_io_stats(NULL, &total, line);
1839                 }
1840
1841                 wrate = rrate = 0;
1842                 msec = (pdi->last_reported_time - pdi->first_reported_time) / 1000000;
1843                 if (msec) {
1844                         rrate = 1000 * total.cread_kb / msec;
1845                         wrate = 1000 * total.cwrite_kb / msec;
1846                 }
1847
1848                 fprintf(ofp, "\nThroughput (R/W): %'LuKiB/s / %'LuKiB/s\n",
1849                         rrate, wrate);
1850                 fprintf(ofp, "Events (%s): %'Lu entries\n",
1851                         get_dev_name(pdi, line, sizeof(line)), pdi->events);
1852
1853                 collect_pdi_skips(pdi);
1854                 if (!pdi->skips && !pdi->events)
1855                         ratio = 0.0;
1856                 else
1857                         ratio = 100.0 * ((double)pdi->seq_skips /
1858                                         (double)(pdi->events + pdi->seq_skips));
1859                 fprintf(ofp, "Skips: %'lu forward (%'llu - %5.1lf%%)\n",
1860                         pdi->skips, pdi->seq_skips, ratio);
1861         }
1862 }
1863
1864 static void find_genesis(void)
1865 {
1866         struct trace *t = trace_list;
1867
1868         genesis_time = -1ULL;
1869         while (t != NULL) {
1870                 if (t->bit->time < genesis_time)
1871                         genesis_time = t->bit->time;
1872
1873                 t = t->next;
1874         }
1875
1876         /* The time stamp record will usually be the first
1877          * record in the trace, but not always.
1878          */
1879         if (start_timestamp
1880          && start_timestamp != genesis_time) {
1881                 long delta = genesis_time - start_timestamp;
1882
1883                 abs_start_time.tv_sec  += SECONDS(delta);
1884                 abs_start_time.tv_nsec += NANO_SECONDS(delta);
1885                 if (abs_start_time.tv_nsec < 0) {
1886                         abs_start_time.tv_nsec += 1000000000;
1887                         abs_start_time.tv_sec -= 1;
1888                 } else
1889                 if (abs_start_time.tv_nsec > 1000000000) {
1890                         abs_start_time.tv_nsec -= 1000000000;
1891                         abs_start_time.tv_sec += 1;
1892                 }
1893         }
1894 }
1895
1896 static inline int check_stopwatch(struct blk_io_trace *bit)
1897 {
1898         if (bit->time < stopwatch_end &&
1899             bit->time >= stopwatch_start)
1900                 return 0;
1901
1902         return 1;
1903 }
1904
/*
 * Move every trace from the raw trace_list into the sorted rbtree,
 * rebasing timestamps to genesis_time and discarding entries outside
 * the stopwatch window.  *youngest is set to the smallest rebased
 * timestamp seen.  Returns 0 on success, -1 if an rbtree insert fails.
 */
static int sort_entries(unsigned long long *youngest)
{
	struct per_dev_info *pdi = NULL;
	struct per_cpu_info *pci = NULL;
	struct trace *t;

	if (!genesis_time)
		find_genesis();

	*youngest = 0;
	while ((t = trace_list) != NULL) {
		struct blk_io_trace *bit = t->bit;

		trace_list = t->next;

		/* express all times relative to the first event */
		bit->time -= genesis_time;

		if (bit->time < *youngest || !*youngest)
			*youngest = bit->time;

		/* cache device/cpu lookups across consecutive entries */
		if (!pdi || pdi->dev != bit->device) {
			pdi = get_dev_info(bit->device);
			pci = NULL;
		}

		if (!pci || pci->cpu != bit->cpu)
			pci = get_cpu_info(pdi, bit->cpu);

		/* track lowest sequence seen; used for gap detection */
		if (bit->sequence < pci->smallest_seq_read)
			pci->smallest_seq_read = bit->sequence;

		/* outside the requested time window: free and skip */
		if (check_stopwatch(bit)) {
			bit_free(bit);
			t_free(t);
			continue;
		}

		if (trace_rb_insert_sort(t))
			return -1;
	}

	return 0;
}
1951
1952 /*
1953  * to continue, we must have traces from all online cpus in the tree
1954  */
1955 static int check_cpu_map(struct per_dev_info *pdi)
1956 {
1957         unsigned long *cpu_map;
1958         struct rb_node *n;
1959         struct trace *__t;
1960         unsigned int i;
1961         int ret, cpu;
1962
1963         /*
1964          * create a map of the cpus we have traces for
1965          */
1966         cpu_map = malloc(pdi->cpu_map_max / sizeof(long));
1967         memset(cpu_map, 0, sizeof(*cpu_map));
1968         n = rb_first(&rb_sort_root);
1969         while (n) {
1970                 __t = rb_entry(n, struct trace, rb_node);
1971                 cpu = __t->bit->cpu;
1972
1973                 cpu_map[CPU_IDX(cpu)] |= (1UL << CPU_BIT(cpu));
1974                 n = rb_next(n);
1975         }
1976
1977         /*
1978          * we can't continue if pdi->cpu_map has entries set that we don't
1979          * have in the sort rbtree. the opposite is not a problem, though
1980          */
1981         ret = 0;
1982         for (i = 0; i < pdi->cpu_map_max / CPUS_PER_LONG; i++) {
1983                 if (pdi->cpu_map[i] & ~(cpu_map[i])) {
1984                         ret = 1;
1985                         break;
1986                 }
1987         }
1988
1989         free(cpu_map);
1990         return ret;
1991 }
1992
/*
 * Verify that trace @t is next-in-sequence for its cpu.  Returns 0
 * when the trace may be emitted now, non-zero when the caller should
 * wait for more input.  With @force set (final pass), missing ranges
 * are recorded as skips instead of waiting.
 */
static int check_sequence(struct per_dev_info *pdi, struct trace *t, int force)
{
	struct blk_io_trace *bit = t->bit;
	unsigned long expected_sequence;
	struct per_cpu_info *pci;
	struct trace *__t;

	pci = get_cpu_info(pdi, bit->cpu);
	expected_sequence = pci->last_sequence + 1;

	/* nothing emitted for this cpu yet (last_sequence was -1) */
	if (!expected_sequence) {
		/*
		 * 1 should be the first entry, just allow it
		 */
		if (bit->sequence == 1)
			return 0;
		if (bit->sequence == pci->smallest_seq_read)
			return 0;

		/* otherwise only proceed if all online cpus are represented */
		return check_cpu_map(pdi);
	}

	if (bit->sequence == expected_sequence)
		return 0;

	/*
	 * we may not have seen that sequence yet. if we are not doing
	 * the final run, break and wait for more entries.
	 */
	if (expected_sequence < pci->smallest_seq_read) {
		/* the missing entry may already be queued for re-emission */
		__t = trace_rb_find_last(pdi, pci, expected_sequence);
		if (!__t)
			goto skip;

		__put_trace_last(pdi, __t);
		return 0;
	} else if (!force) {
		return 1;
	} else {
skip:
		/* sequence already recorded as skipped earlier? */
		if (check_current_skips(pci, bit->sequence))
			return 0;

		if (expected_sequence < bit->sequence)
			insert_skip(pci, expected_sequence, bit->sequence - 1);
		return 0;
	}
}
2041
/*
 * Drain the sorted rbtree (rb_sort_root) in time order, emitting each
 * trace through dump_trace().  Breaks out early when sequence gaps or
 * the allowed-time horizon suggest waiting for more input; @force
 * overrides those waits on the final pass.
 */
static void show_entries_rb(int force)
{
	struct per_dev_info *pdi = NULL;
	struct per_cpu_info *pci = NULL;
	struct blk_io_trace *bit;
	struct rb_node *n;
	struct trace *t;

	while ((n = rb_first(&rb_sort_root)) != NULL) {
		if (is_done() && !force && !pipeline)
			break;

		t = rb_entry(n, struct trace, rb_node);
		bit = t->bit;

		/* entry arrived in the current read batch: wait unless forced */
		if (read_sequence - t->read_sequence < 1 && !force)
			break;

		/* cache device lookup across consecutive entries */
		if (!pdi || pdi->dev != bit->device) {
			pdi = get_dev_info(bit->device);
			pci = NULL;
		}

		if (!pdi) {
			fprintf(stderr, "Unknown device ID? (%d,%d)\n",
				MAJOR(bit->device), MINOR(bit->device));
			break;
		}

		/* message traces carry no meaningful sequence to validate */
		if (!(bit->action == BLK_TN_MESSAGE) &&
		    check_sequence(pdi, t, force))
			break;

		if (!force && bit->time > last_allowed_time)
			break;

		check_time(pdi, bit);

		if (!pci || pci->cpu != bit->cpu)
			pci = get_cpu_info(pdi, bit->cpu);

		if (!(bit->action == BLK_TN_MESSAGE))
			pci->last_sequence = bit->sequence;

		pci->nelems++;

		/* only emit actions the user asked for via the mask */
		if (bit->action & (act_mask << BLK_TC_SHIFT))
			dump_trace(bit, pci, pdi);

		put_trace(pdi, t);
	}
}
2094
/*
 * Read exactly @bytes from @fd into @buffer, toggling O_NONBLOCK to
 * match @block (the current mode is cached in *@fdblock to avoid
 * redundant fcntl calls).
 *
 * Returns 0 on a complete read, 1 on EOF or when non-blocking and no
 * data was available before anything was consumed, -1 on read error.
 * Partial reads are never returned: once data has been consumed we
 * poll (short usleep) until the remainder arrives.
 */
static int read_data(int fd, void *buffer, int bytes, int block, int *fdblock)
{
	int ret, bytes_left, fl;
	void *p;

	if (block != *fdblock) {
		fl = fcntl(fd, F_GETFL);

		if (!block) {
			*fdblock = 0;
			fcntl(fd, F_SETFL, fl | O_NONBLOCK);
		} else {
			*fdblock = 1;
			fcntl(fd, F_SETFL, fl & ~O_NONBLOCK);
		}
	}

	bytes_left = bytes;
	p = buffer;
	while (bytes_left > 0) {
		ret = read(fd, p, bytes_left);
		if (!ret)
			return 1;
		else if (ret < 0) {
			if (errno != EAGAIN) {
				perror("read");
				return -1;
			}

			/*
			 * never do partial reads. we can return if we
			 * didn't read anything and we should not block,
			 * otherwise wait for data
			 */
			if ((bytes_left == bytes) && !block)
				return 1;

			usleep(10);
			continue;
		} else {
			p += ret;
			bytes_left -= ret;
		}
	}

	return 0;
}
2142
2143 static inline __u16 get_pdulen(struct blk_io_trace *bit)
2144 {
2145         if (data_is_native)
2146                 return bit->pdu_len;
2147
2148         return __bswap_16(bit->pdu_len);
2149 }
2150
2151 static inline __u32 get_magic(struct blk_io_trace *bit)
2152 {
2153         if (data_is_native)
2154                 return bit->magic;
2155
2156         return __bswap_32(bit->magic);
2157 }
2158
2159 static int read_events(int fd, int always_block, int *fdblock)
2160 {
2161         struct per_dev_info *pdi = NULL;
2162         unsigned int events = 0;
2163
2164         while (!is_done() && events < rb_batch) {
2165                 struct blk_io_trace *bit;
2166                 struct trace *t;
2167                 int pdu_len, should_block, ret;
2168                 __u32 magic;
2169
2170                 bit = bit_alloc();
2171
2172                 should_block = !events || always_block;
2173
2174                 ret = read_data(fd, bit, sizeof(*bit), should_block, fdblock);
2175                 if (ret) {
2176                         bit_free(bit);
2177                         if (!events && ret < 0)
2178                                 events = ret;
2179                         break;
2180                 }
2181
2182                 /*
2183                  * look at first trace to check whether we need to convert
2184                  * data in the future
2185                  */
2186                 if (data_is_native == -1 && check_data_endianness(bit->magic))
2187                         break;
2188
2189                 magic = get_magic(bit);
2190                 if ((magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
2191                         fprintf(stderr, "Bad magic %x\n", magic);
2192                         break;
2193                 }
2194
2195                 pdu_len = get_pdulen(bit);
2196                 if (pdu_len) {
2197                         void *ptr = realloc(bit, sizeof(*bit) + pdu_len);
2198
2199                         if (read_data(fd, ptr + sizeof(*bit), pdu_len, 1, fdblock)) {
2200                                 bit_free(ptr);
2201                                 break;
2202                         }
2203
2204                         bit = ptr;
2205                 }
2206
2207                 trace_to_cpu(bit);
2208
2209                 if (verify_trace(bit)) {
2210                         bit_free(bit);
2211                         continue;
2212                 }
2213
2214                 /*
2215                  * not a real trace, so grab and handle it here
2216                  */
2217                 if (bit->action & BLK_TC_ACT(BLK_TC_NOTIFY) && bit->action != BLK_TN_MESSAGE) {
2218                         handle_notify(bit);
2219                         output_binary(bit, sizeof(*bit) + bit->pdu_len);
2220                         continue;
2221                 }
2222
2223                 t = t_alloc();
2224                 memset(t, 0, sizeof(*t));
2225                 t->bit = bit;
2226                 t->read_sequence = read_sequence;
2227
2228                 t->next = trace_list;
2229                 trace_list = t;
2230
2231                 if (!pdi || pdi->dev != bit->device)
2232                         pdi = get_dev_info(bit->device);
2233
2234                 if (bit->time > pdi->last_read_time)
2235                         pdi->last_read_time = bit->time;
2236
2237                 events++;
2238         }
2239
2240         return events;
2241 }
2242
2243 /*
2244  * Managing input streams
2245  */
2246
/*
 * One input stream per (device, cpu) pair.  Streams form a singly
 * linked list rooted at ms_head, kept sorted by the timestamp of each
 * stream's first pending trace, so the merge always consumes the
 * globally oldest event.
 */
struct ms_stream {
	struct ms_stream *next;		/* next stream in time-sorted list */
	struct trace *first, *last;	/* FIFO of traces read ahead for this stream */
	struct per_dev_info *pdi;	/* owning device */
	unsigned int cpu;		/* cpu whose per-cpu file feeds this stream */
};

/*
 * NOTE(review): second parameter is named 'c' but the body expands
 * 'cpu' — this only works because call sites have a local literally
 * named 'cpu' in scope.  Confirm all callers before renaming.
 */
#define MS_HASH(d, c) ((MAJOR(d) & 0xff) ^ (MINOR(d) & 0xff) ^ (cpu & 0xff))

struct ms_stream *ms_head;		/* head of the time-sorted stream list */
struct ms_stream *ms_hash[256];		/* (dev, cpu) -> stream lookup buckets */

static void ms_sort(struct ms_stream *msp);
static int ms_prime(struct ms_stream *msp);
2261
2262 static inline struct trace *ms_peek(struct ms_stream *msp)
2263 {
2264         return (msp == NULL) ? NULL : msp->first;
2265 }
2266
2267 static inline __u64 ms_peek_time(struct ms_stream *msp)
2268 {
2269         return ms_peek(msp)->bit->time;
2270 }
2271
2272 static inline void ms_resort(struct ms_stream *msp)
2273 {
2274         if (msp->next && ms_peek_time(msp) > ms_peek_time(msp->next)) {
2275                 ms_head = msp->next;
2276                 msp->next = NULL;
2277                 ms_sort(msp);
2278         }
2279 }
2280
/*
 * Dequeue the head trace from @msp.  When the read-ahead FIFO runs
 * dry, try to refill it from the stream's file; if nothing more can be
 * read the stream is unlinked from the list entirely.  Otherwise the
 * stream is re-sorted by its new head time.
 */
static inline void ms_deq(struct ms_stream *msp)
{
	msp->first = msp->first->next;
	if (!msp->first) {
		msp->last = NULL;
		if (!ms_prime(msp)) {
			/* stream exhausted: drop it from the merge list */
			ms_head = msp->next;
			msp->next = NULL;
			return;
		}
	}

	ms_resort(msp);
}
2295
2296 static void ms_sort(struct ms_stream *msp)
2297 {
2298         __u64 msp_t = ms_peek_time(msp);
2299         struct ms_stream *this_msp = ms_head;
2300
2301         if (this_msp == NULL)
2302                 ms_head = msp;
2303         else if (msp_t < ms_peek_time(this_msp)) {
2304                 msp->next = this_msp;
2305                 ms_head = msp;
2306         }
2307         else {
2308                 while (this_msp->next && ms_peek_time(this_msp->next) < msp_t)
2309                         this_msp = this_msp->next;
2310
2311                 msp->next = this_msp->next;
2312                 this_msp->next = msp;
2313         }
2314 }
2315
/*
 * Refill stream @msp with up to rb_batch traces read from its per-cpu
 * file: validate magic/endianness, pull in any pdu payload, handle
 * notify records inline, and append real traces to the stream FIFO.
 * On any read or validation error the cpu is marked offline and its
 * file closed.  Returns the number of traces queued.
 */
static int ms_prime(struct ms_stream *msp)
{
	__u32 magic;
	unsigned int i;
	struct trace *t;
	struct per_dev_info *pdi = msp->pdi;
	struct per_cpu_info *pci = get_cpu_info(pdi, msp->cpu);
	struct blk_io_trace *bit = NULL;
	int ret, pdu_len, ndone = 0;

	for (i = 0; !is_done() && pci->fd >= 0 && i < rb_batch; i++) {
		bit = bit_alloc();
		ret = read_data(pci->fd, bit, sizeof(*bit), 1, &pci->fdblock);
		if (ret)
			goto err;

		if (data_is_native == -1 && check_data_endianness(bit->magic))
			goto err;

		magic = get_magic(bit);
		if ((magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
			fprintf(stderr, "Bad magic %x\n", magic);
			goto err;

		}

		pdu_len = get_pdulen(bit);
		if (pdu_len) {
			/* NOTE(review): realloc result is not NULL-checked */
			void *ptr = realloc(bit, sizeof(*bit) + pdu_len);
			ret = read_data(pci->fd, ptr + sizeof(*bit), pdu_len,
							     1, &pci->fdblock);
			if (ret) {
				free(ptr);
				bit = NULL;
				goto err;
			}

			bit = ptr;
		}

		trace_to_cpu(bit);
		if (verify_trace(bit))
			goto err;

		/* NOTE(review): this continue leaks bit (never freed/queued) */
		if (bit->cpu != pci->cpu) {
			fprintf(stderr, "cpu %d trace info has error cpu %d\n",
				pci->cpu, bit->cpu);
			continue;
		}

		/* notify records are consumed here; don't count against i */
		if (bit->action & BLK_TC_ACT(BLK_TC_NOTIFY) && bit->action != BLK_TN_MESSAGE) {
			handle_notify(bit);
			output_binary(bit, sizeof(*bit) + bit->pdu_len);
			bit_free(bit);

			i -= 1;
			continue;
		}

		if (bit->time > pdi->last_read_time)
			pdi->last_read_time = bit->time;

		/* append to this stream's read-ahead FIFO */
		t = t_alloc();
		memset(t, 0, sizeof(*t));
		t->bit = bit;

		if (msp->first == NULL)
			msp->first = msp->last = t;
		else {
			msp->last->next = t;
			msp->last = t;
		}

		ndone++;
	}

	return ndone;

err:
	if (bit) bit_free(bit);

	/* stream is dead: retire this cpu's input file */
	cpu_mark_offline(pdi, pci->cpu);
	close(pci->fd);
	pci->fd = -1;

	return ndone;
}
2403
/*
 * Create a stream for (@pdi, @cpu), prime it with an initial batch of
 * traces, and insert it into the time-sorted merge list if anything
 * was read.
 * NOTE(review): malloc() result is not checked; OOM crashes here.
 */
static struct ms_stream *ms_alloc(struct per_dev_info *pdi, int cpu)
{
	struct ms_stream *msp = malloc(sizeof(*msp));

	msp->next = NULL;
	msp->first = msp->last = NULL;
	msp->pdi = pdi;
	msp->cpu = cpu;

	if (ms_prime(msp))
		ms_sort(msp);

	return msp;
}
2418
2419 static int setup_file(struct per_dev_info *pdi, int cpu)
2420 {
2421         int len = 0;
2422         struct stat st;
2423         char *p, *dname;
2424         struct per_cpu_info *pci = get_cpu_info(pdi, cpu);
2425
2426         pci->cpu = cpu;
2427         pci->fdblock = -1;
2428
2429         p = strdup(pdi->name);
2430         dname = dirname(p);
2431         if (strcmp(dname, ".")) {
2432                 input_dir = dname;
2433                 p = strdup(pdi->name);
2434                 strcpy(pdi->name, basename(p));
2435         }
2436         free(p);
2437
2438         if (input_dir)
2439                 len = sprintf(pci->fname, "%s/", input_dir);
2440
2441         snprintf(pci->fname + len, sizeof(pci->fname)-1-len,
2442                  "%s.blktrace.%d", pdi->name, pci->cpu);
2443         if (stat(pci->fname, &st) < 0)
2444                 return 0;
2445         if (!st.st_size)
2446                 return 1;
2447
2448         pci->fd = open(pci->fname, O_RDONLY);
2449         if (pci->fd < 0) {
2450                 perror(pci->fname);
2451                 return 0;
2452         }
2453
2454         printf("Input file %s added\n", pci->fname);
2455         cpu_mark_online(pdi, pci->cpu);
2456
2457         pdi->nfiles++;
2458         ms_alloc(pdi, pci->cpu);
2459
2460         return 1;
2461 }
2462
/*
 * Consume the head trace of @msp: rebase its time to genesis, emit it
 * when it matches the action mask and stopwatch window, and advance
 * the stream.  Returns 1 to keep processing, 0 once the stopwatch end
 * has been passed.
 */
static int handle(struct ms_stream *msp)
{
	struct trace *t;
	struct per_dev_info *pdi;
	struct per_cpu_info *pci;
	struct blk_io_trace *bit;

	t = ms_peek(msp);

	bit = t->bit;
	pdi = msp->pdi;
	pci = get_cpu_info(pdi, msp->cpu);
	pci->nelems++;
	bit->time -= genesis_time;

	/* streams are time-sorted, so everything after this is too late */
	if (t->bit->time > stopwatch_end)
		return 0;

	pdi->last_reported_time = bit->time;
	if ((bit->action & (act_mask << BLK_TC_SHIFT))&&
	    t->bit->time >= stopwatch_start)
		dump_trace(bit, pci, pdi);

	ms_deq(msp);

	/* text mode keeps traces for the final sorted dump; else free now */
	if (text_output)
		trace_rb_insert_last(pdi, t);
	else {
		bit_free(t->bit);
		t_free(t);
	}

	return 1;
}
2497
/*
 * Check if we need to sanitize the name. We allow 'foo', or if foo.blktrace.X
 * is given, then strip back down to 'foo' to avoid missing files.
 * Returns 1 on a NULL name, 0 otherwise.
 */
static int name_fixup(char *name)
{
	char *suffix;

	if (name == NULL)
		return 1;

	suffix = strstr(name, ".blktrace.");
	if (suffix != NULL)
		*suffix = '\0';

	return 0;
}
2515
/*
 * Parse stored trace files: open every per-cpu file of every device,
 * establish genesis time from the oldest queued trace, then merge all
 * streams in time order via handle().  Returns 0 on success, non-zero
 * when a device has no usable input files.
 */
static int do_file(void)
{
	int i, cpu, ret;
	struct per_dev_info *pdi;

	/*
	 * first prepare all files for reading
	 */
	for (i = 0; i < ndevices; i++) {
		pdi = &devices[i];
		ret = name_fixup(pdi->name);
		if (ret)
			return ret;

		/* open cpu files until the first missing one */
		for (cpu = 0; setup_file(pdi, cpu); cpu++)
			;

		if (!cpu) {
			fprintf(stderr,"No input files found for %s\n",
				pdi->name);
			return 1;
		}
	}

	/*
	 * Get the initial time stamp
	 */
	if (ms_head)
		genesis_time = ms_peek_time(ms_head);

	/*
	 * Keep processing traces while any are left
	 */
	while (!is_done() && ms_head && handle(ms_head))
		;

	return 0;
}
2554
/*
 * Parse live trace data from @fd: read events in batches, sort each
 * batch into the rbtree and emit what is safely ordered, then flush
 * whatever remains once the input ends.
 */
static void do_pipe(int fd)
{
	unsigned long long youngest;
	int events, fdblock;

	last_allowed_time = -1ULL;
	fdblock = -1;
	while ((events = read_events(fd, 0, &fdblock)) > 0) {
		read_sequence++;

#if 0
		smallest_seq_read = -1U;
#endif

		if (sort_entries(&youngest))
			break;

		/* youngest new event already past the stopwatch: done */
		if (youngest > stopwatch_end)
			break;

		show_entries_rb(0);
	}

	/* final forced pass over anything still queued */
	if (rb_sort_entries)
		show_entries_rb(1);
}
2581
2582 static int do_fifo(void)
2583 {
2584         int fd;
2585
2586         if (!strcmp(pipename, "-"))
2587                 fd = dup(STDIN_FILENO);
2588         else
2589                 fd = open(pipename, O_RDONLY);
2590
2591         if (fd == -1) {
2592                 perror("dup stdin");
2593                 return -1;
2594         }
2595
2596         do_pipe(fd);
2597         close(fd);
2598         return 0;
2599 }
2600
2601 static void show_stats(void)
2602 {
2603         if (!ofp)
2604                 return;
2605         if (stats_printed)
2606                 return;
2607
2608         stats_printed = 1;
2609
2610         if (per_process_stats)
2611                 show_process_stats();
2612
2613         if (per_device_and_cpu_stats)
2614                 show_device_and_cpu_stats();
2615
2616         fflush(ofp);
2617 }
2618
/*
 * SIGINT handler: just flag the main loops to wind down; stats and
 * cleanup happen on the normal exit path.
 */
static void handle_sigint(__attribute__((__unused__)) int sig)
{
	done = 1;
}
2623
/*
 * Extract start and duration times from a string, allowing
 * us to specify a time interval of interest within a trace.
 * Format: "duration" (start is zero) or "start:duration".
 * Sets stopwatch_start/stopwatch_end (nanoseconds); returns 0 on
 * success, 1 on any parse error or an empty/negative interval.
 */
static int find_stopwatch_interval(char *string)
{
	double value;
	char *sp;

	value = strtod(string, &sp);
	if (sp == string) {
		fprintf(stderr,"Invalid stopwatch timer: %s\n", string);
		return 1;
	}
	if (*sp == ':') {
		/* "start:duration" form: first number was the start */
		stopwatch_start = DOUBLE_TO_NANO_ULL(value);
		string = sp + 1;
		value = strtod(string, &sp);
		if (sp == string || *sp != '\0') {
			fprintf(stderr,"Invalid stopwatch duration time: %s\n",
				string);
			return 1;
		}
	} else if (*sp != '\0') {
		fprintf(stderr,"Invalid stopwatch start timer: %s\n", string);
		return 1;
	}
	stopwatch_end = DOUBLE_TO_NANO_ULL(value);
	if (stopwatch_end <= stopwatch_start) {
		fprintf(stderr, "Invalid stopwatch interval: %Lu -> %Lu\n",
			stopwatch_start, stopwatch_end);
		return 1;
	}

	return 0;
}
2661
/*
 * Return 1 when @str names piped input: the literal "-" (stdin) or an
 * existing FIFO; 0 otherwise.
 */
static int is_pipe(const char *str)
{
	struct stat sb;

	if (strcmp(str, "-") == 0)
		return 1;

	return stat(str, &sb) == 0 && S_ISFIFO(sb.st_mode);
}
2673
#define S_OPTS  "a:A:b:D:d:f:F:hi:o:Oqstw:vVM"
/* Usage text; fixed: the -M line was missing its closing bracket. */
static char usage_str[] =    "\n\n" \
	"-i <file>           | --input=<file>\n" \
	"[ -a <action field> | --act-mask=<action field> ]\n" \
	"[ -A <action mask>  | --set-mask=<action mask> ]\n" \
	"[ -b <traces>       | --batch=<traces> ]\n" \
	"[ -d <file>         | --dump-binary=<file> ]\n" \
	"[ -D <dir>          | --input-directory=<dir> ]\n" \
	"[ -f <format>       | --format=<format> ]\n" \
	"[ -F <spec>         | --format-spec=<spec> ]\n" \
	"[ -h                | --hash-by-name ]\n" \
	"[ -o <file>         | --output=<file> ]\n" \
	"[ -O                | --no-text-output ]\n" \
	"[ -q                | --quiet ]\n" \
	"[ -s                | --per-program-stats ]\n" \
	"[ -t                | --track-ios ]\n" \
	"[ -w <time>         | --stopwatch=<time> ]\n" \
	"[ -M                | --no-msgs ]\n" \
	"[ -v                | --verbose ]\n" \
	"[ -V                | --version ]\n\n" \
	"\t-a Only trace specified actions. See documentation\n" \
	"\t-A Give trace mask as a single value. See documentation\n" \
	"\t-b stdin read batching\n" \
	"\t-d Output file. If specified, binary data is written to file\n" \
	"\t-D Directory to prepend to input file names\n" \
	"\t-f Output format. Customize the output format. The format field\n" \
	"\t   identifies can be found in the documentation\n" \
	"\t-F Format specification. Can be found in the documentation\n" \
	"\t-h Hash processes by name, not pid\n" \
	"\t-i Input file containing trace data, or '-' for stdin\n" \
	"\t-o Output file. If not given, output is stdout\n" \
	"\t-O Do NOT output text data\n" \
	"\t-q Quiet. Don't display any stats at the end of the trace\n" \
	"\t-s Show per-program io statistics\n" \
	"\t-t Track individual ios. Will tell you the time a request took\n" \
	"\t   to get queued, to get dispatched, and to get completed\n" \
	"\t-w Only parse data between the given time interval in seconds.\n" \
	"\t   If 'start' isn't given, blkparse defaults the start time to 0\n" \
	"\t-M Do not output messages to binary file\n" \
	"\t-v More verbose for marginal errors\n" \
	"\t-V Print program version info\n\n";
2715
/* Print command-line usage for @prog to stderr. */
static void usage(char *prog)
{
	fprintf(stderr, "Usage: %s %s", prog, usage_str);
}
2720
2721 int main(int argc, char *argv[])
2722 {
2723         int i, c, ret, mode;
2724         int act_mask_tmp = 0;
2725         char *ofp_buffer = NULL;
2726         char *bin_ofp_buffer = NULL;
2727
2728         while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) {
2729                 switch (c) {
2730                 case 'a':
2731                         i = find_mask_map(optarg);
2732                         if (i < 0) {
2733                                 fprintf(stderr,"Invalid action mask %s\n",
2734                                         optarg);
2735                                 return 1;
2736                         }
2737                         act_mask_tmp |= i;
2738                         break;
2739
2740                 case 'A':
2741                         if ((sscanf(optarg, "%x", &i) != 1) || 
2742                                                         !valid_act_opt(i)) {
2743                                 fprintf(stderr,
2744                                         "Invalid set action mask %s/0x%x\n",
2745                                         optarg, i);
2746                                 return 1;
2747                         }
2748                         act_mask_tmp = i;
2749                         break;
2750                 case 'i':
2751                         if (is_pipe(optarg) && !pipeline) {
2752                                 pipeline = 1;
2753                                 pipename = strdup(optarg);
2754                         } else if (resize_devices(optarg) != 0)
2755                                 return 1;
2756                         break;
2757                 case 'D':
2758                         input_dir = optarg;
2759                         break;
2760                 case 'o':
2761                         output_name = optarg;
2762                         break;
2763                 case 'O':
2764                         text_output = 0;
2765                         break;
2766                 case 'b':
2767                         rb_batch = atoi(optarg);
2768                         if (rb_batch <= 0)
2769                                 rb_batch = RB_BATCH_DEFAULT;
2770                         break;
2771                 case 's':
2772                         per_process_stats = 1;
2773                         break;
2774                 case 't':
2775                         track_ios = 1;
2776                         break;
2777                 case 'q':
2778                         per_device_and_cpu_stats = 0;
2779                         break;
2780                 case 'w':
2781                         if (find_stopwatch_interval(optarg) != 0)
2782                                 return 1;
2783                         break;
2784                 case 'f':
2785                         set_all_format_specs(optarg);
2786                         break;
2787                 case 'F':
2788                         if (add_format_spec(optarg) != 0)
2789                                 return 1;
2790                         break;
2791                 case 'h':
2792                         ppi_hash_by_pid = 0;
2793                         break;
2794                 case 'v':
2795                         verbose++;
2796                         break;
2797                 case 'V':
2798                         printf("%s version %s\n", argv[0], blkparse_version);
2799                         return 0;
2800                 case 'd':
2801                         dump_binary = optarg;
2802                         break;
2803                 case 'M':
2804                         bin_output_msgs = 0;
2805                         break;
2806                 default:
2807                         usage(argv[0]);
2808                         return 1;
2809                 }
2810         }
2811
2812         while (optind < argc) {
2813                 if (is_pipe(argv[optind]) && !pipeline) {
2814                         pipeline = 1;
2815                         pipename = strdup(argv[optind]);
2816                 } else if (resize_devices(argv[optind]) != 0)
2817                         return 1;
2818                 optind++;
2819         }
2820
2821         if (!pipeline && !ndevices) {
2822                 usage(argv[0]);
2823                 return 1;
2824         }
2825
2826         if (act_mask_tmp != 0)
2827                 act_mask = act_mask_tmp;
2828
2829         memset(&rb_sort_root, 0, sizeof(rb_sort_root));
2830
2831         signal(SIGINT, handle_sigint);
2832         signal(SIGHUP, handle_sigint);
2833         signal(SIGTERM, handle_sigint);
2834
2835         setlocale(LC_NUMERIC, "en_US");
2836
2837         if (text_output) {
2838                 if (!output_name) {
2839                         ofp = fdopen(STDOUT_FILENO, "w");
2840                         mode = _IOLBF;
2841                 } else {
2842                         char ofname[PATH_MAX];
2843
2844                         snprintf(ofname, sizeof(ofname) - 1, "%s", output_name);
2845                         ofp = fopen(ofname, "w");
2846                         mode = _IOFBF;
2847                 }
2848
2849                 if (!ofp) {
2850                         perror("fopen");
2851                         return 1;
2852                 }
2853
2854                 ofp_buffer = malloc(4096);
2855                 if (setvbuf(ofp, ofp_buffer, mode, 4096)) {
2856                         perror("setvbuf");
2857                         return 1;
2858                 }
2859         }
2860
2861         if (dump_binary) {
2862                 if (!strcmp(dump_binary, "-"))
2863                         dump_fp = stdout;
2864                 else {
2865                         dump_fp = fopen(dump_binary, "w");
2866                         if (!dump_fp) {
2867                                 perror(dump_binary);
2868                                 dump_binary = NULL;
2869                                 return 1;
2870                         }
2871                 }
2872                 bin_ofp_buffer = malloc(128 * 1024);
2873                 if (setvbuf(dump_fp, bin_ofp_buffer, _IOFBF, 128 * 1024)) {
2874                         perror("setvbuf binary");
2875                         return 1;
2876                 }
2877         }
2878
2879         if (pipeline)
2880                 ret = do_fifo();
2881         else
2882                 ret = do_file();
2883
2884         if (!ret)
2885                 show_stats();
2886
2887         if (have_drv_data && !dump_binary)
2888                 printf("\ndiscarded traces containing low-level device driver "
2889                        "specific data (only available in binary output)\n");
2890
2891         if (ofp_buffer) {
2892                 fflush(ofp);
2893                 free(ofp_buffer);
2894         }
2895         if (bin_ofp_buffer) {
2896                 fflush(dump_fp);
2897                 free(bin_ofp_buffer);
2898         }
2899         return ret;
2900 }