blkparse: Initialize and test for undefined request tracking timestamps
[blktrace.git] / blkparse.c
/*
 * block queue tracing parse application
 *
 * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
 * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <stdio.h>
#include <fcntl.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <getopt.h>
#include <errno.h>
#include <signal.h>
#include <locale.h>
#include <libgen.h>

#include "blktrace.h"
#include "rbtree.h"
#include "jhash.h"

static char blkparse_version[] = "1.2.0";

struct skip_info {
        unsigned long start, end;
        struct skip_info *prev, *next;
};

struct per_dev_info {
        dev_t dev;
        char *name;

        int backwards;
        unsigned long long events;
        unsigned long long first_reported_time;
        unsigned long long last_reported_time;
        unsigned long long last_read_time;
        struct io_stats io_stats;
        unsigned long skips;
        unsigned long long seq_skips;
        unsigned int max_depth[2];
        unsigned int cur_depth[2];

        struct rb_root rb_track;

        int nfiles;
        int ncpus;

        unsigned long *cpu_map;
        unsigned int cpu_map_max;

        struct per_cpu_info *cpus;
};

/*
 * some duplicated effort here, we can unify this hash and the ppi hash later
 */
struct process_pid_map {
        pid_t pid;
        char comm[16];
        struct process_pid_map *hash_next, *list_next;
};

#define PPM_HASH_SHIFT  (8)
#define PPM_HASH_SIZE   (1 << PPM_HASH_SHIFT)
#define PPM_HASH_MASK   (PPM_HASH_SIZE - 1)
static struct process_pid_map *ppm_hash_table[PPM_HASH_SIZE];

struct per_process_info {
        struct process_pid_map *ppm;
        struct io_stats io_stats;
        struct per_process_info *hash_next, *list_next;
        int more_than_one;

        /*
         * individual io stats
         */
        unsigned long long longest_allocation_wait[2];
        unsigned long long longest_dispatch_wait[2];
        unsigned long long longest_completion_wait[2];
};

#define PPI_HASH_SHIFT  (8)
#define PPI_HASH_SIZE   (1 << PPI_HASH_SHIFT)
#define PPI_HASH_MASK   (PPI_HASH_SIZE - 1)

enum {
        SORT_PROG_EVENT_N,   /* Program Name */
        SORT_PROG_EVENT_QKB, /* KB: Queued read and write */
        SORT_PROG_EVENT_RKB, /* KB: Queued Read */
        SORT_PROG_EVENT_WKB, /* KB: Queued Write */
        SORT_PROG_EVENT_CKB, /* KB: Complete */
        SORT_PROG_EVENT_QIO, /* IO: Queued read and write */
        SORT_PROG_EVENT_RIO, /* IO: Queued Read */
        SORT_PROG_EVENT_WIO, /* IO: Queued Write */
        SORT_PROG_EVENT_CIO, /* IO: Complete */
};

static struct per_process_info *ppi_hash_table[PPI_HASH_SIZE];
static struct per_process_info *ppi_list;
static int ppi_list_entries;

static struct option l_opts[] = {
        {
                .name = "act-mask",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'a'
        },
        {
                .name = "set-mask",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'A'
        },
        {
                .name = "batch",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'b'
        },
        {
                .name = "input-directory",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'D'
        },
        {
                .name = "dump-binary",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'd'
        },
        {
                .name = "format",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'f'
        },
        {
                .name = "format-spec",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'F'
        },
        {
                .name = "hash-by-name",
                .has_arg = no_argument,
                .flag = NULL,
                .val = 'h'
        },
        {
                .name = "input",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'i'
        },
        {
                .name = "no-msgs",
                .has_arg = no_argument,
                .flag = NULL,
                .val = 'M'
        },
        {
                .name = "output",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'o'
        },
        {
                .name = "no-text-output",
                .has_arg = no_argument,
                .flag = NULL,
                .val = 'O'
        },
        {
                .name = "quiet",
                .has_arg = no_argument,
                .flag = NULL,
                .val = 'q'
        },
        {
                .name = "per-program-stats",
                .has_arg = no_argument,
                .flag = NULL,
                .val = 's'
        },
        {
                .name = "sort-program-stats",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'S'
        },
        {
                .name = "track-ios",
                .has_arg = no_argument,
                .flag = NULL,
                .val = 't'
        },
        {
                .name = "stopwatch",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'w'
        },
        {
                .name = "verbose",
                .has_arg = no_argument,
                .flag = NULL,
                .val = 'v'
        },
        {
                .name = "version",
                .has_arg = no_argument,
                .flag = NULL,
                .val = 'V'
        },
        {
                .name = NULL,
        }
};

/*
 * for sorting the displayed output
 */
struct trace {
        struct blk_io_trace *bit;
        struct rb_node rb_node;
        struct trace *next;
        unsigned long read_sequence;
};

static struct rb_root rb_sort_root;
static unsigned long rb_sort_entries;

static struct trace *trace_list;

/*
 * allocation cache
 */
static struct blk_io_trace *bit_alloc_list;
static struct trace *t_alloc_list;

/*
 * for tracking individual ios
 */
struct io_track {
        struct rb_node rb_node;

        struct process_pid_map *ppm;
        __u64 sector;
        unsigned long long allocation_time;
        unsigned long long queue_time;
        unsigned long long dispatch_time;
        unsigned long long completion_time;
};

static int ndevices;
static struct per_dev_info *devices;
static char *get_dev_name(struct per_dev_info *, char *, int);
static int trace_rb_insert_last(struct per_dev_info *, struct trace *);

FILE *ofp = NULL;
static char *output_name;
static char *input_dir;

static unsigned long long genesis_time;
static unsigned long long last_allowed_time;
static unsigned long long stopwatch_start;      /* start from zero by default */
static unsigned long long stopwatch_end = -1ULL;        /* "infinity" */
static unsigned long read_sequence;

static int per_process_stats;
static int per_process_stats_event = SORT_PROG_EVENT_N;
static int per_device_and_cpu_stats = 1;
static int track_ios;
static int ppi_hash_by_pid = 1;
static int verbose;
static unsigned int act_mask = -1U;
static int stats_printed;
static int bin_output_msgs = 1;
int data_is_native = -1;

static FILE *dump_fp;
static char *dump_binary;

static unsigned int t_alloc_cache;
static unsigned int bit_alloc_cache;

#define RB_BATCH_DEFAULT        (512)
static unsigned int rb_batch = RB_BATCH_DEFAULT;

static int pipeline;
static char *pipename;

static int text_output = 1;

#define is_done()       (*(volatile int *)(&done))
static volatile int done;

struct timespec         abs_start_time;
static unsigned long long start_timestamp;

static int have_drv_data = 0;

#define JHASH_RANDOM    (0x3af5f2ee)

#define CPUS_PER_LONG   (8 * sizeof(unsigned long))
#define CPU_IDX(cpu)    ((cpu) / CPUS_PER_LONG)
#define CPU_BIT(cpu)    ((cpu) & (CPUS_PER_LONG - 1))
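
/*
 * Illustrative example (not part of the original source): on a 64-bit
 * host CPUS_PER_LONG is 64, so cpu 70 lives in word CPU_IDX(70) == 1
 * at bit CPU_BIT(70) == 6, and the online test below reduces to
 *
 *      pdi->cpu_map[1] & (1UL << 6)
 */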

static void io_warn_unless(struct blk_io_trace *t, int condition,
                           const char *fmt, ...)
{
        va_list ap;

        if (condition)
                return;
        va_start(ap, fmt);
        fprintf(stderr, "(%d,%d) request %llu + %u: ",
                MAJOR(t->device), MINOR(t->device),
                (unsigned long long) t->sector, t->bytes);
        vfprintf(stderr, fmt, ap);
        fputc('\n', stderr);
        va_end(ap);
}
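
/*
 * Typical use, as in the tracking helpers further down: the io_track
 * timestamps are initialized to -1ULL ("not seen yet"), so a sketch of
 * a call site looks like
 *
 *      io_warn_unless(t, iot->queue_time == -1ULL,
 *                     "confused about %s time", "queue");
 *
 * which warns only when the queue timestamp was already set by an
 * earlier event for the same request.
 */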

static void output_binary(void *buf, int len)
{
        if (dump_binary) {
                size_t n = fwrite(buf, len, 1, dump_fp);
                if (n != 1) {
                        perror(dump_binary);
                        fclose(dump_fp);
                        dump_binary = NULL;
                }
        }
}

static void resize_cpu_info(struct per_dev_info *pdi, int cpu)
{
        struct per_cpu_info *cpus = pdi->cpus;
        int ncpus = pdi->ncpus;
        int new_count = cpu + 1;
        int new_space, size;
        char *new_start;

        size = new_count * sizeof(struct per_cpu_info);
        cpus = realloc(cpus, size);
        if (!cpus) {
                char name[20];
                fprintf(stderr, "Out of memory, CPU info for device %s (%d)\n",
                        get_dev_name(pdi, name, sizeof(name)), size);
                exit(1);
        }

        new_start = (char *)cpus + (ncpus * sizeof(struct per_cpu_info));
        new_space = (new_count - ncpus) * sizeof(struct per_cpu_info);
        memset(new_start, 0, new_space);

        pdi->ncpus = new_count;
        pdi->cpus = cpus;

        for (new_count = 0; new_count < pdi->ncpus; new_count++) {
                struct per_cpu_info *pci = &pdi->cpus[new_count];

                if (!pci->fd) {
                        pci->fd = -1;
                        memset(&pci->rb_last, 0, sizeof(pci->rb_last));
                        pci->rb_last_entries = 0;
                        pci->last_sequence = -1;
                }
        }
}

static struct per_cpu_info *get_cpu_info(struct per_dev_info *pdi, int cpu)
{
        struct per_cpu_info *pci;

        if (cpu >= pdi->ncpus)
                resize_cpu_info(pdi, cpu);

        pci = &pdi->cpus[cpu];
        pci->cpu = cpu;
        return pci;
}


static int resize_devices(char *name)
{
        int size = (ndevices + 1) * sizeof(struct per_dev_info);

        devices = realloc(devices, size);
        if (!devices) {
                fprintf(stderr, "Out of memory, device %s (%d)\n", name, size);
                return 1;
        }
        memset(&devices[ndevices], 0, sizeof(struct per_dev_info));
        devices[ndevices].name = name;
        ndevices++;
        return 0;
}

static struct per_dev_info *get_dev_info(dev_t dev)
{
        struct per_dev_info *pdi;
        int i;

        for (i = 0; i < ndevices; i++) {
                if (!devices[i].dev)
                        devices[i].dev = dev;
                if (devices[i].dev == dev)
                        return &devices[i];
        }

        if (resize_devices(NULL))
                return NULL;

        pdi = &devices[ndevices - 1];
        pdi->dev = dev;
        pdi->first_reported_time = 0;
        pdi->last_read_time = 0;

        return pdi;
}

static void insert_skip(struct per_cpu_info *pci, unsigned long start,
                        unsigned long end)
{
        struct skip_info *sip;

        for (sip = pci->skips_tail; sip != NULL; sip = sip->prev) {
                if (end == (sip->start - 1)) {
                        sip->start = start;
                        return;
                } else if (start == (sip->end + 1)) {
                        sip->end = end;
                        return;
                }
        }

        sip = malloc(sizeof(struct skip_info));
        sip->start = start;
        sip->end = end;
        sip->prev = sip->next = NULL;
        if (pci->skips_tail == NULL)
                pci->skips_head = pci->skips_tail = sip;
        else {
                sip->prev = pci->skips_tail;
                pci->skips_tail->next = sip;
                pci->skips_tail = sip;
        }
}
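
/*
 * Example of the coalescing above (illustrative): with an existing
 * skip [5,9], insert_skip(pci, 10, 12) extends it in place to [5,12]
 * because start == sip->end + 1; only ranges adjacent to nothing
 * allocate a new skip_info.
 */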

static void remove_sip(struct per_cpu_info *pci, struct skip_info *sip)
{
        if (sip->prev == NULL) {
                if (sip->next == NULL)
                        pci->skips_head = pci->skips_tail = NULL;
                else {
                        pci->skips_head = sip->next;
                        sip->next->prev = NULL;
                }
        } else if (sip->next == NULL) {
                pci->skips_tail = sip->prev;
                sip->prev->next = NULL;
        } else {
                sip->prev->next = sip->next;
                sip->next->prev = sip->prev;
        }

        sip->prev = sip->next = NULL;
        free(sip);
}

#define IN_SKIP(sip,seq) (((sip)->start <= (seq)) && ((seq) <= (sip)->end))
static int check_current_skips(struct per_cpu_info *pci, unsigned long seq)
{
        struct skip_info *sip;

        for (sip = pci->skips_tail; sip != NULL; sip = sip->prev) {
                if (IN_SKIP(sip, seq)) {
                        if (sip->start == seq) {
                                if (sip->end == seq)
                                        remove_sip(pci, sip);
                                else
                                        sip->start += 1;
                        } else if (sip->end == seq)
                                sip->end -= 1;
                        else {
                                /* split around seq, keeping the old end */
                                unsigned long end = sip->end;

                                sip->end = seq - 1;
                                insert_skip(pci, seq + 1, end);
                        }
                        return 1;
                }
        }

        return 0;
}
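
/*
 * Example (illustrative): with a pending skip [10,20], seeing sequence
 * 15 splits it into [10,14] (the shrunken sip) and [16,20] (a new skip
 * inserted via insert_skip), since 15 is now accounted for.
 */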

static void collect_pdi_skips(struct per_dev_info *pdi)
{
        struct skip_info *sip;
        int cpu;

        pdi->skips = 0;
        pdi->seq_skips = 0;

        for (cpu = 0; cpu < pdi->ncpus; cpu++) {
                struct per_cpu_info *pci = &pdi->cpus[cpu];

                for (sip = pci->skips_head; sip != NULL; sip = sip->next) {
                        pdi->skips++;
                        pdi->seq_skips += (sip->end - sip->start + 1);
                        if (verbose)
                                fprintf(stderr,"(%d,%d): skipping %lu -> %lu\n",
                                        MAJOR(pdi->dev), MINOR(pdi->dev),
                                        sip->start, sip->end);
                }
        }
}

static void cpu_mark_online(struct per_dev_info *pdi, unsigned int cpu)
{
        if (cpu >= pdi->cpu_map_max || !pdi->cpu_map) {
                int new_max = (cpu + CPUS_PER_LONG) & ~(CPUS_PER_LONG - 1);
                unsigned long *map = malloc(new_max / sizeof(long));

                memset(map, 0, new_max / sizeof(long));

                if (pdi->cpu_map) {
                        memcpy(map, pdi->cpu_map, pdi->cpu_map_max / sizeof(long));
                        free(pdi->cpu_map);
                }

                pdi->cpu_map = map;
                pdi->cpu_map_max = new_max;
        }

        pdi->cpu_map[CPU_IDX(cpu)] |= (1UL << CPU_BIT(cpu));
}

static inline void cpu_mark_offline(struct per_dev_info *pdi, int cpu)
{
        pdi->cpu_map[CPU_IDX(cpu)] &= ~(1UL << CPU_BIT(cpu));
}

static inline int cpu_is_online(struct per_dev_info *pdi, int cpu)
{
        return (pdi->cpu_map[CPU_IDX(cpu)] & (1UL << CPU_BIT(cpu))) != 0;
}

static inline int ppm_hash_pid(pid_t pid)
{
        return jhash_1word(pid, JHASH_RANDOM) & PPM_HASH_MASK;
}
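
/*
 * Illustrative: with PPM_HASH_SHIFT == 8 the table has 256 buckets, so
 * a pid hashes to jhash_1word(pid, JHASH_RANDOM) & 0xff; collisions
 * are chained through ppm->hash_next.
 */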

static struct process_pid_map *find_ppm(pid_t pid)
{
        const int hash_idx = ppm_hash_pid(pid);
        struct process_pid_map *ppm;

        ppm = ppm_hash_table[hash_idx];
        while (ppm) {
                if (ppm->pid == pid)
                        return ppm;

                ppm = ppm->hash_next;
        }

        return NULL;
}

static struct process_pid_map *add_ppm_hash(pid_t pid, const char *name)
{
        const int hash_idx = ppm_hash_pid(pid);
        struct process_pid_map *ppm;

        ppm = find_ppm(pid);
        if (!ppm) {
                ppm = malloc(sizeof(*ppm));
                memset(ppm, 0, sizeof(*ppm));
                ppm->pid = pid;
                memset(ppm->comm, 0, sizeof(ppm->comm));
                strncpy(ppm->comm, name, sizeof(ppm->comm));
                ppm->comm[sizeof(ppm->comm) - 1] = '\0';
                ppm->hash_next = ppm_hash_table[hash_idx];
                ppm_hash_table[hash_idx] = ppm;
        }

        return ppm;
}

static void handle_notify(struct blk_io_trace *bit)
{
        void    *payload = (caddr_t) bit + sizeof(*bit);
        __u32   two32[2];

        switch (bit->action) {
        case BLK_TN_PROCESS:
                add_ppm_hash(bit->pid, payload);
                break;

        case BLK_TN_TIMESTAMP:
                if (bit->pdu_len != sizeof(two32))
                        return;
                memcpy(two32, payload, sizeof(two32));
                if (!data_is_native) {
                        two32[0] = be32_to_cpu(two32[0]);
                        two32[1] = be32_to_cpu(two32[1]);
                }
                start_timestamp = bit->time;
                abs_start_time.tv_sec  = two32[0];
                abs_start_time.tv_nsec = two32[1];
                if (abs_start_time.tv_nsec < 0) {
                        abs_start_time.tv_sec--;
                        abs_start_time.tv_nsec += 1000000000;
                }

                break;

        case BLK_TN_MESSAGE:
                if (bit->pdu_len > 0) {
                        char msg[bit->pdu_len+1];

                        memcpy(msg, (char *)payload, bit->pdu_len);
                        msg[bit->pdu_len] = '\0';

                        fprintf(ofp,
                                "%3d,%-3d %2d %8s %5d.%09lu %5u %2s %3s %s\n",
                                MAJOR(bit->device), MINOR(bit->device),
                                bit->cpu, "0", (int) SECONDS(bit->time),
                                (unsigned long) NANO_SECONDS(bit->time),
                                0, "m", "N", msg);
                }
                break;

        default:
                /* Ignore unknown notify events */
                ;
        }
}

char *find_process_name(pid_t pid)
{
        struct process_pid_map *ppm = find_ppm(pid);

        if (ppm)
                return ppm->comm;

        return NULL;
}

static inline int ppi_hash_pid(pid_t pid)
{
        return jhash_1word(pid, JHASH_RANDOM) & PPI_HASH_MASK;
}

static inline int ppi_hash_name(const char *name)
{
        return jhash(name, 16, JHASH_RANDOM) & PPI_HASH_MASK;
}

static inline int ppi_hash(struct per_process_info *ppi)
{
        struct process_pid_map *ppm = ppi->ppm;

        if (ppi_hash_by_pid)
                return ppi_hash_pid(ppm->pid);

        return ppi_hash_name(ppm->comm);
}

static inline void add_ppi_to_hash(struct per_process_info *ppi)
{
        const int hash_idx = ppi_hash(ppi);

        ppi->hash_next = ppi_hash_table[hash_idx];
        ppi_hash_table[hash_idx] = ppi;
}

static inline void add_ppi_to_list(struct per_process_info *ppi)
{
        ppi->list_next = ppi_list;
        ppi_list = ppi;
        ppi_list_entries++;
}

static struct per_process_info *find_ppi_by_name(char *name)
{
        const int hash_idx = ppi_hash_name(name);
        struct per_process_info *ppi;

        ppi = ppi_hash_table[hash_idx];
        while (ppi) {
                struct process_pid_map *ppm = ppi->ppm;

                if (!strcmp(ppm->comm, name))
                        return ppi;

                ppi = ppi->hash_next;
        }

        return NULL;
}

static struct per_process_info *find_ppi_by_pid(pid_t pid)
{
        const int hash_idx = ppi_hash_pid(pid);
        struct per_process_info *ppi;

        ppi = ppi_hash_table[hash_idx];
        while (ppi) {
                struct process_pid_map *ppm = ppi->ppm;

                if (ppm->pid == pid)
                        return ppi;

                ppi = ppi->hash_next;
        }

        return NULL;
}

static struct per_process_info *find_ppi(pid_t pid)
{
        struct per_process_info *ppi;
        char *name;

        if (ppi_hash_by_pid)
                return find_ppi_by_pid(pid);

        name = find_process_name(pid);
        if (!name)
                return NULL;

        ppi = find_ppi_by_name(name);
        if (ppi && ppi->ppm->pid != pid)
                ppi->more_than_one = 1;

        return ppi;
}

/*
 * struct trace and blktrace allocation cache, we do potentially
 * millions of mallocs for these structures while only using at most
 * a few thousand at a time
 */
static inline void t_free(struct trace *t)
{
        if (t_alloc_cache < 1024) {
                t->next = t_alloc_list;
                t_alloc_list = t;
                t_alloc_cache++;
        } else
                free(t);
}

static inline struct trace *t_alloc(void)
{
        struct trace *t = t_alloc_list;

        if (t) {
                t_alloc_list = t->next;
                t_alloc_cache--;
                return t;
        }

        return malloc(sizeof(*t));
}

static inline void bit_free(struct blk_io_trace *bit)
{
        if (bit_alloc_cache < 1024 && !bit->pdu_len) {
                /*
                 * abuse a 64-bit field for a next pointer for the free item
                 */
                bit->time = (__u64) (unsigned long) bit_alloc_list;
                bit_alloc_list = (struct blk_io_trace *) bit;
                bit_alloc_cache++;
        } else
                free(bit);
}

static inline struct blk_io_trace *bit_alloc(void)
{
        struct blk_io_trace *bit = bit_alloc_list;

        if (bit) {
                bit_alloc_list = (struct blk_io_trace *) (unsigned long)
                                 bit->time;
                bit_alloc_cache--;
                return bit;
        }

        return malloc(sizeof(*bit));
}
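
/*
 * Sketch of the free-list trick above (illustrative): a cached entry
 * is dead, so its 64-bit time field doubles as the "next" pointer.
 * After
 *
 *      bit_free(a);
 *      bit_free(b);
 *
 * bit_alloc_list == b and b->time holds a (cast to __u64), so the next
 * two bit_alloc() calls return b then a without touching malloc.
 */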

static inline void __put_trace_last(struct per_dev_info *pdi, struct trace *t)
{
        struct per_cpu_info *pci = get_cpu_info(pdi, t->bit->cpu);

        rb_erase(&t->rb_node, &pci->rb_last);
        pci->rb_last_entries--;

        bit_free(t->bit);
        t_free(t);
}

static void put_trace(struct per_dev_info *pdi, struct trace *t)
{
        rb_erase(&t->rb_node, &rb_sort_root);
        rb_sort_entries--;

        trace_rb_insert_last(pdi, t);
}

static inline int trace_rb_insert(struct trace *t, struct rb_root *root)
{
        struct rb_node **p = &root->rb_node;
        struct rb_node *parent = NULL;
        struct trace *__t;

        while (*p) {
                parent = *p;

                __t = rb_entry(parent, struct trace, rb_node);

                if (t->bit->time < __t->bit->time)
                        p = &(*p)->rb_left;
                else if (t->bit->time > __t->bit->time)
                        p = &(*p)->rb_right;
                else if (t->bit->device < __t->bit->device)
                        p = &(*p)->rb_left;
                else if (t->bit->device > __t->bit->device)
                        p = &(*p)->rb_right;
                else if (t->bit->sequence < __t->bit->sequence)
                        p = &(*p)->rb_left;
                else    /* >= sequence */
                        p = &(*p)->rb_right;
        }

        rb_link_node(&t->rb_node, parent, p);
        rb_insert_color(&t->rb_node, root);
        return 0;
}

static inline int trace_rb_insert_sort(struct trace *t)
{
        if (!trace_rb_insert(t, &rb_sort_root)) {
                rb_sort_entries++;
                return 0;
        }

        return 1;
}

static int trace_rb_insert_last(struct per_dev_info *pdi, struct trace *t)
{
        struct per_cpu_info *pci = get_cpu_info(pdi, t->bit->cpu);

        if (trace_rb_insert(t, &pci->rb_last))
                return 1;

        pci->rb_last_entries++;

        if (pci->rb_last_entries > rb_batch * pdi->nfiles) {
                struct rb_node *n = rb_first(&pci->rb_last);

                t = rb_entry(n, struct trace, rb_node);
                __put_trace_last(pdi, t);
        }

        return 0;
}

static struct trace *trace_rb_find(dev_t device, unsigned long sequence,
                                   struct rb_root *root, int order)
{
        struct rb_node *n = root->rb_node;
        struct rb_node *prev = NULL;
        struct trace *__t;

        while (n) {
                __t = rb_entry(n, struct trace, rb_node);
                prev = n;

                if (device < __t->bit->device)
                        n = n->rb_left;
                else if (device > __t->bit->device)
                        n = n->rb_right;
                else if (sequence < __t->bit->sequence)
                        n = n->rb_left;
                else if (sequence > __t->bit->sequence)
                        n = n->rb_right;
                else
                        return __t;
        }

        /*
         * hack - the list may not be sequence ordered because some
         * events don't have sequence and time matched. so we end up
         * being a little off in the rb lookup here, because we don't
         * know the time we are looking for. compensate by browsing
         * a little ahead from the last entry to find the match
         */
        if (order && prev) {
                int max = 5;

                while (((n = rb_next(prev)) != NULL) && max--) {
                        __t = rb_entry(n, struct trace, rb_node);

                        if (__t->bit->device == device &&
                            __t->bit->sequence == sequence)
                                return __t;

                        prev = n;
                }
        }

        return NULL;
}

static inline struct trace *trace_rb_find_last(struct per_dev_info *pdi,
                                               struct per_cpu_info *pci,
                                               unsigned long seq)
{
        return trace_rb_find(pdi->dev, seq, &pci->rb_last, 0);
}

static inline int track_rb_insert(struct per_dev_info *pdi, struct io_track *iot)
{
        struct rb_node **p = &pdi->rb_track.rb_node;
        struct rb_node *parent = NULL;
        struct io_track *__iot;

        while (*p) {
                parent = *p;
                __iot = rb_entry(parent, struct io_track, rb_node);

                if (iot->sector < __iot->sector)
                        p = &(*p)->rb_left;
                else if (iot->sector > __iot->sector)
                        p = &(*p)->rb_right;
                else {
                        fprintf(stderr,
                                "sector alias (%Lu) on device %d,%d!\n",
                                (unsigned long long) iot->sector,
                                MAJOR(pdi->dev), MINOR(pdi->dev));
                        return 1;
                }
        }

        rb_link_node(&iot->rb_node, parent, p);
        rb_insert_color(&iot->rb_node, &pdi->rb_track);
        return 0;
}

static struct io_track *__find_track(struct per_dev_info *pdi, __u64 sector)
{
        struct rb_node *n = pdi->rb_track.rb_node;
        struct io_track *__iot;

        while (n) {
                __iot = rb_entry(n, struct io_track, rb_node);

                if (sector < __iot->sector)
                        n = n->rb_left;
                else if (sector > __iot->sector)
                        n = n->rb_right;
                else
                        return __iot;
        }

        return NULL;
}

static struct io_track *find_track(struct per_dev_info *pdi, pid_t pid,
                                   __u64 sector)
{
        struct io_track *iot;

        iot = __find_track(pdi, sector);
        if (!iot) {
                iot = malloc(sizeof(*iot));
                iot->ppm = find_ppm(pid);
                if (!iot->ppm)
                        iot->ppm = add_ppm_hash(pid, "unknown");
                iot->sector = sector;
                iot->allocation_time = -1ULL;
                iot->queue_time = -1ULL;
                iot->dispatch_time = -1ULL;
                iot->completion_time = -1ULL;
                track_rb_insert(pdi, iot);
        }

        return iot;
}
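
/*
 * The -1ULL initializations above give every tracked request a defined
 * "timestamp not seen" state, which is what the io_warn_unless() calls
 * in the log_track_*() helpers below test against. Illustrative
 * lifecycle for one request on the common fs path:
 *
 *      G  ->  iot->allocation_time = t->time  (was -1ULL)
 *      I  ->  iot->queue_time      = t->time  (was -1ULL)
 *      D  ->  iot->dispatch_time   = t->time  (was -1ULL)
 *      C  ->  iot->completion_time = t->time, iot freed
 *
 * Seeing the same stage twice for one sector triggers the warning.
 */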

static void log_track_frontmerge(struct per_dev_info *pdi,
                                 struct blk_io_trace *t)
{
        struct io_track *iot;

        if (!track_ios)
                return;

        iot = __find_track(pdi, t->sector + t_sec(t));
        if (!iot) {
                if (verbose)
                        fprintf(stderr, "merge not found for (%d,%d): %llu\n",
                                MAJOR(t->device), MINOR(t->device),
                                (unsigned long long) t->sector + t_sec(t));
                return;
        }

        rb_erase(&iot->rb_node, &pdi->rb_track);
        iot->sector -= t_sec(t);
        track_rb_insert(pdi, iot);
}

static void log_track_getrq(struct per_dev_info *pdi, struct blk_io_trace *t)
{
        struct io_track *iot;

        if (!track_ios)
                return;

        iot = find_track(pdi, t->pid, t->sector);
        io_warn_unless(t, iot->allocation_time == -1ULL,
                       "confused about %s time", "allocation");
        iot->allocation_time = t->time;
}

/*
 * for md/dm setups, the interesting cycle is Q -> C. So track queueing
 * time here, as dispatch time
 */
static void log_track_queue(struct per_dev_info *pdi, struct blk_io_trace *t)
{
        struct io_track *iot;

        if (!track_ios)
                return;

        iot = find_track(pdi, t->pid, t->sector);
        io_warn_unless(t, iot->dispatch_time == -1ULL,
                       "confused about %s time", "dispatch");
        iot->dispatch_time = t->time;
}

/*
 * return time between rq allocation and insertion
 */
static unsigned long long log_track_insert(struct per_dev_info *pdi,
                                           struct blk_io_trace *t)
{
        unsigned long long elapsed;
        struct io_track *iot;

        if (!track_ios)
                return -1;

        iot = find_track(pdi, t->pid, t->sector);
        io_warn_unless(t, iot->queue_time == -1ULL,
                       "confused about %s time", "queue");
        iot->queue_time = t->time;

        if (iot->allocation_time == -1ULL)
                return -1;

        elapsed = iot->queue_time - iot->allocation_time;

        if (per_process_stats) {
                struct per_process_info *ppi = find_ppi(iot->ppm->pid);
                int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;

                if (ppi && elapsed > ppi->longest_allocation_wait[w])
                        ppi->longest_allocation_wait[w] = elapsed;
        }

        return elapsed;
}

/*
 * return time between queue and issue
 */
static unsigned long long log_track_issue(struct per_dev_info *pdi,
                                          struct blk_io_trace *t)
{
        unsigned long long elapsed = -1ULL;
        struct io_track *iot;

        if (!track_ios)
                return -1;
        if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
                return -1;

        iot = __find_track(pdi, t->sector);
        if (!iot) {
                if (verbose)
                        fprintf(stderr, "issue not found for (%d,%d): %llu\n",
                                MAJOR(t->device), MINOR(t->device),
                                (unsigned long long) t->sector);
                return -1;
        }

        io_warn_unless(t, iot->dispatch_time == -1ULL,
                       "confused about %s time", "dispatch");
        iot->dispatch_time = t->time;
        if (iot->queue_time != -1ULL)
                elapsed = iot->dispatch_time - iot->queue_time;

        if (elapsed != -1ULL && per_process_stats) {
                struct per_process_info *ppi = find_ppi(iot->ppm->pid);
                int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;

                if (ppi && elapsed > ppi->longest_dispatch_wait[w])
                        ppi->longest_dispatch_wait[w] = elapsed;
        }

        return elapsed;
}

/*
 * return time between dispatch and complete
 */
static unsigned long long log_track_complete(struct per_dev_info *pdi,
                                             struct blk_io_trace *t)
{
        unsigned long long elapsed = -1ULL;
        struct io_track *iot;

        if (!track_ios)
                return -1;

        iot = __find_track(pdi, t->sector);
        if (!iot) {
                if (verbose)
                        fprintf(stderr,"complete not found for (%d,%d): %llu\n",
                                MAJOR(t->device), MINOR(t->device),
                                (unsigned long long) t->sector);
                return -1;
        }

        io_warn_unless(t, iot->completion_time == -1ULL,
                       "confused about %s time", "completion");
        iot->completion_time = t->time;
        if (iot->dispatch_time != -1ULL)
                elapsed = iot->completion_time - iot->dispatch_time;

        if (elapsed != -1ULL && per_process_stats) {
                struct per_process_info *ppi = find_ppi(iot->ppm->pid);
                int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;

                if (ppi && elapsed > ppi->longest_completion_wait[w])
                        ppi->longest_completion_wait[w] = elapsed;
        }

        /*
         * kill the trace, we don't need it after completion
         */
        rb_erase(&iot->rb_node, &pdi->rb_track);
        free(iot);

        return elapsed;
}


static struct io_stats *find_process_io_stats(pid_t pid)
{
        struct per_process_info *ppi = find_ppi(pid);

        if (!ppi) {
                ppi = malloc(sizeof(*ppi));
                memset(ppi, 0, sizeof(*ppi));
                ppi->ppm = find_ppm(pid);
                if (!ppi->ppm)
                        ppi->ppm = add_ppm_hash(pid, "unknown");
                add_ppi_to_hash(ppi);
                add_ppi_to_list(ppi);
        }

        return &ppi->io_stats;
}

static char *get_dev_name(struct per_dev_info *pdi, char *buffer, int size)
{
        if (pdi->name)
                snprintf(buffer, size, "%s", pdi->name);
        else
                snprintf(buffer, size, "%d,%d", MAJOR(pdi->dev), MINOR(pdi->dev));
        return buffer;
}

static void check_time(struct per_dev_info *pdi, struct blk_io_trace *bit)
{
        unsigned long long this = bit->time;
        unsigned long long last = pdi->last_reported_time;

        pdi->backwards = (this < last) ? 'B' : ' ';
        pdi->last_reported_time = this;
}

static inline void __account_m(struct io_stats *ios, struct blk_io_trace *t,
                               int rw)
{
        if (rw) {
                ios->mwrites++;
                ios->mwrite_kb += t_kb(t);
                ios->mwrite_b += t_b(t);
        } else {
                ios->mreads++;
                ios->mread_kb += t_kb(t);
                ios->mread_b += t_b(t);
        }
}

static inline void account_m(struct blk_io_trace *t, struct per_cpu_info *pci,
                             int rw)
{
        __account_m(&pci->io_stats, t, rw);

        if (per_process_stats) {
                struct io_stats *ios = find_process_io_stats(t->pid);

                __account_m(ios, t, rw);
        }
}

static inline void __account_pc_queue(struct io_stats *ios,
                                      struct blk_io_trace *t, int rw)
{
        if (rw) {
                ios->qwrites_pc++;
                ios->qwrite_kb_pc += t_kb(t);
                ios->qwrite_b_pc += t_b(t);
        } else {
                ios->qreads_pc++;
                ios->qread_kb_pc += t_kb(t);
                ios->qread_b_pc += t_b(t);
        }
}

static inline void account_pc_queue(struct blk_io_trace *t,
                                    struct per_cpu_info *pci, int rw)
{
        __account_pc_queue(&pci->io_stats, t, rw);

        if (per_process_stats) {
                struct io_stats *ios = find_process_io_stats(t->pid);

                __account_pc_queue(ios, t, rw);
        }
}

static inline void __account_pc_issue(struct io_stats *ios, int rw,
                                      unsigned int bytes)
{
        if (rw) {
                ios->iwrites_pc++;
                ios->iwrite_kb_pc += bytes >> 10;
                ios->iwrite_b_pc += bytes & 1023;
        } else {
                ios->ireads_pc++;
                ios->iread_kb_pc += bytes >> 10;
                ios->iread_b_pc += bytes & 1023;
        }
}
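
/*
 * The bytes >> 10 / bytes & 1023 split above keeps whole KiB and the
 * sub-KiB remainder in separate counters. Illustrative: a 4608-byte
 * request adds 4 to the _kb counter and 512 to the _b counter; the
 * report code later folds the remainder back in with (_b >> 10).
 */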
1287
1288 static inline void account_pc_issue(struct blk_io_trace *t,
1289                                     struct per_cpu_info *pci, int rw)
1290 {
1291         __account_pc_issue(&pci->io_stats, rw, t->bytes);
1292
1293         if (per_process_stats) {
1294                 struct io_stats *ios = find_process_io_stats(t->pid);
1295
1296                 __account_pc_issue(ios, rw, t->bytes);
1297         }
1298 }
1299
1300 static inline void __account_pc_requeue(struct io_stats *ios,
1301                                         struct blk_io_trace *t, int rw)
1302 {
1303         if (rw) {
1304                 ios->wrqueue_pc++;
1305                 ios->iwrite_kb_pc -= t_kb(t);
1306                 ios->iwrite_b_pc -= t_b(t);
1307         } else {
1308                 ios->rrqueue_pc++;
1309                 ios->iread_kb_pc -= t_kb(t);
1310                 ios->iread_b_pc -= t_b(t);
1311         }
1312 }
1313
1314 static inline void account_pc_requeue(struct blk_io_trace *t,
1315                                       struct per_cpu_info *pci, int rw)
1316 {
1317         __account_pc_requeue(&pci->io_stats, t, rw);
1318
1319         if (per_process_stats) {
1320                 struct io_stats *ios = find_process_io_stats(t->pid);
1321
1322                 __account_pc_requeue(ios, t, rw);
1323         }
1324 }
1325
1326 static inline void __account_pc_c(struct io_stats *ios, int rw)
1327 {
1328         if (rw)
1329                 ios->cwrites_pc++;
1330         else
1331                 ios->creads_pc++;
1332 }
1333
1334 static inline void account_pc_c(struct blk_io_trace *t,
1335                                 struct per_cpu_info *pci, int rw)
1336 {
1337         __account_pc_c(&pci->io_stats, rw);
1338
1339         if (per_process_stats) {
1340                 struct io_stats *ios = find_process_io_stats(t->pid);
1341
1342                 __account_pc_c(ios, rw);
1343         }
1344 }
1345
1346 static inline void __account_queue(struct io_stats *ios, struct blk_io_trace *t,
1347                                    int rw)
1348 {
1349         if (rw) {
1350                 ios->qwrites++;
1351                 ios->qwrite_kb += t_kb(t);
1352                 ios->qwrite_b += t_b(t);
1353         } else {
1354                 ios->qreads++;
1355                 ios->qread_kb += t_kb(t);
1356                 ios->qread_b += t_b(t);
1357         }
1358 }
1359
1360 static inline void account_queue(struct blk_io_trace *t,
1361                                  struct per_cpu_info *pci, int rw)
1362 {
1363         __account_queue(&pci->io_stats, t, rw);
1364
1365         if (per_process_stats) {
1366                 struct io_stats *ios = find_process_io_stats(t->pid);
1367
1368                 __account_queue(ios, t, rw);
1369         }
1370 }
1371
1372 static inline void __account_c(struct io_stats *ios, int rw, int bytes)
1373 {
1374         if (rw) {
1375                 ios->cwrites++;
1376                 ios->cwrite_kb += bytes >> 10;
1377                 ios->cwrite_b += bytes & 1023;
1378         } else {
1379                 ios->creads++;
1380                 ios->cread_kb += bytes >> 10;
1381                 ios->cread_b += bytes & 1023;
1382         }
1383 }
1384
1385 static inline void account_c(struct blk_io_trace *t, struct per_cpu_info *pci,
1386                              int rw, int bytes)
1387 {
1388         __account_c(&pci->io_stats, rw, bytes);
1389
1390         if (per_process_stats) {
1391                 struct io_stats *ios = find_process_io_stats(t->pid);
1392
1393                 __account_c(ios, rw, bytes);
1394         }
1395 }
1396
1397 static inline void __account_issue(struct io_stats *ios, int rw,
1398                                    unsigned int bytes)
1399 {
1400         if (rw) {
1401                 ios->iwrites++;
1402                 ios->iwrite_kb += bytes >> 10;
1403                 ios->iwrite_b  += bytes & 1023;
1404         } else {
1405                 ios->ireads++;
1406                 ios->iread_kb += bytes >> 10;
1407                 ios->iread_b  += bytes & 1023;
1408         }
1409 }
1410
1411 static inline void account_issue(struct blk_io_trace *t,
1412                                  struct per_cpu_info *pci, int rw)
1413 {
1414         __account_issue(&pci->io_stats, rw, t->bytes);
1415
1416         if (per_process_stats) {
1417                 struct io_stats *ios = find_process_io_stats(t->pid);
1418
1419                 __account_issue(ios, rw, t->bytes);
1420         }
1421 }
1422
1423 static inline void __account_unplug(struct io_stats *ios, int timer)
1424 {
1425         if (timer)
1426                 ios->timer_unplugs++;
1427         else
1428                 ios->io_unplugs++;
1429 }
1430
1431 static inline void account_unplug(struct blk_io_trace *t,
1432                                   struct per_cpu_info *pci, int timer)
1433 {
1434         __account_unplug(&pci->io_stats, timer);
1435
1436         if (per_process_stats) {
1437                 struct io_stats *ios = find_process_io_stats(t->pid);
1438
1439                 __account_unplug(ios, timer);
1440         }
1441 }
1442
1443 static inline void __account_requeue(struct io_stats *ios,
1444                                      struct blk_io_trace *t, int rw)
1445 {
1446         if (rw) {
1447                 ios->wrqueue++;
1448                 ios->iwrite_kb -= t_kb(t);
1449                 ios->iwrite_b -= t_b(t);
1450         } else {
1451                 ios->rrqueue++;
1452                 ios->iread_kb -= t_kb(t);
1453                 ios->iread_b -= t_b(t);
1454         }
1455 }
1456
1457 static inline void account_requeue(struct blk_io_trace *t,
1458                                    struct per_cpu_info *pci, int rw)
1459 {
1460         __account_requeue(&pci->io_stats, t, rw);
1461
1462         if (per_process_stats) {
1463                 struct io_stats *ios = find_process_io_stats(t->pid);
1464
1465                 __account_requeue(ios, t, rw);
1466         }
1467 }
1468
1469 static void log_complete(struct per_dev_info *pdi, struct per_cpu_info *pci,
1470                          struct blk_io_trace *t, char *act)
1471 {
1472         process_fmt(act, pci, t, log_track_complete(pdi, t), 0, NULL);
1473 }
1474
1475 static void log_insert(struct per_dev_info *pdi, struct per_cpu_info *pci,
1476                        struct blk_io_trace *t, char *act)
1477 {
1478         process_fmt(act, pci, t, log_track_insert(pdi, t), 0, NULL);
1479 }
1480
1481 static void log_queue(struct per_cpu_info *pci, struct blk_io_trace *t,
1482                       char *act)
1483 {
1484         process_fmt(act, pci, t, -1, 0, NULL);
1485 }
1486
1487 static void log_issue(struct per_dev_info *pdi, struct per_cpu_info *pci,
1488                       struct blk_io_trace *t, char *act)
1489 {
1490         process_fmt(act, pci, t, log_track_issue(pdi, t), 0, NULL);
1491 }
1492
1493 static void log_merge(struct per_dev_info *pdi, struct per_cpu_info *pci,
1494                       struct blk_io_trace *t, char *act)
1495 {
1496         if (act[0] == 'F')
1497                 log_track_frontmerge(pdi, t);
1498
1499         process_fmt(act, pci, t, -1ULL, 0, NULL);
1500 }
1501
1502 static void log_action(struct per_cpu_info *pci, struct blk_io_trace *t,
1503                         char *act)
1504 {
1505         process_fmt(act, pci, t, -1ULL, 0, NULL);
1506 }
1507
1508 static void log_generic(struct per_cpu_info *pci, struct blk_io_trace *t,
1509                         char *act)
1510 {
1511         process_fmt(act, pci, t, -1ULL, 0, NULL);
1512 }
1513
1514 static void log_unplug(struct per_cpu_info *pci, struct blk_io_trace *t,
1515                       char *act)
1516 {
1517         process_fmt(act, pci, t, -1ULL, 0, NULL);
1518 }
1519
1520 static void log_split(struct per_cpu_info *pci, struct blk_io_trace *t,
1521                       char *act)
1522 {
1523         process_fmt(act, pci, t, -1ULL, 0, NULL);
1524 }
1525
1526 static void log_pc(struct per_cpu_info *pci, struct blk_io_trace *t, char *act)
1527 {
1528         unsigned char *buf = (unsigned char *) t + sizeof(*t);
1529
1530         process_fmt(act, pci, t, -1ULL, t->pdu_len, buf);
1531 }
1532
1533 static void dump_trace_pc(struct blk_io_trace *t, struct per_dev_info *pdi,
1534                           struct per_cpu_info *pci)
1535 {
1536         int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
1537         int act = t->action & 0xffff;
1538
1539         switch (act) {
1540                 case __BLK_TA_QUEUE:
1541                         log_generic(pci, t, "Q");
1542                         account_pc_queue(t, pci, w);
1543                         break;
1544                 case __BLK_TA_GETRQ:
1545                         log_generic(pci, t, "G");
1546                         break;
1547                 case __BLK_TA_SLEEPRQ:
1548                         log_generic(pci, t, "S");
1549                         break;
1550                 case __BLK_TA_REQUEUE:
1551                         /*
1552                          * can happen if we miss traces, don't let it go
1553                          * below zero
1554                          */
1555                         if (pdi->cur_depth[w])
1556                                 pdi->cur_depth[w]--;
1557                         account_pc_requeue(t, pci, w);
1558                         log_generic(pci, t, "R");
1559                         break;
1560                 case __BLK_TA_ISSUE:
1561                         account_pc_issue(t, pci, w);
1562                         pdi->cur_depth[w]++;
1563                         if (pdi->cur_depth[w] > pdi->max_depth[w])
1564                                 pdi->max_depth[w] = pdi->cur_depth[w];
1565                         log_pc(pci, t, "D");
1566                         break;
1567                 case __BLK_TA_COMPLETE:
1568                         if (pdi->cur_depth[w])
1569                                 pdi->cur_depth[w]--;
1570                         log_pc(pci, t, "C");
1571                         account_pc_c(t, pci, w);
1572                         break;
1573                 case __BLK_TA_INSERT:
1574                         log_pc(pci, t, "I");
1575                         break;
1576                 default:
1577                         fprintf(stderr, "Bad pc action %x\n", act);
1578                         break;
1579         }
1580 }
1581
1582 static void dump_trace_fs(struct blk_io_trace *t, struct per_dev_info *pdi,
1583                           struct per_cpu_info *pci)
1584 {
1585         int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
1586         int act = t->action & 0xffff;
1587
1588         switch (act) {
1589                 case __BLK_TA_QUEUE:
1590                         log_track_queue(pdi, t);
1591                         account_queue(t, pci, w);
1592                         log_queue(pci, t, "Q");
1593                         break;
1594                 case __BLK_TA_INSERT:
1595                         log_insert(pdi, pci, t, "I");
1596                         break;
1597                 case __BLK_TA_BACKMERGE:
1598                         account_m(t, pci, w);
1599                         log_merge(pdi, pci, t, "M");
1600                         break;
1601                 case __BLK_TA_FRONTMERGE:
1602                         account_m(t, pci, w);
1603                         log_merge(pdi, pci, t, "F");
1604                         break;
1605                 case __BLK_TA_GETRQ:
1606                         log_track_getrq(pdi, t);
1607                         log_generic(pci, t, "G");
1608                         break;
1609                 case __BLK_TA_SLEEPRQ:
1610                         log_generic(pci, t, "S");
1611                         break;
1612                 case __BLK_TA_REQUEUE:
1613                         /*
1614                          * can happen if we miss traces, don't let it go
1615                          * below zero
1616                          */
1617                         if (pdi->cur_depth[w])
1618                                 pdi->cur_depth[w]--;
1619                         account_requeue(t, pci, w);
1620                         log_queue(pci, t, "R");
1621                         break;
1622                 case __BLK_TA_ISSUE:
1623                         account_issue(t, pci, w);
1624                         pdi->cur_depth[w]++;
1625                         if (pdi->cur_depth[w] > pdi->max_depth[w])
1626                                 pdi->max_depth[w] = pdi->cur_depth[w];
1627                         log_issue(pdi, pci, t, "D");
1628                         break;
1629                 case __BLK_TA_COMPLETE:
1630                         if (pdi->cur_depth[w])
1631                                 pdi->cur_depth[w]--;
1632                         account_c(t, pci, w, t->bytes);
1633                         log_complete(pdi, pci, t, "C");
1634                         break;
1635                 case __BLK_TA_PLUG:
1636                         log_action(pci, t, "P");
1637                         break;
1638                 case __BLK_TA_UNPLUG_IO:
1639                         account_unplug(t, pci, 0);
1640                         log_unplug(pci, t, "U");
1641                         break;
1642                 case __BLK_TA_UNPLUG_TIMER:
1643                         account_unplug(t, pci, 1);
1644                         log_unplug(pci, t, "UT");
1645                         break;
1646                 case __BLK_TA_SPLIT:
1647                         log_split(pci, t, "X");
1648                         break;
1649                 case __BLK_TA_BOUNCE:
1650                         log_generic(pci, t, "B");
1651                         break;
1652                 case __BLK_TA_REMAP:
1653                         log_generic(pci, t, "A");
1654                         break;
1655                 case __BLK_TA_DRV_DATA:
1656                         have_drv_data = 1;
1657                         /* dump to binary file only */
1658                         break;
1659                 default:
1660                         fprintf(stderr, "Bad fs action %x\n", t->action);
1661                         break;
1662         }
1663 }
1664
1665 static void dump_trace(struct blk_io_trace *t, struct per_cpu_info *pci,
1666                        struct per_dev_info *pdi)
1667 {
1668         if (text_output) {
1669                 if (t->action == BLK_TN_MESSAGE)
1670                         handle_notify(t);
1671                 else if (t->action & BLK_TC_ACT(BLK_TC_PC))
1672                         dump_trace_pc(t, pdi, pci);
1673                 else
1674                         dump_trace_fs(t, pdi, pci);
1675         }
1676
1677         if (!pdi->events)
1678                 pdi->first_reported_time = t->time;
1679
1680         pdi->events++;
1681
1682         if (bin_output_msgs ||
1683             !(t->action & BLK_TC_ACT(BLK_TC_NOTIFY) &&
1684               t->action == BLK_TN_MESSAGE))
1685                 output_binary(t, sizeof(*t) + t->pdu_len);
1686 }
1687
1688 /*
1689  * print in a proper way, not too small and not too big. if more than
1690  * 1,000,000K, turn it into M and so on
1691  */
1692 static char *size_cnv(char *dst, unsigned long long num, int in_kb)
1693 {
1694         char suff[] = { '\0', 'K', 'M', 'G', 'T', 'P' };
1695         unsigned int i = 0;
1696
1697         if (in_kb)
1698                 i++;
1699
1700         while (num > 1000 * 1000ULL && (i < sizeof(suff) - 1)) {
1701                 i++;
1702                 num /= 1000;
1703         }
1704
1705         sprintf(dst, "%'8Lu%c", num, suff[i]);
1706         return dst;
1707 }
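
/*
 * A minimal usage sketch for size_cnv() (illustrative, not called by
 * blkparse itself). With in_kb set the suffix starts at 'K', and the
 * scaling steps are powers of 1000, not 1024:
 */
static __attribute__((__unused__)) void size_cnv_example(void)
{
        char buf[32];

        size_cnv(buf, 512, 1);          /* "     512K" */
        size_cnv(buf, 2000000, 1);      /* "   2,000M" (en_US grouping) */
        size_cnv(buf, 1234, 0);         /* "   1,234" - plain event count */
}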
1708
1709 static void dump_io_stats(struct per_dev_info *pdi, struct io_stats *ios,
1710                           char *msg)
1711 {
1712         static char x[256], y[256];
1713
1714         fprintf(ofp, "%s\n", msg);
1715
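        /*
         * The *_kb counters below hold whole KiB; the matching *_b
         * counters carry the leftover byte counts (as accounted
         * elsewhere in this file), folded back in as bytes >> 10.
         */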
1716         fprintf(ofp, " Reads Queued:    %s, %siB\t",
1717                         size_cnv(x, ios->qreads, 0),
1718                         size_cnv(y, ios->qread_kb + (ios->qread_b>>10), 1));
1719         fprintf(ofp, " Writes Queued:    %s, %siB\n",
1720                         size_cnv(x, ios->qwrites, 0),
1721                         size_cnv(y, ios->qwrite_kb + (ios->qwrite_b>>10), 1));
1722         fprintf(ofp, " Read Dispatches: %s, %siB\t",
1723                         size_cnv(x, ios->ireads, 0),
1724                         size_cnv(y, ios->iread_kb + (ios->iread_b>>10), 1));
1725         fprintf(ofp, " Write Dispatches: %s, %siB\n",
1726                         size_cnv(x, ios->iwrites, 0),
1727                         size_cnv(y, ios->iwrite_kb + (ios->iwrite_b>>10), 1));
1728         fprintf(ofp, " Reads Requeued:  %s\t\t", size_cnv(x, ios->rrqueue, 0));
1729         fprintf(ofp, " Writes Requeued:  %s\n", size_cnv(x, ios->wrqueue, 0));
1730         fprintf(ofp, " Reads Completed: %s, %siB\t",
1731                         size_cnv(x, ios->creads, 0),
1732                         size_cnv(y, ios->cread_kb + (ios->cread_b>>10), 1));
1733         fprintf(ofp, " Writes Completed: %s, %siB\n",
1734                         size_cnv(x, ios->cwrites, 0),
1735                         size_cnv(y, ios->cwrite_kb + (ios->cwrite_b>>10), 1));
1736         fprintf(ofp, " Read Merges:     %s, %siB\t",
1737                         size_cnv(x, ios->mreads, 0),
1738                         size_cnv(y, ios->mread_kb + (ios->mread_b>>10), 1));
1739         fprintf(ofp, " Write Merges:     %s, %siB\n",
1740                         size_cnv(x, ios->mwrites, 0),
1741                         size_cnv(y, ios->mwrite_kb + (ios->mwrite_b>>10), 1));
1742         if (pdi) {
1743                 fprintf(ofp, " Read depth:      %'8u%8c\t", pdi->max_depth[0], ' ');
1744                 fprintf(ofp, " Write depth:      %'8u\n", pdi->max_depth[1]);
1745         }
1746         if (ios->qreads_pc || ios->qwrites_pc || ios->ireads_pc || ios->iwrites_pc ||
1747             ios->rrqueue_pc || ios->wrqueue_pc || ios->creads_pc || ios->cwrites_pc) {
1748                 fprintf(ofp, " PC Reads Queued: %s, %siB\t",
1749                         size_cnv(x, ios->qreads_pc, 0),
1750                         size_cnv(y,
1751                                 ios->qread_kb_pc + (ios->qread_b_pc>>10), 1));
1752                 fprintf(ofp, " PC Writes Queued: %s, %siB\n",
1753                         size_cnv(x, ios->qwrites_pc, 0),
1754                         size_cnv(y,
1755                                 ios->qwrite_kb_pc + (ios->qwrite_b_pc>>10), 1));
1756                 fprintf(ofp, " PC Read Disp.:   %s, %siB\t",
1757                         size_cnv(x, ios->ireads_pc, 0),
1758                         size_cnv(y,
1759                                 ios->iread_kb_pc + (ios->iread_b_pc>>10), 1));
1760                 fprintf(ofp, " PC Write Disp.:   %s, %siB\n",
1761                         size_cnv(x, ios->iwrites_pc, 0),
1762                         size_cnv(y,
1763                                 ios->iwrite_kb_pc + (ios->iwrite_b_pc>>10),
1764                                 1));
1765                 fprintf(ofp, " PC Reads Req.:   %s\t\t", size_cnv(x, ios->rrqueue_pc, 0));
1766                 fprintf(ofp, " PC Writes Req.:   %s\n", size_cnv(x, ios->wrqueue_pc, 0));
1767                 fprintf(ofp, " PC Reads Compl.: %s\t\t", size_cnv(x, ios->creads_pc, 0));
1768                 fprintf(ofp, " PC Writes Compl.: %s\n", size_cnv(x, ios->cwrites_pc, 0));
1769         }
1770         fprintf(ofp, " IO unplugs:      %'8lu%8c\t", ios->io_unplugs, ' ');
1771         fprintf(ofp, " Timer unplugs:    %'8lu\n", ios->timer_unplugs);
1772 }
1773
1774 static void dump_wait_stats(struct per_process_info *ppi)
1775 {
1776         unsigned long rawait = ppi->longest_allocation_wait[0] / 1000;
1777         unsigned long rdwait = ppi->longest_dispatch_wait[0] / 1000;
1778         unsigned long rcwait = ppi->longest_completion_wait[0] / 1000;
1779         unsigned long wawait = ppi->longest_allocation_wait[1] / 1000;
1780         unsigned long wdwait = ppi->longest_dispatch_wait[1] / 1000;
1781         unsigned long wcwait = ppi->longest_completion_wait[1] / 1000;
1782
1783         fprintf(ofp, " Allocation wait: %'8lu%8c\t", rawait, ' ');
1784         fprintf(ofp, " Allocation wait:  %'8lu\n", wawait);
1785         fprintf(ofp, " Dispatch wait:   %'8lu%8c\t", rdwait, ' ');
1786         fprintf(ofp, " Dispatch wait:    %'8lu\n", wdwait);
1787         fprintf(ofp, " Completion wait: %'8lu%8c\t", rcwait, ' ');
1788         fprintf(ofp, " Completion wait:  %'8lu\n", wcwait);
1789 }
1790
1791 static int ppi_name_compare(const void *p1, const void *p2)
1792 {
1793         struct per_process_info *ppi1 = *((struct per_process_info **) p1);
1794         struct per_process_info *ppi2 = *((struct per_process_info **) p2);
1795         int res;
1796
1797         res = strverscmp(ppi1->ppm->comm, ppi2->ppm->comm);
1798         if (!res)
1799                 res = (ppi1->ppm->pid > ppi2->ppm->pid) - (ppi1->ppm->pid < ppi2->ppm->pid);
1800
1801         return res;
1802 }
1803
1804 static int ppi_event_compare(const void *p1, const void *p2)
1805 {
1806         struct per_process_info *ppi1 = *((struct per_process_info **) p1);
1807         struct per_process_info *ppi2 = *((struct per_process_info **) p2);
1808         struct io_stats *ios1 = &ppi1->io_stats;
1809         struct io_stats *ios2 = &ppi2->io_stats;
1810         unsigned long io1, io2;
1811         unsigned long long kb1,kb2;
1812         int sort_by_kb = 1;
1813
1814         io1 = io2 = 0;
1815         kb1 = kb2 = 0;
1816
1817         switch (per_process_stats_event) {
1818         case SORT_PROG_EVENT_QKB: /* KB: Queued read and write */
1819                 kb1 = ios1->qwrite_kb + (ios1->qwrite_b>>10) +
1820                         ios1->qread_kb + (ios1->qread_b>>10);
1821                 kb2 = ios2->qwrite_kb + (ios2->qwrite_b>>10) +
1822                         ios2->qread_kb + (ios2->qread_b>>10);
1823                 break;
1824         case SORT_PROG_EVENT_RKB: /* KB: Queued Read */
1825                 kb1 = ios1->qread_kb + (ios1->qread_b>>10);
1826                 kb2 = ios2->qread_kb + (ios2->qread_b>>10);
1827                 break;
1828         case SORT_PROG_EVENT_WKB: /* KB: Queued Write */
1829                 kb1 = ios1->qwrite_kb + (ios1->qwrite_b>>10);
1830                 kb2 = ios2->qwrite_kb + (ios2->qwrite_b>>10);
1831                 break;
1832         case SORT_PROG_EVENT_CKB: /* KB: Complete */
1833                 kb1 = ios1->cwrite_kb + (ios1->cwrite_b>>10) +
1834                         ios1->cread_kb + (ios1->cread_b>>10);
1835                 kb2 = ios2->cwrite_kb + (ios2->cwrite_b>>10) +
1836                         ios2->cread_kb + (ios2->cread_b>>10);
1837                 break;
1838         case SORT_PROG_EVENT_QIO: /* IO: Queued read and write */
1839                 sort_by_kb = 0;
1840                 io1 = ios1->qreads + ios1->qwrites;
1841                 io2 = ios2->qreads + ios2->qwrites;
1842                 break;
1843         case SORT_PROG_EVENT_RIO: /* IO: Queued Read */
1844                 sort_by_kb = 0;
1845                 io1 = ios1->qreads;
1846                 io2 = ios2->qreads;
1847                 break;
1848         case SORT_PROG_EVENT_WIO: /* IO: Queued Write */
1849                 sort_by_kb = 0;
1850                 io1 = ios1->qwrites;
1851                 io2 = ios2->qwrites;
1852                 break;
1853         case SORT_PROG_EVENT_CIO: /* IO: Complete */
1854                 sort_by_kb = 0;
1855                 io1 = ios1->creads + ios1->cwrites;
1856                 io2 = ios2->creads + ios2->cwrites;
1857                 break;
1858         }
1859
1860
1861         /* compare kb */
1862         if (sort_by_kb) {
1863                 if (kb1 > kb2)
1864                         return 1;
1865                 else if (kb1 == kb2)
1866                         return 0;
1867                 return -1;
1868         }
1869
1870         /* compare io */
1871         if (io1 > io2)
1872                 return 1;
1873         else if (io1 == io2)
1874                 return 0;
1875         return -1;
1876 }
1877
1878 static int ppi_compare(const void *p1, const void *p2)
1879 {
1880         if (per_process_stats_event == SORT_PROG_EVENT_N)
1881                 return ppi_name_compare(p1, p2);
1882
1883         return ppi_event_compare(p1, p2);
1884 }
1885
1886 static void sort_process_list(void)
1887 {
1888         struct per_process_info **ppis;
1889         struct per_process_info *ppi;
1890         int i = 0;
1891
1892         ppis = malloc(ppi_list_entries * sizeof(struct per_process_info *));
1893
1894         ppi = ppi_list;
1895         while (ppi) {
1896                 ppis[i++] = ppi;
1897                 ppi = ppi->list_next;
1898         }
1899
1900         qsort(ppis, ppi_list_entries, sizeof(ppi), ppi_compare);
1901
1902         i = ppi_list_entries - 1;
1903         ppi_list = NULL;
1904         while (i >= 0) {
1905                 ppi = ppis[i];
1906
1907                 ppi->list_next = ppi_list;
1908                 ppi_list = ppi;
1909                 i--;
1910         }
1911
1912         free(ppis);
1913 }
1914
1915 static void show_process_stats(void)
1916 {
1917         struct per_process_info *ppi;
1918
1919         sort_process_list();
1920
1921         ppi = ppi_list;
1922         while (ppi) {
1923                 struct process_pid_map *ppm = ppi->ppm;
1924                 char name[64];
1925
1926                 if (ppi->more_than_one)
1927                         sprintf(name, "%s (%u, ...)", ppm->comm, ppm->pid);
1928                 else
1929                         sprintf(name, "%s (%u)", ppm->comm, ppm->pid);
1930
1931                 dump_io_stats(NULL, &ppi->io_stats, name);
1932                 dump_wait_stats(ppi);
1933                 ppi = ppi->list_next;
1934         }
1935
1936         fprintf(ofp, "\n");
1937 }
1938
1939 static void show_device_and_cpu_stats(void)
1940 {
1941         struct per_dev_info *pdi;
1942         struct per_cpu_info *pci;
1943         struct io_stats total, *ios;
1944         unsigned long long rrate, wrate, msec;
1945         int i, j, pci_events;
1946         char line[3 + 8/*cpu*/ + 2 + 32/*dev*/ + 3];
1947         char name[32];
1948         double ratio;
1949
1950         for (pdi = devices, i = 0; i < ndevices; i++, pdi++) {
1951
1952                 memset(&total, 0, sizeof(total));
1953                 pci_events = 0;
1954
1955                 if (i > 0)
1956                         fprintf(ofp, "\n");
1957
1958                 for (pci = pdi->cpus, j = 0; j < pdi->ncpus; j++, pci++) {
1959                         if (!pci->nelems)
1960                                 continue;
1961
1962                         ios = &pci->io_stats;
1963                         total.qreads += ios->qreads;
1964                         total.qwrites += ios->qwrites;
1965                         total.creads += ios->creads;
1966                         total.cwrites += ios->cwrites;
1967                         total.mreads += ios->mreads;
1968                         total.mwrites += ios->mwrites;
1969                         total.ireads += ios->ireads;
1970                         total.iwrites += ios->iwrites;
1971                         total.rrqueue += ios->rrqueue;
1972                         total.wrqueue += ios->wrqueue;
1973                         total.qread_kb += ios->qread_kb;
1974                         total.qwrite_kb += ios->qwrite_kb;
1975                         total.cread_kb += ios->cread_kb;
1976                         total.cwrite_kb += ios->cwrite_kb;
1977                         total.iread_kb += ios->iread_kb;
1978                         total.iwrite_kb += ios->iwrite_kb;
1979                         total.mread_kb += ios->mread_kb;
1980                         total.mwrite_kb += ios->mwrite_kb;
1981                         total.qread_b += ios->qread_b;
1982                         total.qwrite_b += ios->qwrite_b;
1983                         total.cread_b += ios->cread_b;
1984                         total.cwrite_b += ios->cwrite_b;
1985                         total.iread_b += ios->iread_b;
1986                         total.iwrite_b += ios->iwrite_b;
1987                         total.mread_b += ios->mread_b;
1988                         total.mwrite_b += ios->mwrite_b;
1989
1990                         total.qreads_pc += ios->qreads_pc;
1991                         total.qwrites_pc += ios->qwrites_pc;
1992                         total.creads_pc += ios->creads_pc;
1993                         total.cwrites_pc += ios->cwrites_pc;
1994                         total.ireads_pc += ios->ireads_pc;
1995                         total.iwrites_pc += ios->iwrites_pc;
1996                         total.rrqueue_pc += ios->rrqueue_pc;
1997                         total.wrqueue_pc += ios->wrqueue_pc;
1998                         total.qread_kb_pc += ios->qread_kb_pc;
1999                         total.qwrite_kb_pc += ios->qwrite_kb_pc;
2000                         total.iread_kb_pc += ios->iread_kb_pc;
2001                         total.iwrite_kb_pc += ios->iwrite_kb_pc;
2002                         total.qread_b_pc += ios->qread_b_pc;
2003                         total.qwrite_b_pc += ios->qwrite_b_pc;
2004                         total.iread_b_pc += ios->iread_b_pc;
2005                         total.iwrite_b_pc += ios->iwrite_b_pc;
2006
2007                         total.timer_unplugs += ios->timer_unplugs;
2008                         total.io_unplugs += ios->io_unplugs;
2009
2010                         snprintf(line, sizeof(line) - 1, "CPU%d (%s):",
2011                                  j, get_dev_name(pdi, name, sizeof(name)));
2012                         dump_io_stats(pdi, ios, line);
2013                         pci_events++;
2014                 }
2015
2016                 if (pci_events > 1) {
2017                         fprintf(ofp, "\n");
2018                         snprintf(line, sizeof(line) - 1, "Total (%s):",
2019                                  get_dev_name(pdi, name, sizeof(name)));
2020                         dump_io_stats(NULL, &total, line);
2021                 }
2022
2023                 wrate = rrate = 0;
2024                 msec = (pdi->last_reported_time - pdi->first_reported_time) / 1000000;
2025                 if (msec) {
2026                         rrate = ((1000 * total.cread_kb) + total.cread_b) /
2027                                                                         msec;
2028                         wrate = ((1000 * total.cwrite_kb) + total.cwrite_b) /
2029                                                                         msec;
2030                 }
2031
2032                 fprintf(ofp, "\nThroughput (R/W): %'LuKiB/s / %'LuKiB/s\n",
2033                         rrate, wrate);
2034                 fprintf(ofp, "Events (%s): %'Lu entries\n",
2035                         get_dev_name(pdi, line, sizeof(line)), pdi->events);
2036
2037                 collect_pdi_skips(pdi);
2038                 if (!pdi->skips && !pdi->events)
2039                         ratio = 0.0;
2040                 else
2041                         ratio = 100.0 * ((double)pdi->seq_skips /
2042                                         (double)(pdi->events + pdi->seq_skips));
2043                 fprintf(ofp, "Skips: %'lu forward (%'llu - %5.1lf%%)\n",
2044                         pdi->skips, pdi->seq_skips, ratio);
2045         }
2046 }
2047
2048 static void correct_abs_start_time(void)
2049 {
2050         long delta = genesis_time - start_timestamp;
2051
2052         abs_start_time.tv_sec  += SECONDS(delta);
2053         abs_start_time.tv_nsec += NANO_SECONDS(delta);
2054         if (abs_start_time.tv_nsec < 0) {
2055                 abs_start_time.tv_nsec += 1000000000;
2056                 abs_start_time.tv_sec -= 1;
2057         } else if (abs_start_time.tv_nsec >= 1000000000) {
2059                 abs_start_time.tv_nsec -= 1000000000;
2060                 abs_start_time.tv_sec += 1;
2061         }
2062 }
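
/*
 * A worked example, assuming SECONDS()/NANO_SECONDS() split a nanosecond
 * count at 10^9: with delta = 1,500,000,000ns and abs_start_time =
 * {10s, 700,000,000ns}, the additions give {11s, 1,200,000,000ns},
 * which the overflow branch above normalizes to {12s, 200,000,000ns}.
 */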
2063
2064 static void find_genesis(void)
2065 {
2066         struct trace *t = trace_list;
2067
2068         genesis_time = -1ULL;
2069         while (t != NULL) {
2070                 if (t->bit->time < genesis_time)
2071                         genesis_time = t->bit->time;
2072
2073                 t = t->next;
2074         }
2075
2076         /* The time stamp record will usually be the first
2077          * record in the trace, but not always.
2078          */
2079         if (start_timestamp
2080          && start_timestamp != genesis_time) {
2081                 correct_abs_start_time();
2082         }
2083 }
2084
2085 static inline int check_stopwatch(struct blk_io_trace *bit)
2086 {
2087         if (bit->time < stopwatch_end &&
2088             bit->time >= stopwatch_start)
2089                 return 0;
2090
2091         return 1;
2092 }
2093
2094 /*
2095  * sort the trace list into the rbtree; pass back the youngest (smallest) timestamp read via *youngest
2096  */
2097 static int sort_entries(unsigned long long *youngest)
2098 {
2099         struct per_dev_info *pdi = NULL;
2100         struct per_cpu_info *pci = NULL;
2101         struct trace *t;
2102
2103         if (!genesis_time)
2104                 find_genesis();
2105
2106         *youngest = 0;
2107         while ((t = trace_list) != NULL) {
2108                 struct blk_io_trace *bit = t->bit;
2109
2110                 trace_list = t->next;
2111
2112                 bit->time -= genesis_time;
2113
2114                 if (bit->time < *youngest || !*youngest)
2115                         *youngest = bit->time;
2116
2117                 if (!pdi || pdi->dev != bit->device) {
2118                         pdi = get_dev_info(bit->device);
2119                         pci = NULL;
2120                 }
2121
2122                 if (!pci || pci->cpu != bit->cpu)
2123                         pci = get_cpu_info(pdi, bit->cpu);
2124
2125                 if (bit->sequence < pci->smallest_seq_read)
2126                         pci->smallest_seq_read = bit->sequence;
2127
2128                 if (check_stopwatch(bit)) {
2129                         bit_free(bit);
2130                         t_free(t);
2131                         continue;
2132                 }
2133
2134                 if (trace_rb_insert_sort(t))
2135                         return -1;
2136         }
2137
2138         return 0;
2139 }
2140
2141 /*
2142  * to continue, we must have traces from all online cpus in the tree
2143  */
2144 static int check_cpu_map(struct per_dev_info *pdi)
2145 {
2146         unsigned long *cpu_map;
2147         struct rb_node *n;
2148         struct trace *__t;
2149         unsigned int i;
2150         int ret, cpu;
2151
2152         /*
2153          * create a map of the cpus we have traces for
2154          */
2155         cpu_map = calloc(pdi->cpu_map_max / CPUS_PER_LONG, sizeof(long));
2157         n = rb_first(&rb_sort_root);
2158         while (n) {
2159                 __t = rb_entry(n, struct trace, rb_node);
2160                 cpu = __t->bit->cpu;
2161
2162                 cpu_map[CPU_IDX(cpu)] |= (1UL << CPU_BIT(cpu));
2163                 n = rb_next(n);
2164         }
2165
2166         /*
2167          * we can't continue if pdi->cpu_map has entries set that we don't
2168          * have in the sort rbtree. the opposite is not a problem, though
2169          */
2170         ret = 0;
2171         for (i = 0; i < pdi->cpu_map_max / CPUS_PER_LONG; i++) {
2172                 if (pdi->cpu_map[i] & ~(cpu_map[i])) {
2173                         ret = 1;
2174                         break;
2175                 }
2176         }
2177
2178         free(cpu_map);
2179         return ret;
2180 }
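
/*
 * A minimal sketch of the bitmap indexing used above (illustrative,
 * assuming CPU_IDX(c) is c / CPUS_PER_LONG and CPU_BIT(c) is
 * c % CPUS_PER_LONG, so with 64-bit longs cpu 67 lands in word 1,
 * bit 3):
 */
static __attribute__((__unused__)) int cpu_is_set(unsigned long *map, int cpu)
{
        return (map[CPU_IDX(cpu)] >> CPU_BIT(cpu)) & 1UL;
}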
2181
2182 static int check_sequence(struct per_dev_info *pdi, struct trace *t, int force)
2183 {
2184         struct blk_io_trace *bit = t->bit;
2185         unsigned long expected_sequence;
2186         struct per_cpu_info *pci;
2187         struct trace *__t;
2188
2189         pci = get_cpu_info(pdi, bit->cpu);
2190         expected_sequence = pci->last_sequence + 1;
2191
2192         if (!expected_sequence) {
2193                 /*
2194                  * 1 should be the first entry, just allow it
2195                  */
2196                 if (bit->sequence == 1)
2197                         return 0;
2198                 if (bit->sequence == pci->smallest_seq_read)
2199                         return 0;
2200
2201                 return check_cpu_map(pdi);
2202         }
2203
2204         if (bit->sequence == expected_sequence)
2205                 return 0;
2206
2207         /*
2208          * we may not have seen that sequence yet. if we are not doing
2209          * the final run, break and wait for more entries.
2210          */
2211         if (expected_sequence < pci->smallest_seq_read) {
2212                 __t = trace_rb_find_last(pdi, pci, expected_sequence);
2213                 if (!__t)
2214                         goto skip;
2215
2216                 __put_trace_last(pdi, __t);
2217                 return 0;
2218         } else if (!force) {
2219                 return 1;
2220         } else {
2221 skip:
2222                 if (check_current_skips(pci, bit->sequence))
2223                         return 0;
2224
2225                 if (expected_sequence < bit->sequence)
2226                         insert_skip(pci, expected_sequence, bit->sequence - 1);
2227                 return 0;
2228         }
2229 }
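
/*
 * Example: if pci->last_sequence is 4 and the next trace carries
 * sequence 8, a non-final pass returns 1 so we wait for the missing
 * entries, while the final (force) pass records the gap as a skip of
 * sequences 5-7 via insert_skip() and lets the trace through.
 */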
2230
2231 static void show_entries_rb(int force)
2232 {
2233         struct per_dev_info *pdi = NULL;
2234         struct per_cpu_info *pci = NULL;
2235         struct blk_io_trace *bit;
2236         struct rb_node *n;
2237         struct trace *t;
2238
2239         while ((n = rb_first(&rb_sort_root)) != NULL) {
2240                 if (is_done() && !force && !pipeline)
2241                         break;
2242
2243                 t = rb_entry(n, struct trace, rb_node);
2244                 bit = t->bit;
2245
2246                 if (read_sequence - t->read_sequence < 1 && !force)
2247                         break;
2248
2249                 if (!pdi || pdi->dev != bit->device) {
2250                         pdi = get_dev_info(bit->device);
2251                         pci = NULL;
2252                 }
2253
2254                 if (!pdi) {
2255                         fprintf(stderr, "Unknown device ID? (%d,%d)\n",
2256                                 MAJOR(bit->device), MINOR(bit->device));
2257                         break;
2258                 }
2259
2260                 if (bit->action != BLK_TN_MESSAGE &&
2261                     check_sequence(pdi, t, force))
2262                         break;
2263
2264                 if (!force && bit->time > last_allowed_time)
2265                         break;
2266
2267                 check_time(pdi, bit);
2268
2269                 if (!pci || pci->cpu != bit->cpu)
2270                         pci = get_cpu_info(pdi, bit->cpu);
2271
2272                 if (bit->action != BLK_TN_MESSAGE)
2273                         pci->last_sequence = bit->sequence;
2274
2275                 pci->nelems++;
2276
2277                 if (bit->action & (act_mask << BLK_TC_SHIFT))
2278                         dump_trace(bit, pci, pdi);
2279
2280                 put_trace(pdi, t);
2281         }
2282 }
2283
2284 static int read_data(int fd, void *buffer, int bytes, int block, int *fdblock)
2285 {
2286         int ret, bytes_left, fl;
2287         void *p;
2288
2289         if (block != *fdblock) {
2290                 fl = fcntl(fd, F_GETFL);
2291
2292                 if (!block) {
2293                         *fdblock = 0;
2294                         fcntl(fd, F_SETFL, fl | O_NONBLOCK);
2295                 } else {
2296                         *fdblock = 1;
2297                         fcntl(fd, F_SETFL, fl & ~O_NONBLOCK);
2298                 }
2299         }
2300
2301         bytes_left = bytes;
2302         p = buffer;
2303         while (bytes_left > 0) {
2304                 ret = read(fd, p, bytes_left);
2305                 if (!ret)
2306                         return 1;
2307                 else if (ret < 0) {
2308                         if (errno != EAGAIN) {
2309                                 perror("read");
2310                                 return -1;
2311                         }
2312
2313                         /*
2314                          * never do partial reads. we can return if we
2315                          * didn't read anything and we should not block,
2316                          * otherwise wait for data
2317                          */
2318                         if ((bytes_left == bytes) && !block)
2319                                 return 1;
2320
2321                         usleep(10);
2322                         continue;
2323                 } else {
2324                         p += ret;
2325                         bytes_left -= ret;
2326                 }
2327         }
2328
2329         return 0;
2330 }
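
/*
 * A minimal sketch of the read_data() contract (illustrative, not part
 * of the parser proper): 0 means the whole buffer was filled, 1 means
 * EOF (or no data yet on a non-blocking descriptor), and -1 means a
 * hard read error.
 */
static __attribute__((__unused__)) int read_one_header(int fd, int *fdblock,
                                                       struct blk_io_trace *bit)
{
        int ret = read_data(fd, bit, sizeof(*bit), 1, fdblock);

        if (ret < 0)
                return -1;      /* read() failed, perror() already issued */
        if (ret > 0)
                return 0;       /* end of stream before a full record */
        return 1;               /* *bit now holds one complete trace header */
}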
2331
2332 static inline __u16 get_pdulen(struct blk_io_trace *bit)
2333 {
2334         if (data_is_native)
2335                 return bit->pdu_len;
2336
2337         return __bswap_16(bit->pdu_len);
2338 }
2339
2340 static inline __u32 get_magic(struct blk_io_trace *bit)
2341 {
2342         if (data_is_native)
2343                 return bit->magic;
2344
2345         return __bswap_32(bit->magic);
2346 }
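
/*
 * A small sketch of the magic check performed by the readers below,
 * assuming BLK_IO_TRACE_MAGIC keeps the format version in its low byte
 * (hence the 0xffffff00 mask):
 */
static __attribute__((__unused__)) int magic_ok(struct blk_io_trace *bit)
{
        return (get_magic(bit) & 0xffffff00) == BLK_IO_TRACE_MAGIC;
}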
2347
2348 static int read_events(int fd, int always_block, int *fdblock)
2349 {
2350         struct per_dev_info *pdi = NULL;
2351         int events = 0;
2352
2353         while (!is_done() && events < rb_batch) {
2354                 struct blk_io_trace *bit;
2355                 struct trace *t;
2356                 int pdu_len, should_block, ret;
2357                 __u32 magic;
2358
2359                 bit = bit_alloc();
2360
2361                 should_block = !events || always_block;
2362
2363                 ret = read_data(fd, bit, sizeof(*bit), should_block, fdblock);
2364                 if (ret) {
2365                         bit_free(bit);
2366                         if (!events && ret < 0)
2367                                 events = ret;
2368                         break;
2369                 }
2370
2371                 /*
2372                  * look at first trace to check whether we need to convert
2373                  * data in the future
2374                  */
2375                 if (data_is_native == -1 && check_data_endianness(bit->magic))
2376                         break;
2377
2378                 magic = get_magic(bit);
2379                 if ((magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
2380                         fprintf(stderr, "Bad magic %x\n", magic);
2381                         break;
2382                 }
2383
2384                 pdu_len = get_pdulen(bit);
2385                 if (pdu_len) {
2386                         void *ptr = realloc(bit, sizeof(*bit) + pdu_len);
2387
2388                         if (read_data(fd, ptr + sizeof(*bit), pdu_len, 1, fdblock)) {
2389                                 bit_free(ptr);
2390                                 break;
2391                         }
2392
2393                         bit = ptr;
2394                 }
2395
2396                 trace_to_cpu(bit);
2397
2398                 if (verify_trace(bit)) {
2399                         bit_free(bit);
2400                         continue;
2401                 }
2402
2403                 /*
2404                  * not a real trace, so grab and handle it here
2405                  */
2406                 if (bit->action & BLK_TC_ACT(BLK_TC_NOTIFY) && bit->action != BLK_TN_MESSAGE) {
2407                         handle_notify(bit);
2408                         output_binary(bit, sizeof(*bit) + bit->pdu_len);
2409                         continue;
2410                 }
2411
2412                 t = t_alloc();
2413                 memset(t, 0, sizeof(*t));
2414                 t->bit = bit;
2415                 t->read_sequence = read_sequence;
2416
2417                 t->next = trace_list;
2418                 trace_list = t;
2419
2420                 if (!pdi || pdi->dev != bit->device)
2421                         pdi = get_dev_info(bit->device);
2422
2423                 if (bit->time > pdi->last_read_time)
2424                         pdi->last_read_time = bit->time;
2425
2426                 events++;
2427         }
2428
2429         return events;
2430 }
2431
2432 /*
2433  * Managing input streams
2434  */
2435
2436 struct ms_stream {
2437         struct ms_stream *next;
2438         struct trace *first, *last;
2439         struct per_dev_info *pdi;
2440         unsigned int cpu;
2441 };
2442
2443 #define MS_HASH(d, c) ((MAJOR(d) & 0xff) ^ (MINOR(d) & 0xff) ^ ((c) & 0xff))
2444
2445 struct ms_stream *ms_head;
2446 struct ms_stream *ms_hash[256];
2447
2448 static void ms_sort(struct ms_stream *msp);
2449 static int ms_prime(struct ms_stream *msp);
2450
2451 static inline struct trace *ms_peek(struct ms_stream *msp)
2452 {
2453         return (msp == NULL) ? NULL : msp->first;
2454 }
2455
2456 static inline __u64 ms_peek_time(struct ms_stream *msp)
2457 {
2458         return ms_peek(msp)->bit->time;
2459 }
2460
2461 static inline void ms_resort(struct ms_stream *msp)
2462 {
2463         if (msp->next && ms_peek_time(msp) > ms_peek_time(msp->next)) {
2464                 ms_head = msp->next;
2465                 msp->next = NULL;
2466                 ms_sort(msp);
2467         }
2468 }
2469
2470 static inline void ms_deq(struct ms_stream *msp)
2471 {
2472         msp->first = msp->first->next;
2473         if (!msp->first) {
2474                 msp->last = NULL;
2475                 if (!ms_prime(msp)) {
2476                         ms_head = msp->next;
2477                         msp->next = NULL;
2478                         return;
2479                 }
2480         }
2481
2482         ms_resort(msp);
2483 }
2484
2485 static void ms_sort(struct ms_stream *msp)
2486 {
2487         __u64 msp_t = ms_peek_time(msp);
2488         struct ms_stream *this_msp = ms_head;
2489
2490         if (this_msp == NULL)
2491                 ms_head = msp;
2492         else if (msp_t < ms_peek_time(this_msp)) {
2493                 msp->next = this_msp;
2494                 ms_head = msp;
2495         }
2496         else {
2497                 while (this_msp->next && ms_peek_time(this_msp->next) < msp_t)
2498                         this_msp = this_msp->next;
2499
2500                 msp->next = this_msp->next;
2501                 this_msp->next = msp;
2502         }
2503 }
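
/*
 * Invariant maintained by ms_sort()/ms_resort(): ms_head is a singly
 * linked list of streams ordered by the timestamp of each stream's
 * first pending trace, so handle() can always consume ms_head and only
 * that one stream needs re-sorting afterwards.
 */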
2504
2505 static int ms_prime(struct ms_stream *msp)
2506 {
2507         __u32 magic;
2508         unsigned int i;
2509         struct trace *t;
2510         struct per_dev_info *pdi = msp->pdi;
2511         struct per_cpu_info *pci = get_cpu_info(pdi, msp->cpu);
2512         struct blk_io_trace *bit = NULL;
2513         int ret, pdu_len, ndone = 0;
2514
2515         for (i = 0; !is_done() && pci->fd >= 0 && i < rb_batch; i++) {
2516                 bit = bit_alloc();
2517                 ret = read_data(pci->fd, bit, sizeof(*bit), 1, &pci->fdblock);
2518                 if (ret)
2519                         goto err;
2520
2521                 if (data_is_native == -1 && check_data_endianness(bit->magic))
2522                         goto err;
2523
2524                 magic = get_magic(bit);
2525                 if ((magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
2526                         fprintf(stderr, "Bad magic %x\n", magic);
2527                         goto err;
2529                 }
2530
2531                 pdu_len = get_pdulen(bit);
2532                 if (pdu_len) {
2533                         void *ptr = realloc(bit, sizeof(*bit) + pdu_len);
2534                         ret = read_data(pci->fd, ptr + sizeof(*bit), pdu_len,
2535                                                              1, &pci->fdblock);
2536                         if (ret) {
2537                                 free(ptr);
2538                                 bit = NULL;
2539                                 goto err;
2540                         }
2541
2542                         bit = ptr;
2543                 }
2544
2545                 trace_to_cpu(bit);
2546                 if (verify_trace(bit))
2547                         goto err;
2548
2549                 if (bit->cpu != pci->cpu) {
2550                         fprintf(stderr, "cpu %d trace file has a trace for cpu %d\n", pci->cpu, bit->cpu);
2551                         bit_free(bit);
2552                         continue;
2553                 }
2554
2555                 if (bit->action & BLK_TC_ACT(BLK_TC_NOTIFY) && bit->action != BLK_TN_MESSAGE) {
2556                         handle_notify(bit);
2557                         output_binary(bit, sizeof(*bit) + bit->pdu_len);
2558                         bit_free(bit);
2559
2560                         i -= 1;
2561                         continue;
2562                 }
2563
2564                 if (bit->time > pdi->last_read_time)
2565                         pdi->last_read_time = bit->time;
2566
2567                 t = t_alloc();
2568                 memset(t, 0, sizeof(*t));
2569                 t->bit = bit;
2570
2571                 if (msp->first == NULL)
2572                         msp->first = msp->last = t;
2573                 else {
2574                         msp->last->next = t;
2575                         msp->last = t;
2576                 }
2577
2578                 ndone++;
2579         }
2580
2581         return ndone;
2582
2583 err:
2584         if (bit) bit_free(bit);
2585
2586         cpu_mark_offline(pdi, pci->cpu);
2587         close(pci->fd);
2588         pci->fd = -1;
2589
2590         return ndone;
2591 }
2592
2593 static struct ms_stream *ms_alloc(struct per_dev_info *pdi, int cpu)
2594 {
2595         struct ms_stream *msp = malloc(sizeof(*msp));
2596
2597         msp->next = NULL;
2598         msp->first = msp->last = NULL;
2599         msp->pdi = pdi;
2600         msp->cpu = cpu;
2601
2602         if (ms_prime(msp))
2603                 ms_sort(msp);
2604
2605         return msp;
2606 }
2607
2608 static int setup_file(struct per_dev_info *pdi, int cpu)
2609 {
2610         int len = 0;
2611         struct stat st;
2612         char *p, *dname;
2613         struct per_cpu_info *pci = get_cpu_info(pdi, cpu);
2614
2615         pci->cpu = cpu;
2616         pci->fdblock = -1;
2617
2618         p = strdup(pdi->name);
2619         dname = dirname(p);
2620         if (strcmp(dname, ".")) {
2621                 input_dir = dname;
2622                 p = strdup(pdi->name);
2623                 strcpy(pdi->name, basename(p));
2624         }
2625         free(p);
2626
2627         if (input_dir)
2628                 len = sprintf(pci->fname, "%s/", input_dir);
2629
2630         snprintf(pci->fname + len, sizeof(pci->fname)-1-len,
2631                  "%s.blktrace.%d", pdi->name, pci->cpu);
2632         if (stat(pci->fname, &st) < 0)
2633                 return 0;
2634         if (!st.st_size)
2635                 return 1;
2636
2637         pci->fd = open(pci->fname, O_RDONLY);
2638         if (pci->fd < 0) {
2639                 perror(pci->fname);
2640                 return 0;
2641         }
2642
2643         printf("Input file %s added\n", pci->fname);
2644         cpu_mark_online(pdi, pci->cpu);
2645
2646         pdi->nfiles++;
2647         ms_alloc(pdi, pci->cpu);
2648
2649         return 1;
2650 }
2651
2652 static int handle(struct ms_stream *msp)
2653 {
2654         struct trace *t;
2655         struct per_dev_info *pdi;
2656         struct per_cpu_info *pci;
2657         struct blk_io_trace *bit;
2658
2659         t = ms_peek(msp);
2660
2661         bit = t->bit;
2662         pdi = msp->pdi;
2663         pci = get_cpu_info(pdi, msp->cpu);
2664         pci->nelems++;
2665         bit->time -= genesis_time;
2666
2667         if (t->bit->time > stopwatch_end)
2668                 return 0;
2669
2670         pdi->last_reported_time = bit->time;
2671                 if ((bit->action & (act_mask << BLK_TC_SHIFT)) &&
2672             t->bit->time >= stopwatch_start)
2673                 dump_trace(bit, pci, pdi);
2674
2675         ms_deq(msp);
2676
2677         if (text_output)
2678                 trace_rb_insert_last(pdi, t);
2679         else {
2680                 bit_free(t->bit);
2681                 t_free(t);
2682         }
2683
2684         return 1;
2685 }
2686
2687 /*
2688  * Check if we need to sanitize the name. We allow 'foo', or if foo.blktrace.X
2689  * is given, then strip back down to 'foo' to avoid missing files.
2690  */
2691 static int name_fixup(char *name)
2692 {
2693         char *b;
2694
2695         if (!name)
2696                 return 1;
2697
2698         b = strstr(name, ".blktrace.");
2699         if (b)
2700                 *b = '\0';
2701
2702         return 0;
2703 }
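
/*
 * Example: "sda.blktrace.3" is trimmed to "sda", so setup_file() will
 * pick up every per-cpu file for the device; a plain "sda" is left
 * untouched.
 */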
2704
2705 static int do_file(void)
2706 {
2707         int i, cpu, ret;
2708         struct per_dev_info *pdi;
2709
2710         /*
2711          * first prepare all files for reading
2712          */
2713         for (i = 0; i < ndevices; i++) {
2714                 pdi = &devices[i];
2715                 ret = name_fixup(pdi->name);
2716                 if (ret)
2717                         return ret;
2718
2719                 for (cpu = 0; setup_file(pdi, cpu); cpu++)
2720                         ;
2721
2722                 if (!cpu) {
2723                         fprintf(stderr,"No input files found for %s\n",
2724                                 pdi->name);
2725                         return 1;
2726                 }
2727         }
2728
2729         /*
2730          * Get the initial time stamp
2731          */
2732         if (ms_head)
2733                 genesis_time = ms_peek_time(ms_head);
2734
2735         /*
2736          * Correct abs_start_time if necessary
2737          */
2738         if (start_timestamp
2739          && start_timestamp != genesis_time) {
2740                 correct_abs_start_time();
2741         }
2742
2743         /*
2744          * Keep processing traces while any are left
2745          */
2746         while (!is_done() && ms_head && handle(ms_head))
2747                 ;
2748
2749         return 0;
2750 }
2751
2752 static void do_pipe(int fd)
2753 {
2754         unsigned long long youngest;
2755         int events, fdblock;
2756
2757         last_allowed_time = -1ULL;
2758         fdblock = -1;
2759         while ((events = read_events(fd, 0, &fdblock)) > 0) {
2760                 read_sequence++;
2761
2762 #if 0
2763                 smallest_seq_read = -1U;
2764 #endif
2765
2766                 if (sort_entries(&youngest))
2767                         break;
2768
2769                 if (youngest > stopwatch_end)
2770                         break;
2771
2772                 show_entries_rb(0);
2773         }
2774
2775         if (rb_sort_entries)
2776                 show_entries_rb(1);
2777 }
2778
2779 static int do_fifo(void)
2780 {
2781         int fd;
2782
2783         if (!strcmp(pipename, "-"))
2784                 fd = dup(STDIN_FILENO);
2785         else
2786                 fd = open(pipename, O_RDONLY);
2787
2788         if (fd == -1) {
2789                 perror(!strcmp(pipename, "-") ? "dup stdin" : pipename);
2790                 return -1;
2791         }
2792
2793         do_pipe(fd);
2794         close(fd);
2795         return 0;
2796 }
2797
2798 static void show_stats(void)
2799 {
2800         if (!ofp)
2801                 return;
2802         if (stats_printed)
2803                 return;
2804
2805         stats_printed = 1;
2806
2807         if (per_process_stats)
2808                 show_process_stats();
2809
2810         if (per_device_and_cpu_stats)
2811                 show_device_and_cpu_stats();
2812
2813         fflush(ofp);
2814 }
2815
2816 static void handle_sigint(__attribute__((__unused__)) int sig)
2817 {
2818         done = 1;
2819 }
2820
2821 /*
2822  * Extract start and end times from a string, allowing us to
2823  * specify a time interval of interest within a trace.
2824  * Format: "end" (start is zero) or "start:end", both in seconds.
2825  */
2826 static int find_stopwatch_interval(char *string)
2827 {
2828         double value;
2829         char *sp;
2830
2831         value = strtod(string, &sp);
2832         if (sp == string) {
2833                 fprintf(stderr,"Invalid stopwatch timer: %s\n", string);
2834                 return 1;
2835         }
2836         if (*sp == ':') {
2837                 stopwatch_start = DOUBLE_TO_NANO_ULL(value);
2838                 string = sp + 1;
2839                 value = strtod(string, &sp);
2840                 if (sp == string || *sp != '\0') {
2841                         fprintf(stderr,"Invalid stopwatch end time: %s\n",
2842                                 string);
2843                         return 1;
2844                 }
2845         } else if (*sp != '\0') {
2846                 fprintf(stderr,"Invalid stopwatch start timer: %s\n", string);
2847                 return 1;
2848         }
2849         stopwatch_end = DOUBLE_TO_NANO_ULL(value);
2850         if (stopwatch_end <= stopwatch_start) {
2851                 fprintf(stderr, "Invalid stopwatch interval: %Lu -> %Lu\n",
2852                         stopwatch_start, stopwatch_end);
2853                 return 1;
2854         }
2855
2856         return 0;
2857 }
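
/*
 * Examples: "-w 10" shows traces from 0s to 10s, and "-w 2.5:10" shows
 * traces from 2.5s to 10s - the second value is an absolute end time,
 * not a duration.
 */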
2858
2859 static int is_pipe(const char *str)
2860 {
2861         struct stat st;
2862
2863         if (!strcmp(str, "-"))
2864                 return 1;
2865         if (!stat(str, &st) && S_ISFIFO(st.st_mode))
2866                 return 1;
2867
2868         return 0;
2869 }
2870
2871 static int get_program_sort_event(const char *str)
2872 {
2873         char evt = str[0];
2874
2875         switch (evt) {
2876         case 'N':
2877                 per_process_stats_event = SORT_PROG_EVENT_N;
2878                 break;
2879         case 'Q':
2880                 per_process_stats_event = SORT_PROG_EVENT_QKB;
2881                 break;
2882         case 'q':
2883                 per_process_stats_event = SORT_PROG_EVENT_QIO;
2884                 break;
2885         case 'R':
2886                 per_process_stats_event = SORT_PROG_EVENT_RKB;
2887                 break;
2888         case 'r':
2889                 per_process_stats_event = SORT_PROG_EVENT_RIO;
2890                 break;
2891         case 'W':
2892                 per_process_stats_event = SORT_PROG_EVENT_WKB;
2893                 break;
2894         case 'w':
2895                 per_process_stats_event = SORT_PROG_EVENT_WIO;
2896                 break;
2897         case 'C':
2898                 per_process_stats_event = SORT_PROG_EVENT_CKB;
2899                 break;
2900         case 'c':
2901                 per_process_stats_event = SORT_PROG_EVENT_CIO;
2902                 break;
2903         default:
2904                 return 1;
2905         }
2906
2907         return 0;
2908 }
2909
2910 #define S_OPTS  "a:A:b:D:d:f:F:hi:o:OqsS:tw:vVM"
2911 static char usage_str[] =    "\n\n" \
2912         "-i <file>           | --input=<file>\n" \
2913         "[ -a <action field> | --act-mask=<action field> ]\n" \
2914         "[ -A <action mask>  | --set-mask=<action mask> ]\n" \
2915         "[ -b <traces>       | --batch=<traces> ]\n" \
2916         "[ -d <file>         | --dump-binary=<file> ]\n" \
2917         "[ -D <dir>          | --input-directory=<dir> ]\n" \
2918         "[ -f <format>       | --format=<format> ]\n" \
2919         "[ -F <spec>         | --format-spec=<spec> ]\n" \
2920         "[ -h                | --hash-by-name ]\n" \
2921         "[ -o <file>         | --output=<file> ]\n" \
2922         "[ -O                | --no-text-output ]\n" \
2923         "[ -q                | --quiet ]\n" \
2924         "[ -s                | --per-program-stats ]\n" \
2925         "[ -S <event>        | --sort-program-stats=<event> ]\n" \
2926         "[ -t                | --track-ios ]\n" \
2927         "[ -w <time>         | --stopwatch=<time> ]\n" \
2928         "[ -M                | --no-msgs ]\n" \
2929         "[ -v                | --verbose ]\n" \
2930         "[ -V                | --version ]\n\n" \
2931         "\t-a Only trace specified actions. See documentation\n" \
2932         "\t-A Give trace mask as a single value. See documentation\n" \
2933         "\t-b stdin read batching\n" \
2934         "\t-d Output file. If specified, binary data is written to file\n" \
2935         "\t-D Directory to prepend to input file names\n" \
2936         "\t-f Output format. Customize the output format. The format field\n" \
2937         "\t   identifiers can be found in the documentation\n" \
2938         "\t-F Format specification. Can be found in the documentation\n" \
2939         "\t-h Hash processes by name, not pid\n" \
2940         "\t-i Input file containing trace data, or '-' for stdin\n" \
2941         "\t-o Output file. If not given, output is stdout\n" \
2942         "\t-O Do NOT output text data\n" \
2943         "\t-q Quiet. Don't display any stats at the end of the trace\n" \
2944         "\t-s Show per-program io statistics\n" \
2945         "\t-S Show per-program io statistics sorted by N/Q/q/R/r/W/w/C/c\n" \
2946         "\t   N:Name, Q/q:Queued (read & write), R/r:Queued Read, W/w:Queued Write, C/c:Complete.\n" \
2947         "\t   Sort programs by how much data (KB): Q,R,W,C.\n" \
2948         "\t   Sort programs by how many IO operations: q,r,w,c.\n" \
2949         "\t   If -S is used, the -s parameter is ignored.\n" \
2950         "\t-t Track individual ios. Will tell you the time a request took\n" \
2951         "\t   to get queued, to get dispatched, and to get completed\n" \
2952         "\t-w Only parse data between the given time interval in seconds.\n" \
2953         "\t   If 'start' isn't given, blkparse defaults the start time to 0\n" \
2954         "\t-M Do not output messages to binary file\n" \
2955         "\t-v More verbose for marginal errors\n" \
2956         "\t-V Print program version info\n\n";
2957
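/*
 * Typical invocations (illustrative): "blkparse -i sda" formats the
 * sda.blktrace.* per-cpu files from the current directory, while
 * "blktrace -d /dev/sda -o - | blkparse -i -" parses a live stream
 * on stdin.
 */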
2958 static void usage(char *prog)
2959 {
2960         fprintf(stderr, "Usage: %s %s", prog, usage_str);
2961 }
2962
2963 int main(int argc, char *argv[])
2964 {
2965         int i, c, ret, mode;
2966         int act_mask_tmp = 0;
2967         char *ofp_buffer = NULL;
2968         char *bin_ofp_buffer = NULL;
2969
2970         while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) {
2971                 switch (c) {
2972                 case 'a':
2973                         i = find_mask_map(optarg);
2974                         if (i < 0) {
2975                                 fprintf(stderr,"Invalid action mask %s\n",
2976                                         optarg);
2977                                 return 1;
2978                         }
2979                         act_mask_tmp |= i;
2980                         break;
2981
2982                 case 'A':
2983                         if ((sscanf(optarg, "%x", &i) != 1) || 
2984                                                         !valid_act_opt(i)) {
2985                                 fprintf(stderr,
2986                                         "Invalid set action mask %s/0x%x\n",
2987                                         optarg, i);
2988                                 return 1;
2989                         }
2990                         act_mask_tmp = i;
2991                         break;
2992                 case 'i':
2993                         if (is_pipe(optarg) && !pipeline) {
2994                                 pipeline = 1;
2995                                 pipename = strdup(optarg);
2996                         } else if (resize_devices(optarg) != 0)
2997                                 return 1;
2998                         break;
2999                 case 'D':
3000                         input_dir = optarg;
3001                         break;
3002                 case 'o':
3003                         output_name = optarg;
3004                         break;
3005                 case 'O':
3006                         text_output = 0;
3007                         break;
3008                 case 'b':
3009                         rb_batch = atoi(optarg);
3010                         if (rb_batch <= 0)
3011                                 rb_batch = RB_BATCH_DEFAULT;
3012                         break;
3013                 case 's':
3014                         per_process_stats = 1;
3015                         break;
3016                 case 'S':
3017                         per_process_stats = 1;
3018                         if (get_program_sort_event(optarg))
3019                                 return 1;
3020                         break;
3021                 case 't':
3022                         track_ios = 1;
3023                         break;
3024                 case 'q':
3025                         per_device_and_cpu_stats = 0;
3026                         break;
3027                 case 'w':
3028                         if (find_stopwatch_interval(optarg) != 0)
3029                                 return 1;
3030                         break;
3031                 case 'f':
3032                         set_all_format_specs(optarg);
3033                         break;
3034                 case 'F':
3035                         if (add_format_spec(optarg) != 0)
3036                                 return 1;
3037                         break;
3038                 case 'h':
3039                         ppi_hash_by_pid = 0;
3040                         break;
3041                 case 'v':
3042                         verbose++;
3043                         break;
3044                 case 'V':
3045                         printf("%s version %s\n", argv[0], blkparse_version);
3046                         return 0;
3047                 case 'd':
3048                         dump_binary = optarg;
3049                         break;
3050                 case 'M':
3051                         bin_output_msgs = 0;
3052                         break;
3053                 default:
3054                         usage(argv[0]);
3055                         return 1;
3056                 }
3057         }
3058
3059         while (optind < argc) {
3060                 if (is_pipe(argv[optind]) && !pipeline) {
3061                         pipeline = 1;
3062                         pipename = strdup(argv[optind]);
3063                 } else if (resize_devices(argv[optind]) != 0)
3064                         return 1;
3065                 optind++;
3066         }
3067
3068         if (!pipeline && !ndevices) {
3069                 usage(argv[0]);
3070                 return 1;
3071         }
3072
3073         if (act_mask_tmp != 0)
3074                 act_mask = act_mask_tmp;
3075
3076         memset(&rb_sort_root, 0, sizeof(rb_sort_root));
3077
3078         signal(SIGINT, handle_sigint);
3079         signal(SIGHUP, handle_sigint);
3080         signal(SIGTERM, handle_sigint);
3081
3082         setlocale(LC_NUMERIC, "en_US");
3083
3084         if (text_output) {
3085                 if (!output_name) {
3086                         ofp = fdopen(STDOUT_FILENO, "w");
3087                         mode = _IOLBF;
3088                 } else {
3089                         char ofname[PATH_MAX];
3090
3091                         snprintf(ofname, sizeof(ofname) - 1, "%s", output_name);
3092                         ofp = fopen(ofname, "w");
3093                         mode = _IOFBF;
3094                 }
3095
3096                 if (!ofp) {
3097                         perror("fopen");
3098                         return 1;
3099                 }
3100
3101                 ofp_buffer = malloc(4096);
3102                 if (setvbuf(ofp, ofp_buffer, mode, 4096)) {
3103                         perror("setvbuf");
3104                         return 1;
3105                 }
3106         }
3107
3108         if (dump_binary) {
3109                 if (!strcmp(dump_binary, "-"))
3110                         dump_fp = stdout;
3111                 else {
3112                         dump_fp = fopen(dump_binary, "w");
3113                         if (!dump_fp) {
3114                                 perror(dump_binary);
3115                                 dump_binary = NULL;
3116                                 return 1;
3117                         }
3118                 }
3119                 bin_ofp_buffer = malloc(128 * 1024);
3120                 if (setvbuf(dump_fp, bin_ofp_buffer, _IOFBF, 128 * 1024)) {
3121                         perror("setvbuf binary");
3122                         return 1;
3123                 }
3124         }
3125
3126         if (pipeline)
3127                 ret = do_fifo();
3128         else
3129                 ret = do_file();
3130
3131         if (!ret)
3132                 show_stats();
3133
3134         if (have_drv_data && !dump_binary)
3135                 printf("\ndiscarded traces containing low-level device driver "
3136                        "specific data (only available in binary output)\n");
3137
3138         if (ofp_buffer) {
3139                 fflush(ofp);
3140                 free(ofp_buffer);
3141         }
3142         if (bin_ofp_buffer) {
3143                 fflush(dump_fp);
3144                 free(bin_ofp_buffer);
3145         }
3146         return ret;
3147 }