[PATCH] blkparse: allow a few more skips
[blktrace.git] / blkparse.c
1 /*
2  * block queue tracing parse application
3  *
4  * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
5  *
6  *  This program is free software; you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License as published by
8  *  the Free Software Foundation; either version 2 of the License, or
9  *  (at your option) any later version.
10  *
11  *  This program is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *  GNU General Public License for more details.
15  *
16  *  You should have received a copy of the GNU General Public License
17  *  along with this program; if not, write to the Free Software
18  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  *
20  */
21 #include <sys/types.h>
22 #include <sys/stat.h>
23 #include <unistd.h>
24 #include <stdio.h>
25 #include <fcntl.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <getopt.h>
29 #include <errno.h>
30 #include <signal.h>
31 #include <locale.h>
32 #include <limits.h>
33
34 #include "blktrace.h"
35 #include "rbtree.h"
36
37 #define SECONDS(x)              ((unsigned long long)(x) / 1000000000)
38 #define NANO_SECONDS(x)         ((unsigned long long)(x) % 1000000000)
39 #define DOUBLE_TO_NANO_ULL(d)   ((unsigned long long)((d) * 1000000000))
40
41 #define MINORBITS       20
42 #define MINORMASK       ((1U << MINORBITS) - 1)
43 #define MAJOR(dev)      ((unsigned int) ((dev) >> MINORBITS))
44 #define MINOR(dev)      ((unsigned int) ((dev) & MINORMASK))
45
46 #define min(a, b)       ((a) < (b) ? (a) : (b))
47
48 struct io_stats {
49         unsigned long qreads, qwrites, creads, cwrites, mreads, mwrites;
50         unsigned long ireads, iwrites;
51         unsigned long long qread_kb, qwrite_kb, cread_kb, cwrite_kb;
52         unsigned long long iread_kb, iwrite_kb;
53         unsigned long io_unplugs, timer_unplugs;
54 };
55
56 struct per_cpu_info {
57         int cpu;
58         int nelems;
59
60         int fd;
61         char fname[128];
62
63         struct io_stats io_stats;
64 };
65
66 struct per_dev_info {
67         dev_t id;
68         char *name;
69
70         int backwards;
71         unsigned long long events;
72         unsigned long long last_reported_time;
73         struct io_stats io_stats;
74         unsigned long last_sequence;
75         unsigned long skips;
76
77         int ncpus;
78         struct per_cpu_info *cpus;
79 };
80
81 struct per_process_info {
82         char name[16];
83         __u32 pid;
84         struct io_stats io_stats;
85         struct per_process_info *hash_next, *list_next;
86
87         /*
88          * individual io stats
89          */
90         unsigned long long longest_allocation_wait[2];
91         unsigned long long longest_dispatch_wait[2];
92         unsigned long long longest_completion_wait[2];
93 };
94
95 #define PPI_HASH_SHIFT  (8)
96 static struct per_process_info *ppi_hash[1 << PPI_HASH_SHIFT];
97 static struct per_process_info *ppi_list;
98
99 #define S_OPTS  "i:o:b:stqw:"
100 static struct option l_opts[] = {
101         {
102                 .name = "input",
103                 .has_arg = 1,
104                 .flag = NULL,
105                 .val = 'i'
106         },
107         {
108                 .name = "output",
109                 .has_arg = 1,
110                 .flag = NULL,
111                 .val = 'o'
112         },
113         {
114                 .name = "batch",
115                 .has_arg = 1,
116                 .flag = NULL,
117                 .val = 'b'
118         },
119         {
120                 .name = "per program stats",
121                 .has_arg = 0,
122                 .flag = NULL,
123                 .val = 's'
124         },
125         {
126                 .name = "track ios",
127                 .has_arg = 0,
128                 .flag = NULL,
129                 .val = 't'
130         },
131         {
132                 .name = "quiet",
133                 .has_arg = 0,
134                 .flag = NULL,
135                 .val = 'q'
136         },
137         {
138                 .name = "stopwatch",
139                 .has_arg = 1,
140                 .flag = NULL,
141                 .val = 'w'
142         },
143         {
144                 .name = NULL,
145                 .has_arg = 0,
146                 .flag = NULL,
147                 .val = 0
148         }
149 };
150
151 /*
152  * for sorting the displayed output
153  */
154 struct trace {
155         struct blk_io_trace *bit;
156         struct rb_node rb_node;
157         struct trace *next;
158         int skipped;
159 };
160
161 static struct rb_root rb_sort_root;
162 static struct rb_root rb_track_root;
163
164 static struct trace *trace_list;
165
166 /*
167  * for tracking individual ios
168  */
169 struct io_track {
170         struct rb_node rb_node;
171
172         dev_t device;
173         __u64 sector;
174         __u32 pid;
175         unsigned long long allocation_time;
176         unsigned long long queue_time;
177         unsigned long long dispatch_time;
178         unsigned long long completion_time;
179 };
180
181 static int ndevices;
182 static struct per_dev_info *devices;
183 static char *get_dev_name(struct per_dev_info *, char *, int);
184
185 static FILE *ofp;
186 static char *output_name;
187
188 static unsigned long long genesis_time;
189 static unsigned long long stopwatch_start;      /* start from zero by default */
190 static unsigned long long stopwatch_end = ULONG_LONG_MAX;       /* "infinity" */
191
192 static int per_process_stats;
193 static int track_ios;
194
195 #define RB_BATCH_DEFAULT        (1024)
196 static int rb_batch = RB_BATCH_DEFAULT;
197
198 static int pipeline;
199
200 #define is_done()       (*(volatile int *)(&done))
201 static volatile int done;
202
203 static inline unsigned long hash_long(unsigned long val)
204 {
205 #if __WORDSIZE == 32
206         val *= 0x9e370001UL;
207 #elif __WORDSIZE == 64
208         val *= 0x9e37fffffffc0001UL;
209 #else
210 #error unknown word size
211 #endif
212
213         return val >> (__WORDSIZE - PPI_HASH_SHIFT);
214 }
215
216 static inline void add_process_to_hash(struct per_process_info *ppi)
217 {
218         const int hash_idx = hash_long(ppi->pid);
219
220         ppi->hash_next = ppi_hash[hash_idx];
221         ppi_hash[hash_idx] = ppi;
222 }
223
224 static inline void add_process_to_list(struct per_process_info *ppi)
225 {
226         ppi->list_next = ppi_list;
227         ppi_list = ppi;
228 }
229
230 static struct per_process_info *find_process_by_pid(__u32 pid)
231 {
232         const int hash_idx = hash_long(pid);
233         struct per_process_info *ppi;
234
235         ppi = ppi_hash[hash_idx];
236         while (ppi) {
237                 if (ppi->pid == pid)
238                         return ppi;
239
240                 ppi = ppi->hash_next;
241         }
242
243         return NULL;
244 }
245
246 static inline int trace_rb_insert(struct trace *t)
247 {
248         struct rb_node **p = &rb_sort_root.rb_node;
249         struct rb_node *parent = NULL;
250         struct trace *__t;
251
252         if (genesis_time == 0 || t->bit->time < genesis_time)
253                 genesis_time = t->bit->time;
254
255         while (*p) {
256                 parent = *p;
257                 __t = rb_entry(parent, struct trace, rb_node);
258
259                 if (t->bit->time < __t->bit->time)
260                         p = &(*p)->rb_left;
261                 else if (t->bit->time > __t->bit->time)
262                         p = &(*p)->rb_right;
263                 else if (t->bit->device < __t->bit->device)
264                         p = &(*p)->rb_left;
265                 else if (t->bit->device > __t->bit->device)
266                         p = &(*p)->rb_right;
267                 else if (t->bit->sequence < __t->bit->sequence)
268                         p = &(*p)->rb_left;
269                 else if (t->bit->sequence > __t->bit->sequence)
270                         p = &(*p)->rb_right;
271                 else if (t->bit->device == __t->bit->device) {
272                         fprintf(stderr,
273                                 "sequence alias (%d) on device %d,%d!\n",
274                                 t->bit->sequence,
275                                 MAJOR(t->bit->device), MINOR(t->bit->device));
276                         return 1;
277                 }
278         }
279
280         rb_link_node(&t->rb_node, parent, p);
281         rb_insert_color(&t->rb_node, &rb_sort_root);
282         return 0;
283 }
284
285 static inline int track_rb_insert(struct io_track *iot)
286 {
287         struct rb_node **p = &rb_track_root.rb_node;
288         struct rb_node *parent = NULL;
289         struct io_track *__iot;
290
291         while (*p) {
292                 parent = *p;
293
294                 __iot = rb_entry(parent, struct io_track, rb_node);
295
296                 if (iot->device < __iot->device)
297                         p = &(*p)->rb_left;
298                 else if (iot->device > __iot->device)
299                         p = &(*p)->rb_right;
300                 else if (iot->sector < __iot->sector)
301                         p = &(*p)->rb_left;
302                 else if (iot->sector > __iot->sector)
303                         p = &(*p)->rb_right;
304                 else {
305                         fprintf(stderr,
306                                 "sector alias (%llu) on device %d,%d!\n",
307                                 iot->sector,
308                                 MAJOR(iot->device), MINOR(iot->device));
309                         return 1;
310                 }
311         }
312
313         rb_link_node(&iot->rb_node, parent, p);
314         rb_insert_color(&iot->rb_node, &rb_track_root);
315         return 0;
316 }
317
318 static struct io_track *__find_track(dev_t device, __u64 sector)
319 {
320         struct rb_node **p = &rb_track_root.rb_node;
321         struct rb_node *parent = NULL;
322         struct io_track *__iot;
323
324         while (*p) {
325                 parent = *p;
326                 
327                 __iot = rb_entry(parent, struct io_track, rb_node);
328
329                 if (device < __iot->device)
330                         p = &(*p)->rb_left;
331                 else if (device > __iot->device)
332                         p = &(*p)->rb_right;
333                 else if (sector < __iot->sector)
334                         p = &(*p)->rb_left;
335                 else if (sector > __iot->sector)
336                         p = &(*p)->rb_right;
337                 else
338                         return __iot;
339         }
340
341         return NULL;
342 }
343
344 static struct io_track *find_track(__u32 pid, dev_t device, __u64 sector)
345 {
346         struct io_track *iot;
347
348         iot = __find_track(device, sector);
349         if (!iot) {
350                 iot = malloc(sizeof(*iot));
351                 iot->pid = pid;
352                 iot->device = device;
353                 iot->sector = sector;
354                 track_rb_insert(iot);
355         }
356
357         return iot;
358 }
359
360 static void log_track_frontmerge(struct blk_io_trace *t)
361 {
362         struct io_track *iot;
363
364         if (!track_ios)
365                 return;
366
367         iot = __find_track(t->device, t->sector + (t->bytes >> 9));
368         if (!iot) {
369                 fprintf(stderr, "failed to find mergeable event\n");
370                 return;
371         }
372
373         rb_erase(&iot->rb_node, &rb_track_root);
374         iot->sector -= t->bytes >> 9;
375         track_rb_insert(iot);
376 }
377
378 static void log_track_getrq(struct blk_io_trace *t)
379 {
380         struct io_track *iot;
381
382         if (!track_ios)
383                 return;
384
385         iot = find_track(t->pid, t->device, t->sector);
386         iot->allocation_time = t->time;
387 }
388
389
390 /*
391  * return time between rq allocation and queue
392  */
393 static unsigned long long log_track_queue(struct blk_io_trace *t)
394 {
395         unsigned long long elapsed;
396         struct io_track *iot;
397
398         if (!track_ios)
399                 return -1;
400
401         iot = find_track(t->pid, t->device, t->sector);
402         iot->queue_time = t->time;
403         elapsed = iot->queue_time - iot->allocation_time;
404
405         if (per_process_stats) {
406                 struct per_process_info *ppi = find_process_by_pid(iot->pid);
407                 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
408
409                 if (ppi && elapsed > ppi->longest_allocation_wait[w])
410                         ppi->longest_allocation_wait[w] = elapsed;
411         }
412
413         return elapsed;
414 }
415
416 /*
417  * return time between queue and issue
418  */
419 static unsigned long long log_track_issue(struct blk_io_trace *t)
420 {
421         unsigned long long elapsed;
422         struct io_track *iot;
423
424         if (!track_ios)
425                 return -1;
426         if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
427                 return -1;
428
429         iot = __find_track(t->device, t->sector);
430         if (!iot) {
431                 fprintf(stderr, "failed to find issue event\n");
432                 return -1;
433         }
434
435         iot->dispatch_time = t->time;
436         elapsed = iot->dispatch_time - iot->queue_time;
437
438         if (per_process_stats) {
439                 struct per_process_info *ppi = find_process_by_pid(iot->pid);
440                 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
441
442                 if (ppi && elapsed > ppi->longest_dispatch_wait[w])
443                         ppi->longest_dispatch_wait[w] = elapsed;
444         }
445
446         return elapsed;
447 }
448
449 /*
450  * return time between dispatch and complete
451  */
452 static unsigned long long log_track_complete(struct blk_io_trace *t)
453 {
454         unsigned long long elapsed;
455         struct io_track *iot;
456
457         if (!track_ios)
458                 return -1;
459         if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
460                 return -1;
461
462         iot = __find_track(t->device, t->sector);
463         if (!iot) {
464                 fprintf(stderr, "failed to find complete event\n");
465                 return -1;
466         }
467
468         iot->completion_time = t->time;
469         elapsed = iot->completion_time - iot->dispatch_time;
470
471         if (per_process_stats) {
472                 struct per_process_info *ppi = find_process_by_pid(iot->pid);
473                 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
474
475                 if (ppi && elapsed > ppi->longest_completion_wait[w])
476                         ppi->longest_completion_wait[w] = elapsed;
477         }
478
479         /*
480          * kill the trace, we don't need it after completion
481          */
482         rb_erase(&iot->rb_node, &rb_track_root);
483         free(iot);
484
485         return elapsed;
486 }
487
488
489 static struct io_stats *find_process_io_stats(__u32 pid, char *name)
490 {
491         struct per_process_info *ppi = find_process_by_pid(pid);
492
493         if (!ppi) {
494                 ppi = malloc(sizeof(*ppi));
495                 memset(ppi, 0, sizeof(*ppi));
496                 strncpy(ppi->name, name, sizeof(ppi->name));
497                 ppi->pid = pid;
498                 add_process_to_hash(ppi);
499                 add_process_to_list(ppi);
500         }
501
502         return &ppi->io_stats;
503 }
504
505
506 static void resize_cpu_info(struct per_dev_info *pdi, int cpu)
507 {
508         struct per_cpu_info *cpus = pdi->cpus;
509         int ncpus = pdi->ncpus;
510         int new_count = cpu + 1;
511         int new_space, size;
512         char *new_start;
513
514         size = new_count * sizeof(struct per_cpu_info);
515         cpus = realloc(cpus, size);
516         if (!cpus) {
517                 char name[20];
518                 fprintf(stderr, "Out of memory, CPU info for device %s (%d)\n",
519                         get_dev_name(pdi, name, sizeof(name)), size);
520                 exit(1);
521         }
522
523         new_start = (char *)cpus + (ncpus * sizeof(struct per_cpu_info));
524         new_space = (new_count - ncpus) * sizeof(struct per_cpu_info);
525         memset(new_start, 0, new_space);
526
527         pdi->ncpus = new_count;
528         pdi->cpus = cpus;
529 }
530
531 static struct per_cpu_info *get_cpu_info(struct per_dev_info *pdi, int cpu)
532 {
533         struct per_cpu_info *pci;
534
535         if (cpu >= pdi->ncpus)
536                 resize_cpu_info(pdi, cpu);
537
538         pci = &pdi->cpus[cpu];
539         pci->cpu = cpu;
540         return pci;
541 }
542
543
544 static int resize_devices(char *name)
545 {
546         int size = (ndevices + 1) * sizeof(struct per_dev_info);
547
548         devices = realloc(devices, size);
549         if (!devices) {
550                 fprintf(stderr, "Out of memory, device %s (%d)\n", name, size);
551                 return 1;
552         }
553         memset(&devices[ndevices], 0, sizeof(struct per_dev_info));
554         devices[ndevices].name = name;
555         ndevices++;
556         return 0;
557 }
558
559 static struct per_dev_info *get_dev_info(dev_t id)
560 {
561         struct per_dev_info *pdi;
562         int i;
563
564         for (i = 0; i < ndevices; i++)
565                 if (devices[i].id == id)
566                         return &devices[i];
567
568         if (resize_devices(NULL) != 0)
569                 return NULL;
570
571         pdi = &devices[ndevices - 1];
572         pdi->id = id;
573         return pdi;
574 }
575
576 static char *get_dev_name(struct per_dev_info *pdi, char *buffer, int size)
577 {
578         if (pdi->name)
579                 snprintf(buffer, size, "%s", pdi->name);
580         else
581                 snprintf(buffer, size, "%d,%d", MAJOR(pdi->id), MINOR(pdi->id));
582         return buffer;
583 }
584
585 static void check_time(struct per_dev_info *pdi, struct blk_io_trace *bit)
586 {
587         unsigned long long this = bit->time;
588         unsigned long long last = pdi->last_reported_time;
589
590         pdi->backwards = (this < last) ? 'B' : ' ';
591         pdi->last_reported_time = this;
592 }
593
594 static inline void __account_m(struct io_stats *ios, struct blk_io_trace *t,
595                                int rw)
596 {
597         if (rw) {
598                 ios->mwrites++;
599                 ios->qwrite_kb += t->bytes >> 10;
600         } else {
601                 ios->mreads++;
602                 ios->qread_kb += t->bytes >> 10;
603         }
604 }
605
606 static inline void account_m(struct blk_io_trace *t, struct per_cpu_info *pci,
607                              int rw)
608 {
609         __account_m(&pci->io_stats, t, rw);
610
611         if (per_process_stats) {
612                 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
613
614                 __account_m(ios, t, rw);
615         }
616 }
617
618 static inline void __account_q(struct io_stats *ios, struct blk_io_trace *t,
619                                int rw)
620 {
621         if (rw) {
622                 ios->qwrites++;
623                 ios->qwrite_kb += t->bytes >> 10;
624         } else {
625                 ios->qreads++;
626                 ios->qread_kb += t->bytes >> 10;
627         }
628 }
629
630 static inline void account_q(struct blk_io_trace *t, struct per_cpu_info *pci,
631                              int rw)
632 {
633         __account_q(&pci->io_stats, t, rw);
634
635         if (per_process_stats) {
636                 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
637
638                 __account_q(ios, t, rw);
639         }
640 }
641
642 static inline void __account_c(struct io_stats *ios, int rw, unsigned int bytes)
643 {
644         if (rw) {
645                 ios->cwrites++;
646                 ios->cwrite_kb += bytes >> 10;
647         } else {
648                 ios->creads++;
649                 ios->cread_kb += bytes >> 10;
650         }
651 }
652
653 static inline void account_c(struct blk_io_trace *t, struct per_cpu_info *pci,
654                              int rw, int bytes)
655 {
656         __account_c(&pci->io_stats, rw, bytes);
657
658         if (per_process_stats) {
659                 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
660
661                 __account_c(ios, rw, bytes);
662         }
663 }
664
665 static inline void __account_i(struct io_stats *ios, int rw, unsigned int bytes)
666 {
667         if (rw) {
668                 ios->iwrites++;
669                 ios->iwrite_kb += bytes >> 10;
670         } else {
671                 ios->ireads++;
672                 ios->iread_kb += bytes >> 10;
673         }
674 }
675
676 static inline void account_i(struct blk_io_trace *t, struct per_cpu_info *pci,
677                              int rw)
678 {
679         __account_i(&pci->io_stats, rw, t->bytes);
680
681         if (per_process_stats) {
682                 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
683
684                 __account_i(ios, rw, t->bytes);
685         }
686 }
687
688 static inline void __account_unplug(struct io_stats *ios, int timer)
689 {
690         if (timer)
691                 ios->timer_unplugs++;
692         else
693                 ios->io_unplugs++;
694 }
695
696 static inline void account_unplug(struct blk_io_trace *t,
697                                   struct per_cpu_info *pci, int timer)
698 {
699         __account_unplug(&pci->io_stats, timer);
700
701         if (per_process_stats) {
702                 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
703
704                 __account_unplug(ios, timer);
705         }
706 }
707
708 static void output(struct per_cpu_info *pci, char *s)
709 {
710         fprintf(ofp, "%s", s);
711 }
712
713 static char hstring[256];
714 static char tstring[256];
715
716 static inline char *setup_header(struct per_cpu_info *pci,
717                                  struct blk_io_trace *t, char *act)
718 {
719         int w = t->action & BLK_TC_ACT(BLK_TC_WRITE);
720         int b = t->action & BLK_TC_ACT(BLK_TC_BARRIER);
721         int s = t->action & BLK_TC_ACT(BLK_TC_SYNC);
722         char rwbs[4];
723         int i = 0;
724
725         if (w)
726                 rwbs[i++] = 'W';
727         else
728                 rwbs[i++] = 'R';
729         if (b)
730                 rwbs[i++] = 'B';
731         if (s)
732                 rwbs[i++] = 'S';
733
734         rwbs[i] = '\0';
735
736         sprintf(hstring, "%3d,%-3d %2d %8ld %5Lu.%09Lu %5u %2s %3s",
737                 MAJOR(t->device), MINOR(t->device), pci->cpu,
738                 (unsigned long)t->sequence, SECONDS(t->time),
739                 NANO_SECONDS(t->time), t->pid, act, rwbs);
740
741         return hstring;
742 }
743
744 static void log_complete(struct per_cpu_info *pci, struct blk_io_trace *t,
745                          char *act)
746 {
747         unsigned long long elapsed = log_track_complete(t);
748
749         if (elapsed != -1ULL) {
750                 unsigned long usec = elapsed / 1000;
751
752                 sprintf(tstring,"%s %Lu + %u (%8lu) [%d]\n",
753                         setup_header(pci, t, act),
754                         (unsigned long long)t->sector, t->bytes >> 9,
755                         usec, t->error);
756         } else {
757                 sprintf(tstring,"%s %Lu + %u [%d]\n", setup_header(pci, t, act),
758                         (unsigned long long)t->sector, t->bytes >> 9, t->error);
759         }
760         
761         output(pci, tstring);
762 }
763
764 static void log_queue(struct per_cpu_info *pci, struct blk_io_trace *t,
765                       char *act)
766 {
767         unsigned long long elapsed = log_track_queue(t);
768
769         if (elapsed != -1ULL) {
770                 unsigned long usec = elapsed / 1000;
771
772                 sprintf(tstring,"%s %Lu + %u (%8lu) [%s]\n",
773                         setup_header(pci, t, act),
774                         (unsigned long long)t->sector, t->bytes >> 9,
775                         usec, t->comm);
776         } else {
777                 sprintf(tstring,"%s %Lu + %u [%s]\n", setup_header(pci, t, act),
778                         (unsigned long long)t->sector, t->bytes >> 9, t->comm);
779         }
780         output(pci, tstring);
781 }
782
783 static void log_issue(struct per_cpu_info *pci, struct blk_io_trace *t,
784                       char *act)
785 {
786         unsigned long long elapsed = log_track_issue(t);
787
788         if (elapsed != -1ULL) {
789                 double usec = (double) elapsed / 1000;
790
791                 sprintf(tstring,"%s %Lu + %u (%8.2f) [%s]\n",
792                         setup_header(pci, t, act),
793                         (unsigned long long)t->sector, t->bytes >> 9,
794                         usec, t->comm);
795         } else {
796                 sprintf(tstring,"%s %Lu + %u [%s]\n", setup_header(pci, t, act),
797                         (unsigned long long)t->sector, t->bytes >> 9, t->comm);
798         }
799
800         output(pci, tstring);
801 }
802
803 static void log_merge(struct per_cpu_info *pci, struct blk_io_trace *t,
804                       char *act)
805 {
806         if (act[0] == 'F')
807                 log_track_frontmerge(t);
808
809         sprintf(tstring,"%s %Lu + %u [%s]\n", setup_header(pci, t, act),
810                 (unsigned long long)t->sector, t->bytes >> 9, t->comm);
811         output(pci, tstring);
812 }
813
814 static void log_action(struct per_cpu_info *pci, struct blk_io_trace *t,
815                         char *act)
816 {
817         sprintf(tstring,"%s [%s]\n", setup_header(pci, t, act), t->comm);
818         output(pci, tstring);
819 }
820
821 static void log_generic(struct per_cpu_info *pci, struct blk_io_trace *t,
822                         char *act)
823 {
824         sprintf(tstring,"%s %Lu + %u [%s]\n", setup_header(pci, t, act),
825                 (unsigned long long)t->sector, t->bytes >> 9, t->comm);
826         output(pci, tstring);
827 }
828
829 static int log_unplug(struct per_cpu_info *pci, struct blk_io_trace *t,
830                       char *act)
831 {
832         __u64 *depth;
833         int len;
834
835         len = sprintf(tstring,"%s [%s] ", setup_header(pci, t, act), t->comm);
836         depth = (__u64 *) ((char *) t + sizeof(*t));
837         sprintf(tstring + len, "%u\n", (unsigned int) be64_to_cpu(*depth));
838         output(pci, tstring);
839
840         return 0;
841 }
842
843 static int log_pc(struct per_cpu_info *pci, struct blk_io_trace *t, char *act)
844 {
845         unsigned char *buf;
846         int i;
847
848         sprintf(tstring,"%s ", setup_header(pci, t, act));
849         output(pci, tstring);
850
851         buf = (unsigned char *) t + sizeof(*t);
852         for (i = 0; i < t->pdu_len; i++) {
853                 sprintf(tstring,"%02x ", buf[i]);
854                 output(pci, tstring);
855         }
856
857         if (act[0] == 'C') {
858                 sprintf(tstring,"[%d]\n", t->error);
859                 output(pci, tstring);
860         } else {
861                 sprintf(tstring,"[%s]\n", t->comm);
862                 output(pci, tstring);
863         }
864         return 0;
865 }
866
867 static int dump_trace_pc(struct blk_io_trace *t, struct per_cpu_info *pci)
868 {
869         int ret = 0;
870
871         switch (t->action & 0xffff) {
872                 case __BLK_TA_QUEUE:
873                         log_generic(pci, t, "Q");
874                         break;
875                 case __BLK_TA_GETRQ:
876                         log_generic(pci, t, "G");
877                         break;
878                 case __BLK_TA_SLEEPRQ:
879                         log_generic(pci, t, "S");
880                         break;
881                 case __BLK_TA_REQUEUE:
882                         log_generic(pci, t, "R");
883                         break;
884                 case __BLK_TA_ISSUE:
885                         ret = log_pc(pci, t, "D");
886                         break;
887                 case __BLK_TA_COMPLETE:
888                         log_pc(pci, t, "C");
889                         break;
890                 default:
891                         fprintf(stderr, "Bad pc action %x\n", t->action);
892                         ret = 1;
893                         break;
894         }
895         
896         return ret;
897 }
898
899 static void dump_trace_fs(struct blk_io_trace *t, struct per_cpu_info *pci)
900 {
901         int w = t->action & BLK_TC_ACT(BLK_TC_WRITE);
902         int act = t->action & 0xffff;
903
904         switch (act) {
905                 case __BLK_TA_QUEUE:
906                         account_q(t, pci, w);
907                         log_queue(pci, t, "Q");
908                         break;
909                 case __BLK_TA_BACKMERGE:
910                         account_m(t, pci, w);
911                         log_merge(pci, t, "M");
912                         break;
913                 case __BLK_TA_FRONTMERGE:
914                         account_m(t, pci, w);
915                         log_merge(pci, t, "F");
916                         break;
917                 case __BLK_TA_GETRQ:
918                         log_track_getrq(t);
919                         log_generic(pci, t, "G");
920                         break;
921                 case __BLK_TA_SLEEPRQ:
922                         log_generic(pci, t, "S");
923                         break;
924                 case __BLK_TA_REQUEUE:
925                         account_c(t, pci, w, -t->bytes);
926                         log_queue(pci, t, "R");
927                         break;
928                 case __BLK_TA_ISSUE:
929                         account_i(t, pci, w);
930                         log_issue(pci, t, "D");
931                         break;
932                 case __BLK_TA_COMPLETE:
933                         account_c(t, pci, w, t->bytes);
934                         log_complete(pci, t, "C");
935                         break;
936                 case __BLK_TA_PLUG:
937                         log_action(pci, t, "P");
938                         break;
939                 case __BLK_TA_UNPLUG_IO:
940                         account_unplug(t, pci, 0);
941                         log_unplug(pci, t, "U");
942                         break;
943                 case __BLK_TA_UNPLUG_TIMER:
944                         account_unplug(t, pci, 1);
945                         log_unplug(pci, t, "UT");
946                         break;
947                 default:
948                         fprintf(stderr, "Bad fs action %x\n", t->action);
949                         break;
950         }
951 }
952
953 static int dump_trace(struct blk_io_trace *t, struct per_cpu_info *pci,
954                         struct per_dev_info *pdi)
955 {
956         int ret = 0;
957
958         if (t->action & BLK_TC_ACT(BLK_TC_PC))
959                 ret = dump_trace_pc(t, pci);
960         else
961                 dump_trace_fs(t, pci);
962
963         pdi->events++;
964         return ret;
965 }
966
967 static void dump_io_stats(struct io_stats *ios, char *msg)
968 {
969         fprintf(ofp, "%s\n", msg);
970
971         fprintf(ofp, " Reads Queued:    %'8lu, %'8LuKiB\t", ios->qreads, ios->qread_kb);
972         fprintf(ofp, " Writes Queued:    %'8lu, %'8LuKiB\n", ios->qwrites,ios->qwrite_kb);
973
974         fprintf(ofp, " Read Dispatches: %'8lu, %'8LuKiB\t", ios->ireads, ios->iread_kb);
975         fprintf(ofp, " Write Dispatches: %'8lu, %'8LuKiB\n", ios->iwrites,ios->iwrite_kb);
976         fprintf(ofp, " Reads Completed: %'8lu, %'8LuKiB\t", ios->creads, ios->cread_kb);
977         fprintf(ofp, " Writes Completed: %'8lu, %'8LuKiB\n", ios->cwrites,ios->cwrite_kb);
978         fprintf(ofp, " Read Merges:     %'8lu%8c\t", ios->mreads, ' ');
979         fprintf(ofp, " Write Merges:     %'8lu\n", ios->mwrites);
980         fprintf(ofp, " IO unplugs:      %'8lu%8c\t", ios->io_unplugs, ' ');
981         fprintf(ofp, " Timer unplugs:    %'8lu\n", ios->timer_unplugs);
982 }
983
984 static void dump_wait_stats(struct per_process_info *ppi)
985 {
986         unsigned long rawait = ppi->longest_allocation_wait[0] / 1000;
987         unsigned long rdwait = ppi->longest_dispatch_wait[0] / 1000;
988         unsigned long rcwait = ppi->longest_completion_wait[0] / 1000;
989         unsigned long wawait = ppi->longest_allocation_wait[1] / 1000;
990         unsigned long wdwait = ppi->longest_dispatch_wait[1] / 1000;
991         unsigned long wcwait = ppi->longest_completion_wait[1] / 1000;
992
993         fprintf(ofp, " Allocation wait: %'8lu%8c\t", rawait, ' ');
994         fprintf(ofp, " Allocation wait:  %'8lu\n", wawait);
995         fprintf(ofp, " Dispatch wait:   %'8lu%8c\t", rdwait, ' ');
996         fprintf(ofp, " Dispatch wait:    %'8lu\n", wdwait);
997         fprintf(ofp, " Completion wait: %'8lu%8c\t", rcwait, ' ');
998         fprintf(ofp, " Completion wait:  %'8lu\n", wcwait);
999 }
1000
1001 static void show_process_stats(void)
1002 {
1003         struct per_process_info *ppi;
1004
1005         ppi = ppi_list;
1006         while (ppi) {
1007                 dump_io_stats(&ppi->io_stats, ppi->name);
1008                 dump_wait_stats(ppi);
1009                 ppi = ppi->list_next;
1010         }
1011
1012         fprintf(ofp, "\n");
1013 }
1014
1015 static void show_device_and_cpu_stats(void)
1016 {
1017         struct per_dev_info *pdi;
1018         struct per_cpu_info *pci;
1019         struct io_stats total, *ios;
1020         int i, j, pci_events;
1021         char line[3 + 8/*cpu*/ + 2 + 32/*dev*/ + 3];
1022         char name[32];
1023
1024         for (pdi = devices, i = 0; i < ndevices; i++, pdi++) {
1025
1026                 memset(&total, 0, sizeof(total));
1027                 pci_events = 0;
1028
1029                 if (i > 0)
1030                         fprintf(ofp, "\n");
1031
1032                 for (pci = pdi->cpus, j = 0; j < pdi->ncpus; j++, pci++) {
1033                         if (!pci->nelems)
1034                                 continue;
1035
1036                         ios = &pci->io_stats;
1037                         total.qreads += ios->qreads;
1038                         total.qwrites += ios->qwrites;
1039                         total.creads += ios->creads;
1040                         total.cwrites += ios->cwrites;
1041                         total.mreads += ios->mreads;
1042                         total.mwrites += ios->mwrites;
1043                         total.ireads += ios->ireads;
1044                         total.iwrites += ios->iwrites;
1045                         total.qread_kb += ios->qread_kb;
1046                         total.qwrite_kb += ios->qwrite_kb;
1047                         total.cread_kb += ios->cread_kb;
1048                         total.cwrite_kb += ios->cwrite_kb;
1049                         total.iread_kb += ios->iread_kb;
1050                         total.iwrite_kb += ios->iwrite_kb;
1051                         total.timer_unplugs += ios->timer_unplugs;
1052                         total.io_unplugs += ios->io_unplugs;
1053
1054                         snprintf(line, sizeof(line) - 1, "CPU%d (%s):",
1055                                  j, get_dev_name(pdi, name, sizeof(name)));
1056                         dump_io_stats(ios, line);
1057                         pci_events++;
1058                 }
1059
1060                 if (pci_events > 1) {
1061                         fprintf(ofp, "\n");
1062                         snprintf(line, sizeof(line) - 1, "Total (%s):",
1063                                  get_dev_name(pdi, name, sizeof(name)));
1064                         dump_io_stats(&total, line);
1065                 }
1066
1067                 fprintf(ofp, "\nEvents (%s): %'Lu entries, %'lu skips\n",
1068                         get_dev_name(pdi, line, sizeof(line)), pdi->events,
1069                         pdi->skips);
1070         }
1071 }
1072
1073 static struct blk_io_trace *find_trace(void *p, unsigned long offset)
1074 {
1075         unsigned long max_offset = offset;
1076         unsigned long off;
1077         struct blk_io_trace *bit;
1078         __u32 magic;
1079
1080         for (off = 0; off < max_offset; off++) {
1081                 bit = p + off;
1082
1083                 magic = be32_to_cpu(bit->magic);
1084                 if ((magic & 0xffffff00) == BLK_IO_TRACE_MAGIC)
1085                         return bit;
1086         }
1087
1088         return NULL;
1089 }
1090
1091 static int sort_entries(void)
1092 {
1093         struct per_dev_info *pdi;
1094         struct per_cpu_info *pci;
1095         struct blk_io_trace *bit;
1096         struct trace *t;
1097         int nr = 0;
1098
1099         while ((t = trace_list) != NULL) {
1100
1101                 trace_list = t->next;
1102                 bit = t->bit;
1103
1104                 memset(&t->rb_node, 0, sizeof(t->rb_node));
1105
1106                 if (verify_trace(bit))
1107                         break;
1108                 if (trace_rb_insert(t))
1109                         return -1;
1110
1111                 pdi = get_dev_info(bit->device);
1112                 pci = get_cpu_info(pdi, bit->cpu);
1113                 pci->nelems++;
1114
1115                 nr++;
1116         }
1117
1118         return nr;
1119 }
1120
1121 static void show_entries_rb(int piped)
1122 {
1123         struct per_dev_info *pdi;
1124         struct blk_io_trace *bit;
1125         struct rb_node *n;
1126         struct trace *t;
1127         int cpu;
1128
1129         while ((n = rb_first(&rb_sort_root)) != NULL) {
1130
1131                 t = rb_entry(n, struct trace, rb_node);
1132                 bit = t->bit;
1133
1134                 pdi = get_dev_info(bit->device);
1135                 if (!pdi) {
1136                         fprintf(stderr, "Unknown device ID? (%d,%d)\n",
1137                                 MAJOR(bit->device), MINOR(bit->device));
1138                         break;
1139                 }
1140                 cpu = bit->cpu;
1141                 if (cpu > pdi->ncpus) {
1142                         fprintf(stderr, "Unknown CPU ID? (%d, device %d,%d)\n",
1143                                 cpu, MAJOR(bit->device), MINOR(bit->device));
1144                         break;
1145                 }
1146
1147                 /*
1148                  * back off displaying more info if we are out of sync
1149                  * on SMP systems. to prevent stalling on lost events,
1150                  * only allow an event to us a few times
1151                  */
1152                 if (piped && bit->sequence != (pdi->last_sequence + 1)) {
1153                         if (t->skipped < 5) {
1154                                 t->skipped++;
1155                                 break;
1156                         } else {
1157                                 fprintf(stderr, "skipping from %lu to %u\n", pdi->last_sequence, bit->sequence);
1158                                 pdi->skips++;
1159                         }
1160                 }
1161
1162                 pdi->last_sequence = bit->sequence;
1163
1164                 bit->time -= genesis_time;
1165                 if (bit->time < stopwatch_start)
1166                         continue;
1167                 if (bit->time >= stopwatch_end)
1168                         break;
1169
1170                 check_time(pdi, bit);
1171
1172                 if (dump_trace(bit, &pdi->cpus[cpu], pdi))
1173                         break;
1174
1175                 rb_erase(&t->rb_node, &rb_sort_root);
1176
1177                 if (piped) {
1178                         free(bit);
1179                         free(t);
1180                 }
1181         }
1182 }
1183
1184 static int read_data(int fd, void *buffer, int bytes, int block)
1185 {
1186         int ret, bytes_left, fl;
1187         void *p;
1188
1189         fl = fcntl(fd, F_GETFL);
1190
1191         if (!block)
1192                 fcntl(fd, F_SETFL, fl | O_NONBLOCK);
1193         else
1194                 fcntl(fd, F_SETFL, fl & ~O_NONBLOCK);
1195
1196         bytes_left = bytes;
1197         p = buffer;
1198         while (bytes_left > 0) {
1199                 ret = read(fd, p, bytes_left);
1200                 if (!ret)
1201                         return 1;
1202                 else if (ret < 0) {
1203                         if (errno != EAGAIN)
1204                                 perror("read");
1205                         return -1;
1206                 } else {
1207                         p += ret;
1208                         bytes_left -= ret;
1209                 }
1210         }
1211
1212         return 0;
1213 }
1214
1215 /*
1216  * Find the traces in 'tb' and add them to the list for sorting and
1217  * displaying
1218  */
1219 static int find_entries(void *tb, unsigned long size)
1220 {
1221         struct blk_io_trace *bit;
1222         struct trace *t;
1223         void *start = tb;
1224
1225         while (tb - start <= size - sizeof(*bit)) {
1226                 bit = find_trace(tb, size - (tb - start));
1227                 if (!bit)
1228                         break;
1229
1230                 t = malloc(sizeof(*t));
1231                 memset(t, 0, sizeof(*t));
1232                 t->bit = bit;
1233
1234                 trace_to_cpu(bit);
1235                 t->next = trace_list;
1236                 trace_list = t;
1237
1238                 tb += sizeof(*bit) + bit->pdu_len;
1239         }
1240
1241         return 0;
1242 }
1243
1244 static int do_file(void)
1245 {
1246         int i, j, nfiles = 0, nelems;
1247
1248         for (i = 0; i < ndevices; i++) {
1249                 for (j = 0;; j++, nfiles++) {
1250                         struct per_dev_info *pdi;
1251                         struct per_cpu_info *pci;
1252                         struct stat st;
1253                         void *tb;
1254
1255                         pdi = &devices[i];
1256                         pci = get_cpu_info(pdi, j);
1257                         pci->cpu = j;
1258
1259                         snprintf(pci->fname, sizeof(pci->fname)-1,
1260                                  "%s_out.%d", pdi->name, j);
1261                         if (stat(pci->fname, &st) < 0)
1262                                 break;
1263                         if (!st.st_size)
1264                                 continue;
1265
1266                         printf("Processing %s\n", pci->fname);
1267
1268                         tb = malloc(st.st_size);
1269                         if (!tb) {
1270                                 fprintf(stderr, "Out of memory, skip file %s\n",
1271                                         pci->fname);
1272                                 continue;
1273                         }
1274
1275                         pci->fd = open(pci->fname, O_RDONLY);
1276                         if (pci->fd < 0) {
1277                                 perror(pci->fname);
1278                                 free(tb);
1279                                 continue;
1280                         }
1281
1282                         if (read_data(pci->fd, tb, st.st_size, 1)) {
1283                                 close(pci->fd);
1284                                 free(tb);
1285                                 continue;
1286                         }
1287
1288                         if (find_entries(tb, st.st_size)) {
1289                                 close(pci->fd);
1290                                 free(tb);
1291                         }
1292
1293                         nelems = sort_entries();
1294                         if (nelems == -1) {
1295                                 close(pci->fd);
1296                                 free(tb);
1297                                 continue;
1298                         }
1299
1300                         printf("Completed %s (CPU%d %d, entries)\n",
1301                                 pci->fname, j, nelems);
1302                         close(pci->fd);
1303                 }
1304         }
1305
1306         if (!nfiles) {
1307                 fprintf(stderr, "No files found\n");
1308                 return 1;
1309         }
1310
1311         show_entries_rb(0);
1312         return 0;
1313 }
1314
1315 static int read_sort_events(int fd)
1316 {
1317         int events = 0;
1318
1319         do {
1320                 struct blk_io_trace *bit;
1321                 struct trace *t;
1322                 int pdu_len;
1323                 __u32 magic;
1324
1325                 bit = malloc(sizeof(*bit));
1326
1327                 if (read_data(fd, bit, sizeof(*bit), !events))
1328                         break;
1329
1330                 magic = be32_to_cpu(bit->magic);
1331                 if ((magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
1332                         fprintf(stderr, "Bad magic %x\n", magic);
1333                         break;
1334                 }
1335
1336                 pdu_len = be16_to_cpu(bit->pdu_len);
1337                 if (pdu_len) {
1338                         void *ptr = realloc(bit, sizeof(*bit) + pdu_len);
1339
1340                         if (read_data(fd, ptr + sizeof(*bit), pdu_len, 1))
1341                                 break;
1342
1343                         bit = ptr;
1344                 }
1345
1346                 t = malloc(sizeof(*t));
1347                 memset(t, 0, sizeof(*t));
1348                 t->bit = bit;
1349
1350                 trace_to_cpu(bit);
1351                 t->next = trace_list;
1352                 trace_list = t;
1353
1354                 events++;
1355         } while (!is_done() && events < rb_batch);
1356
1357         return events;
1358 }
1359
1360 static int do_stdin(void)
1361 {
1362         int fd;
1363
1364         fd = dup(STDIN_FILENO);
1365         do {
1366                 int events;
1367
1368                 events = read_sort_events(fd);
1369                 if (!events)
1370                         break;
1371         
1372                 if (sort_entries() == -1)
1373                         break;
1374
1375                 show_entries_rb(1);
1376         } while (1);
1377
1378         close(fd);
1379         return 0;
1380 }
1381
1382 static void flush_output(void)
1383 {
1384         fflush(ofp);
1385 }
1386
1387 static void handle_sigint(int sig)
1388 {
1389         done = 1;
1390         flush_output();
1391 }
1392
1393 /*
1394  * Extract start and duration times from a string, allowing
1395  * us to specify a time interval of interest within a trace.
1396  * Format: "duration" (start is zero) or "start:duration".
1397  */
1398 static int find_stopwatch_interval(char *string)
1399 {
1400         double value;
1401         char *sp;
1402
1403         value = strtod(string, &sp);
1404         if (sp == string) {
1405                 fprintf(stderr,"Invalid stopwatch timer: %s\n", string);
1406                 return 1;
1407         }
1408         if (*sp == ':') {
1409                 stopwatch_start = DOUBLE_TO_NANO_ULL(value);
1410                 string = sp + 1;
1411                 value = strtod(string, &sp);
1412                 if (sp == string || *sp != '\0') {
1413                         fprintf(stderr,"Invalid stopwatch duration time: %s\n",
1414                                 string);
1415                         return 1;
1416                 }
1417         } else if (*sp != '\0') {
1418                 fprintf(stderr,"Invalid stopwatch start timer: %s\n", string);
1419                 return 1;
1420         }
1421         stopwatch_end = stopwatch_start + DOUBLE_TO_NANO_ULL(value);
1422         return 0;
1423 }
1424
1425 static void usage(char *prog)
1426 {
1427         fprintf(stderr, "Usage: %s "
1428                 "[-i <name>] [-o <output>] [-s] [-w N[:n]] <name>...\n",
1429                 prog);
1430 }
1431
1432 int main(int argc, char *argv[])
1433 {
1434         char *ofp_buffer;
1435         int c, ret, mode;
1436         int per_device_and_cpu_stats = 1;
1437
1438         while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) {
1439                 switch (c) {
1440                 case 'i':
1441                         if (!strcmp(optarg, "-") && !pipeline)
1442                                 pipeline = 1;
1443                         else if (resize_devices(optarg) != 0)
1444                                 return 1;
1445                         break;
1446                 case 'o':
1447                         output_name = optarg;
1448                         break;
1449                 case 'b':
1450                         rb_batch = atoi(optarg);
1451                         if (rb_batch <= 0)
1452                                 rb_batch = RB_BATCH_DEFAULT;
1453                         break;
1454                 case 's':
1455                         per_process_stats = 1;
1456                         break;
1457                 case 't':
1458                         track_ios = 1;
1459                         break;
1460                 case 'q':
1461                         per_device_and_cpu_stats = 0;
1462                         break;
1463                 case 'w':
1464                         if (find_stopwatch_interval(optarg) != 0)
1465                                 return 1;
1466                         break;
1467                 default:
1468                         usage(argv[0]);
1469                         return 1;
1470                 }
1471         }
1472
1473         while (optind < argc) {
1474                 if (!strcmp(argv[optind], "-") && !pipeline)
1475                         pipeline = 1;
1476                 else if (resize_devices(argv[optind]) != 0)
1477                         return 1;
1478                 optind++;
1479         }
1480
1481         if (!pipeline && !ndevices) {
1482                 usage(argv[0]);
1483                 return 1;
1484         }
1485
1486         memset(&rb_sort_root, 0, sizeof(rb_sort_root));
1487         memset(&rb_track_root, 0, sizeof(rb_track_root));
1488
1489         signal(SIGINT, handle_sigint);
1490         signal(SIGHUP, handle_sigint);
1491         signal(SIGTERM, handle_sigint);
1492
1493         setlocale(LC_NUMERIC, "en_US");
1494
1495         if (!output_name) {
1496                 ofp = fdopen(STDOUT_FILENO, "w");
1497                 mode = _IOLBF;
1498         } else {
1499                 char ofname[128];
1500
1501                 snprintf(ofname, sizeof(ofname) - 1, "%s.log", output_name);
1502                 ofp = fopen(ofname, "w");
1503                 mode = _IOFBF;
1504         }
1505
1506         if (!ofp) {
1507                 perror("fopen");
1508                 return 1;
1509         }
1510
1511         ofp_buffer = malloc(4096);      
1512         if (setvbuf(ofp, ofp_buffer, mode, 4096)) {
1513                 perror("setvbuf");
1514                 return 1;
1515         }
1516
1517         if (pipeline)
1518                 ret = do_stdin();
1519         else
1520                 ret = do_file();
1521
1522         if (per_process_stats)
1523                 show_process_stats();
1524
1525         if (per_device_and_cpu_stats)
1526                 show_device_and_cpu_stats();
1527
1528         flush_output();
1529         return ret;
1530 }