[PATCH] blkparse: fixup stats
[blktrace.git] / blkparse.c
1 /*
2  * block queue tracing parse application
3  *
4  * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
5  *
6  *  This program is free software; you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License as published by
8  *  the Free Software Foundation; either version 2 of the License, or
9  *  (at your option) any later version.
10  *
11  *  This program is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *  GNU General Public License for more details.
15  *
16  *  You should have received a copy of the GNU General Public License
17  *  along with this program; if not, write to the Free Software
18  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  *
20  */
21 #include <sys/types.h>
22 #include <sys/stat.h>
23 #include <unistd.h>
24 #include <stdio.h>
25 #include <fcntl.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <getopt.h>
29 #include <errno.h>
30 #include <signal.h>
31 #include <locale.h>
32
33 #include "blktrace.h"
34 #include "rbtree.h"
35
36 #define SECONDS(x)              ((unsigned long long)(x) / 1000000000)
37 #define NANO_SECONDS(x)         ((unsigned long long)(x) % 1000000000)
38
39 static int backwards;
40 static unsigned long long genesis_time, last_reported_time;
41
42 struct io_stats {
43         unsigned long qreads, qwrites, creads, cwrites, mreads, mwrites;
44         unsigned long ireads, iwrites;
45         unsigned long long qread_kb, qwrite_kb, cread_kb, cwrite_kb;
46         unsigned long long iread_kb, iwrite_kb;
47 };
48
49 struct per_cpu_info {
50         int cpu;
51         int nelems;
52
53         int fd;
54         char fname[128];
55
56         struct io_stats io_stats;
57 };
58
59 struct per_process_info {
60         char name[16];
61         __u32 pid;
62         struct io_stats io_stats;
63         struct per_process_info *hash_next, *list_next;
64
65         /*
66          * individual io stats
67          */
68         unsigned long long longest_allocation_wait[2];
69         unsigned long long longest_dispatch_wait[2];
70         unsigned long long longest_completion_wait[2];
71 };
72
73 #define PPI_HASH_SHIFT  (8)
74 static struct per_process_info *ppi_hash[1 << PPI_HASH_SHIFT];
75 static struct per_process_info *ppi_list;
76
77 #define S_OPTS  "i:o:b:st"
78 static struct option l_opts[] = {
79         {
80                 .name = "input",
81                 .has_arg = 1,
82                 .flag = NULL,
83                 .val = 'i'
84         },
85         {
86                 .name = "output",
87                 .has_arg = 1,
88                 .flag = NULL,
89                 .val = 'o'
90         },
91         {
92                 .name = "batch",
93                 .has_arg = 1,
94                 .flag = NULL,
95                 .val = 'b'
96         },
97         {
98                 .name = "per program stats",
99                 .has_arg = 0,
100                 .flag = NULL,
101                 .val = 's'
102         },
103         {
104                 .name = "track ios",
105                 .has_arg = 0,
106                 .flag = NULL,
107                 .val = 't'
108         },
109         {
110                 .name = NULL,
111                 .has_arg = 0,
112                 .flag = NULL,
113                 .val = 0
114         }
115 };
116
117 static struct rb_root rb_sort_root;
118 static struct rb_root rb_track_root;
119
120 /*
121  * for sorting the displayed output
122  */
123 struct trace {
124         struct blk_io_trace *bit;
125         struct rb_node rb_node;
126 };
127
128 /*
129  * for tracking individual ios
130  */
131 struct io_track {
132         struct rb_node rb_node;
133
134         __u64 sector;
135         __u32 pid;
136         unsigned long long allocation_time;
137         unsigned long long queue_time;
138         unsigned long long dispatch_time;
139         unsigned long long completion_time;
140 };
141
142 static int max_cpus;
143 static struct per_cpu_info *per_cpu_info;
144
145 static unsigned long long events;
146
147 static char *dev, *output_name;
148 static FILE *ofp;
149
150 static int per_process_stats;
151 static int track_ios;
152
153 #define RB_BATCH_DEFAULT        (1024)
154 static int rb_batch = RB_BATCH_DEFAULT;
155
156 #define is_done()       (*(volatile int *)(&done))
157 static volatile int done;
158
159 static inline unsigned long hash_long(unsigned long val)
160 {
161 #if __WORDSIZE == 32
162         val *= 0x9e370001UL;
163 #elif __WORDSIZE == 64
164         val *= 0x9e37fffffffc0001UL;
165 #else
166 #error unknown word size
167 #endif
168
169         return val >> (__WORDSIZE - PPI_HASH_SHIFT);
170 }
171
172 static inline void add_process_to_hash(struct per_process_info *ppi)
173 {
174         const int hash_idx = hash_long(ppi->pid);
175
176         ppi->hash_next = ppi_hash[hash_idx];
177         ppi_hash[hash_idx] = ppi;
178 }
179
180 static inline void add_process_to_list(struct per_process_info *ppi)
181 {
182         ppi->list_next = ppi_list;
183         ppi_list = ppi;
184 }
185
186 static struct per_process_info *find_process_by_pid(__u32 pid)
187 {
188         const int hash_idx = hash_long(pid);
189         struct per_process_info *ppi;
190
191         ppi = ppi_hash[hash_idx];
192         while (ppi) {
193                 if (ppi->pid == pid)
194                         return ppi;
195
196                 ppi = ppi->hash_next;
197         }
198
199         return NULL;
200 }
201
202 static inline int trace_rb_insert(struct trace *t)
203 {
204         struct rb_node **p = &rb_sort_root.rb_node;
205         struct rb_node *parent = NULL;
206         struct trace *__t;
207
208         while (*p) {
209                 parent = *p;
210                 __t = rb_entry(parent, struct trace, rb_node);
211
212                 if (t->bit->sequence < __t->bit->sequence)
213                         p = &(*p)->rb_left;
214                 else if (t->bit->sequence > __t->bit->sequence)
215                         p = &(*p)->rb_right;
216                 else {
217                         fprintf(stderr, "sequence alias!\n");
218                         return 1;
219                 }
220         }
221
222         rb_link_node(&t->rb_node, parent, p);
223         rb_insert_color(&t->rb_node, &rb_sort_root);
224         return 0;
225 }
226
227 static inline int track_rb_insert(struct io_track *iot)
228 {
229         struct rb_node **p = &rb_track_root.rb_node;
230         struct rb_node *parent = NULL;
231         struct io_track *__iot;
232
233         while (*p) {
234                 parent = *p;
235                 
236                 __iot = rb_entry(parent, struct io_track, rb_node);
237
238                 if (iot->sector < __iot->sector)
239                         p = &(*p)->rb_left;
240                 else if (iot->sector > __iot->sector)
241                         p = &(*p)->rb_right;
242                 else {
243                         fprintf(stderr, "sequence alias!\n");
244                         return 1;
245                 }
246         }
247
248         rb_link_node(&iot->rb_node, parent, p);
249         rb_insert_color(&iot->rb_node, &rb_track_root);
250         return 0;
251 }
252
253 static struct io_track *__find_track(__u64 sector)
254 {
255         struct rb_node **p = &rb_track_root.rb_node;
256         struct rb_node *parent = NULL;
257         struct io_track *__iot;
258
259         while (*p) {
260                 parent = *p;
261                 
262                 __iot = rb_entry(parent, struct io_track, rb_node);
263
264                 if (sector < __iot->sector)
265                         p = &(*p)->rb_left;
266                 else if (sector > __iot->sector)
267                         p = &(*p)->rb_right;
268                 else
269                         return __iot;
270         }
271
272         return NULL;
273 }
274
275 static struct io_track *find_track(__u32 pid, __u64 sector)
276 {
277         struct io_track *iot = __find_track(sector);
278
279         iot = __find_track(sector);
280         if (!iot) {
281                 iot = malloc(sizeof(*iot));
282                 iot->pid = pid;
283                 iot->sector = sector;
284                 track_rb_insert(iot);
285         }
286
287         return iot;
288 }
289
290 static void log_track_merge(struct blk_io_trace *t)
291 {
292         struct io_track *iot;
293
294         if (!track_ios)
295                 return;
296         if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
297                 return;
298
299         iot = __find_track(t->sector - (t->bytes >> 10));
300         if (!iot) {
301                 fprintf(stderr, "Trying to merge on non-existing request\n");
302                 return;
303         }
304
305         rb_erase(&iot->rb_node, &rb_track_root);
306         iot->sector -= t->bytes >> 10;
307         track_rb_insert(iot);
308 }
309
310 static void log_track_getrq(struct blk_io_trace *t)
311 {
312         struct io_track *iot;
313
314         if (!track_ios)
315                 return;
316
317         iot = find_track(t->pid, t->sector);
318         iot->allocation_time = t->time;
319 }
320
321
322 /*
323  * return time between rq allocation and queue
324  */
325 static unsigned long long log_track_queue(struct blk_io_trace *t)
326 {
327         unsigned long long elapsed;
328         struct io_track *iot;
329
330         if (!track_ios)
331                 return -1;
332
333         iot = find_track(t->pid, t->sector);
334         iot->queue_time = t->time;
335         elapsed = iot->queue_time - iot->allocation_time;
336
337         if (per_process_stats) {
338                 struct per_process_info *ppi = find_process_by_pid(iot->pid);
339                 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
340
341                 if (ppi && elapsed > ppi->longest_allocation_wait[w])
342                         ppi->longest_allocation_wait[w] = elapsed;
343         }
344
345         return elapsed;
346 }
347
348 /*
349  * return time between queue and issue
350  */
351 static unsigned long long log_track_issue(struct blk_io_trace *t)
352 {
353         unsigned long long elapsed;
354         struct io_track *iot;
355
356         if (!track_ios)
357                 return -1;
358         if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
359                 return -1;
360
361         iot = __find_track(t->sector);
362         if (!iot) {
363                 fprintf(stderr, "Trying to issue on non-existing request\n");
364                 return -1;
365         }
366
367         iot->dispatch_time = t->time;
368         elapsed = iot->dispatch_time - iot->queue_time;
369
370         if (per_process_stats) {
371                 struct per_process_info *ppi = find_process_by_pid(iot->pid);
372                 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
373
374                 if (ppi && elapsed > ppi->longest_dispatch_wait[w])
375                         ppi->longest_dispatch_wait[w] = elapsed;
376         }
377
378         return elapsed;
379 }
380
381 /*
382  * return time between dispatch and complete
383  */
384 static unsigned long long log_track_complete(struct blk_io_trace *t)
385 {
386         unsigned long long elapsed;
387         struct io_track *iot;
388
389         if (!track_ios)
390                 return -1;
391         if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
392                 return -1;
393
394         iot = __find_track(t->sector);
395         if (!iot) {
396                 fprintf(stderr, "Trying to dispatch on non-existing request\n");
397                 return -1;
398         }
399
400         iot->completion_time = t->time;
401         elapsed = iot->completion_time - iot->dispatch_time;
402
403         if (per_process_stats) {
404                 struct per_process_info *ppi = find_process_by_pid(iot->pid);
405                 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
406
407                 if (ppi && elapsed > ppi->longest_completion_wait[w])
408                         ppi->longest_completion_wait[w] = elapsed;
409         }
410
411         /*
412          * kill the trace, we don't need it after completion
413          */
414         rb_erase(&iot->rb_node, &rb_track_root);
415         free(iot);
416
417         return elapsed;
418 }
419
420
421 static struct io_stats *find_process_io_stats(__u32 pid, char *name)
422 {
423         struct per_process_info *ppi = find_process_by_pid(pid);
424
425         if (!ppi) {
426                 ppi = malloc(sizeof(*ppi));
427                 memset(ppi, 0, sizeof(*ppi));
428                 strncpy(ppi->name, name, sizeof(ppi->name));
429                 ppi->pid = pid;
430                 add_process_to_hash(ppi);
431                 add_process_to_list(ppi);
432         }
433
434         return &ppi->io_stats;
435 }
436
437 static void resize_cpu_info(int cpuid)
438 {
439         int new_space, new_max = cpuid + 1;
440         char *new_start;
441
442         per_cpu_info = realloc(per_cpu_info, new_max * sizeof(*per_cpu_info));
443         if (!per_cpu_info) {
444                 fprintf(stderr, "Cannot allocate CPU info -- %d x %d bytes\n",
445                         new_max, (int) sizeof(*per_cpu_info));
446                 exit(1);
447         }
448
449         new_start = (char *)per_cpu_info + (max_cpus * sizeof(*per_cpu_info));
450         new_space = (new_max - max_cpus) * sizeof(*per_cpu_info);
451         memset(new_start, 0, new_space);
452         max_cpus = new_max;
453 }
454
455 static struct per_cpu_info *get_cpu_info(int cpu)
456 {
457         struct per_cpu_info *pci;
458
459         if (cpu >= max_cpus)
460                 resize_cpu_info(cpu);
461
462         /*
463          * ->cpu might already be set, but just set it unconditionally
464          */
465         pci = &per_cpu_info[cpu];
466         pci->cpu = cpu;
467
468         return pci;
469 }
470
471 static inline void check_time(struct blk_io_trace *bit)
472 {
473         unsigned long long this = bit->time;
474         unsigned long long last = last_reported_time;
475
476         backwards = (this < last) ? 'B' : ' ';
477         last_reported_time = this;
478 }
479
480 static inline void __account_m(struct io_stats *ios, struct blk_io_trace *t,
481                                int rw)
482 {
483         if (rw) {
484                 ios->mwrites++;
485                 ios->qwrite_kb += t->bytes >> 10;
486         } else {
487                 ios->mreads++;
488                 ios->qread_kb += t->bytes >> 10;
489         }
490 }
491
492 static inline void account_m(struct blk_io_trace *t, struct per_cpu_info *pci,
493                              int rw)
494 {
495         __account_m(&pci->io_stats, t, rw);
496
497         if (per_process_stats) {
498                 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
499
500                 __account_m(ios, t, rw);
501         }
502 }
503
504 static inline void __account_q(struct io_stats *ios, struct blk_io_trace *t,
505                                int rw)
506 {
507         if (rw) {
508                 ios->qwrites++;
509                 ios->qwrite_kb += t->bytes >> 10;
510         } else {
511                 ios->qreads++;
512                 ios->qread_kb += t->bytes >> 10;
513         }
514 }
515
516 static inline void account_q(struct blk_io_trace *t, struct per_cpu_info *pci,
517                              int rw)
518 {
519         __account_q(&pci->io_stats, t, rw);
520
521         if (per_process_stats) {
522                 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
523
524                 __account_q(ios, t, rw);
525         }
526 }
527
528 static inline void __account_c(struct io_stats *ios, int rw, unsigned int bytes)
529 {
530         if (rw) {
531                 ios->cwrites++;
532                 ios->cwrite_kb += bytes >> 10;
533         } else {
534                 ios->creads++;
535                 ios->cread_kb += bytes >> 10;
536         }
537 }
538
539 static inline void account_c(struct blk_io_trace *t, struct per_cpu_info *pci,
540                              int rw, int bytes)
541 {
542         __account_c(&pci->io_stats, rw, bytes);
543
544         if (per_process_stats) {
545                 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
546
547                 __account_c(ios, rw, bytes);
548         }
549 }
550
551 static inline void __account_i(struct io_stats *ios, int rw, unsigned int bytes)
552 {
553         if (rw) {
554                 ios->iwrites++;
555                 ios->iwrite_kb += bytes >> 10;
556         } else {
557                 ios->ireads++;
558                 ios->iread_kb += bytes >> 10;
559         }
560 }
561
562 static inline void account_i(struct blk_io_trace *t, struct per_cpu_info *pci,
563                              int rw)
564 {
565         __account_i(&pci->io_stats, rw, t->bytes);
566
567         if (per_process_stats) {
568                 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
569
570                 __account_i(ios, rw, t->bytes);
571         }
572 }
573
574 static void output(struct per_cpu_info *pci, char *s)
575 {
576         fprintf(ofp, "%s", s);
577 }
578
579 static char hstring[256];
580 static char tstring[256];
581
582 static inline char *setup_header(struct per_cpu_info *pci,
583                                  struct blk_io_trace *t, char act)
584 {
585         int w = t->action & BLK_TC_ACT(BLK_TC_WRITE);
586         int b = t->action & BLK_TC_ACT(BLK_TC_BARRIER);
587         int s = t->action & BLK_TC_ACT(BLK_TC_SYNC);
588         char rwbs[4];
589         int i = 0;
590
591         if (w)
592                 rwbs[i++] = 'W';
593         else
594                 rwbs[i++] = 'R';
595         if (b)
596                 rwbs[i++] = 'B';
597         if (s)
598                 rwbs[i++] = 'S';
599
600         rwbs[i] = '\0';
601
602         sprintf(hstring, "%2d %8ld %5Lu.%09Lu %5u %c %3s",
603                 pci->cpu,
604                 (unsigned long)t->sequence, SECONDS(t->time), 
605                 NANO_SECONDS(t->time), t->pid, act, rwbs);
606
607         return hstring;
608 }
609
610 static void log_complete(struct per_cpu_info *pci, struct blk_io_trace *t,
611                          char act)
612 {
613         unsigned long long elapsed = log_track_complete(t);
614
615         if (elapsed != -1ULL) {
616                 unsigned long usec = elapsed / 1000;
617
618                 sprintf(tstring,"%s %Lu + %u (%8lu) [%d]\n",
619                         setup_header(pci, t, act),
620                         (unsigned long long)t->sector, t->bytes >> 9,
621                         usec, t->error);
622         } else {
623                 sprintf(tstring,"%s %Lu + %u [%d]\n", setup_header(pci, t, act),
624                         (unsigned long long)t->sector, t->bytes >> 9, t->error);
625         }
626         
627         output(pci, tstring);
628 }
629
630 static void log_queue(struct per_cpu_info *pci, struct blk_io_trace *t,
631                       char act)
632 {
633         unsigned long long elapsed = log_track_queue(t);
634
635         if (elapsed != -1ULL) {
636                 unsigned long usec = elapsed / 1000;
637
638                 sprintf(tstring,"%s %Lu + %u (%8lu) [%s]\n",
639                         setup_header(pci, t, act),
640                         (unsigned long long)t->sector, t->bytes >> 9,
641                         usec, t->comm);
642         } else {
643                 sprintf(tstring,"%s %Lu + %u [%s]\n", setup_header(pci, t, act),
644                         (unsigned long long)t->sector, t->bytes >> 9, t->comm);
645         }
646         output(pci, tstring);
647 }
648
649 static void log_issue(struct per_cpu_info *pci, struct blk_io_trace *t,
650                       char act)
651 {
652         unsigned long long elapsed = log_track_issue(t);
653
654         if (elapsed != -1ULL) {
655                 double usec = (double) elapsed / 1000;
656
657                 sprintf(tstring,"%s %Lu + %u (%8.2f) [%s]\n",
658                         setup_header(pci, t, act),
659                         (unsigned long long)t->sector, t->bytes >> 9,
660                         usec, t->comm);
661         } else {
662                 sprintf(tstring,"%s %Lu + %u [%s]\n", setup_header(pci, t, act),
663                         (unsigned long long)t->sector, t->bytes >> 9, t->comm);
664         }
665
666         output(pci, tstring);
667 }
668
669 static void log_merge(struct per_cpu_info *pci, struct blk_io_trace *t,
670                       char act)
671 {
672         log_track_merge(t);
673
674         sprintf(tstring,"%s %Lu + %u [%s]\n", setup_header(pci, t, act),
675                 (unsigned long long)t->sector, t->bytes >> 9, t->comm);
676         output(pci, tstring);
677 }
678
679 static void log_generic(struct per_cpu_info *pci, struct blk_io_trace *t,
680                         char act)
681 {
682         sprintf(tstring,"%s %Lu + %u [%s]\n", setup_header(pci, t, act),
683                 (unsigned long long)t->sector, t->bytes >> 9, t->comm);
684         output(pci, tstring);
685 }
686
687 static int log_pc(struct per_cpu_info *pci, struct blk_io_trace *t, char act)
688 {
689         unsigned char *buf;
690         int i;
691
692         sprintf(tstring,"%s ", setup_header(pci, t, act));
693         output(pci, tstring);
694
695         buf = (unsigned char *) t + sizeof(*t);
696         for (i = 0; i < t->pdu_len; i++) {
697                 sprintf(tstring,"%02x ", buf[i]);
698                 output(pci, tstring);
699         }
700
701         if (act == 'C') {
702                 sprintf(tstring,"[%d]\n", t->error);
703                 output(pci, tstring);
704         } else {
705                 sprintf(tstring,"[%s]\n", t->comm);
706                 output(pci, tstring);
707         }
708         return 0;
709 }
710
711 static int dump_trace_pc(struct blk_io_trace *t, struct per_cpu_info *pci)
712 {
713         int ret = 0;
714
715         switch (t->action & 0xffff) {
716                 case __BLK_TA_QUEUE:
717                         log_generic(pci, t, 'Q');
718                         break;
719                 case __BLK_TA_GETRQ:
720                         log_generic(pci, t, 'G');
721                         break;
722                 case __BLK_TA_SLEEPRQ:
723                         log_generic(pci, t, 'S');
724                         break;
725                 case __BLK_TA_REQUEUE:
726                         log_generic(pci, t, 'R');
727                         break;
728                 case __BLK_TA_ISSUE:
729                         ret = log_pc(pci, t, 'D');
730                         break;
731                 case __BLK_TA_COMPLETE:
732                         log_pc(pci, t, 'C');
733                         break;
734                 default:
735                         fprintf(stderr, "Bad pc action %x\n", t->action);
736                         ret = 1;
737                         break;
738         }
739         
740         return ret;
741 }
742
743 static void dump_trace_fs(struct blk_io_trace *t, struct per_cpu_info *pci)
744 {
745         int w = t->action & BLK_TC_ACT(BLK_TC_WRITE);
746         int act = t->action & 0xffff;
747
748         switch (act) {
749                 case __BLK_TA_QUEUE:
750                         account_q(t, pci, w);
751                         log_queue(pci, t, 'Q');
752                         break;
753                 case __BLK_TA_BACKMERGE:
754                         account_m(t, pci, w);
755                         log_merge(pci, t, 'M');
756                         break;
757                 case __BLK_TA_FRONTMERGE:
758                         account_m(t, pci, w);
759                         log_merge(pci, t, 'F');
760                         break;
761                 case __BLK_TA_GETRQ:
762                         log_track_getrq(t);
763                         log_generic(pci, t, 'G');
764                         break;
765                 case __BLK_TA_SLEEPRQ:
766                         log_generic(pci, t, 'S');
767                         break;
768                 case __BLK_TA_REQUEUE:
769                         account_c(t, pci, w, -t->bytes);
770                         log_queue(pci, t, 'R');
771                         break;
772                 case __BLK_TA_ISSUE:
773                         account_i(t, pci, w);
774                         log_issue(pci, t, 'D');
775                         break;
776                 case __BLK_TA_COMPLETE:
777                         account_c(t, pci, w, t->bytes);
778                         log_complete(pci, t, 'C');
779                         break;
780                 default:
781                         fprintf(stderr, "Bad fs action %x\n", t->action);
782                         break;
783         }
784 }
785
786 static int dump_trace(struct blk_io_trace *t, struct per_cpu_info *pci)
787 {
788         int ret = 0;
789
790         if (t->action & BLK_TC_ACT(BLK_TC_PC))
791                 ret = dump_trace_pc(t, pci);
792         else
793                 dump_trace_fs(t, pci);
794
795         events++;
796         return ret;
797 }
798
799 static void dump_io_stats(struct io_stats *ios, char *msg)
800 {
801         fprintf(ofp, "%s\n", msg);
802
803         fprintf(ofp, " Reads Queued:    %'8lu, %'8LuKiB\t", ios->qreads, ios->qread_kb);
804         fprintf(ofp, " Writes Queued:    %'8lu, %'8LuKiB\n", ios->qwrites,ios->qwrite_kb);
805
806         fprintf(ofp, " Read Dispatches: %'8lu, %'8LuKiB\t", ios->ireads, ios->iread_kb);
807         fprintf(ofp, " Write Dispatches: %'8lu, %'8LuKiB\n", ios->iwrites,ios->iwrite_kb);
808         fprintf(ofp, " Reads Completed: %'8lu, %'8LuKiB\t", ios->creads, ios->cread_kb);
809         fprintf(ofp, " Writes Completed: %'8lu, %'8LuKiB\n", ios->cwrites,ios->cwrite_kb);
810         fprintf(ofp, " Read Merges:     %'8lu%8c\t", ios->mreads, ' ');
811
812         fprintf(ofp, " Write Merges:     %'8lu\n", ios->mwrites);
813 }
814
815 static void dump_wait_stats(struct per_process_info *ppi)
816 {
817         unsigned long rawait = ppi->longest_allocation_wait[0] / 1000;
818         unsigned long rdwait = ppi->longest_dispatch_wait[0] / 1000;
819         unsigned long rcwait = ppi->longest_completion_wait[0] / 1000;
820         unsigned long wawait = ppi->longest_allocation_wait[1] / 1000;
821         unsigned long wdwait = ppi->longest_dispatch_wait[1] / 1000;
822         unsigned long wcwait = ppi->longest_completion_wait[1] / 1000;
823
824         fprintf(ofp, " Allocation wait: %'8lu%8c\t", rawait, ' ');
825         fprintf(ofp, " Allocation wait:  %'8lu\n", wawait);
826         fprintf(ofp, " Dispatch wait:   %'8lu%8c\t", rdwait, ' ');
827         fprintf(ofp, " Dispatch wait:    %'8lu\n", wdwait);
828         fprintf(ofp, " Completion wait: %'8lu%8c\t", rcwait, ' ');
829         fprintf(ofp, " Completion wait:  %'8lu\n", wcwait);
830 }
831
832 static void show_process_stats(void)
833 {
834         struct per_process_info *ppi;
835
836         ppi = ppi_list;
837         while (ppi) {
838                 dump_io_stats(&ppi->io_stats, ppi->name);
839                 dump_wait_stats(ppi);
840                 ppi = ppi->list_next;
841         }
842
843         fprintf(ofp, "\n");
844 }
845
846 static void show_cpu_stats(void)
847 {
848         struct per_cpu_info foo, *pci;
849         struct io_stats *ios;
850         int i, pci_events = 0;
851
852         memset(&foo, 0, sizeof(foo));
853
854         for (i = 0; i < max_cpus; i++) {
855                 char cpu[8];
856
857                 pci = &per_cpu_info[i];
858                 ios = &pci->io_stats;
859
860                 if (!pci->nelems)
861                         continue;
862
863                 foo.io_stats.qreads += ios->qreads;
864                 foo.io_stats.qwrites += ios->qwrites;
865                 foo.io_stats.creads += ios->creads;
866                 foo.io_stats.cwrites += ios->cwrites;
867                 foo.io_stats.mreads += ios->mreads;
868                 foo.io_stats.mwrites += ios->mwrites;
869                 foo.io_stats.ireads += ios->ireads;
870                 foo.io_stats.iwrites += ios->iwrites;
871                 foo.io_stats.qread_kb += ios->qread_kb;
872                 foo.io_stats.qwrite_kb += ios->qwrite_kb;
873                 foo.io_stats.cread_kb += ios->cread_kb;
874                 foo.io_stats.cwrite_kb += ios->cwrite_kb;
875                 foo.io_stats.iread_kb += ios->iread_kb;
876                 foo.io_stats.iwrite_kb += ios->iwrite_kb;
877
878                 snprintf(cpu, sizeof(cpu) - 1, "CPU%d:", i);
879                 dump_io_stats(ios, cpu);
880                 pci_events++;
881         }
882
883         if (pci_events > 1) {
884                 fprintf(ofp, "\n");
885                 dump_io_stats(&foo.io_stats, "Total:");
886         }
887
888         fprintf(ofp, "\nEvents: %'Lu\n", events);
889 }
890
891 #define min(a, b)       ((a) < (b) ? (a) : (b))
892
893 static struct blk_io_trace *find_trace(void *p, unsigned long offset, int nr)
894 {
895         unsigned long max_offset = min(offset,nr * sizeof(struct blk_io_trace));
896         unsigned long off;
897         struct blk_io_trace *bit;
898         __u32 magic;
899
900         for (off = 0; off < max_offset; off++) {
901                 bit = p + off;
902
903                 magic = be32_to_cpu(bit->magic);
904                 if ((magic & 0xffffff00) == BLK_IO_TRACE_MAGIC)
905                         return bit;
906         }
907
908         return NULL;
909 }
910
911 static int sort_entries(void *traces, unsigned long offset, int nr)
912 {
913         struct per_cpu_info *pci;
914         struct blk_io_trace *bit;
915         struct trace *t;
916         void *start = traces;
917         int nelems = 0;
918
919         while (traces - start <= offset - sizeof(*bit)) {
920                 if (!nr)
921                         break;
922
923                 bit = find_trace(traces, offset - (traces - start), nr);
924                 if (!bit)
925                         break;
926
927                 t = malloc(sizeof(*t));
928                 t->bit = bit;
929                 memset(&t->rb_node, 0, sizeof(t->rb_node));
930
931                 trace_to_cpu(bit);
932
933                 if (verify_trace(bit))
934                         break;
935
936                 pci = get_cpu_info(bit->cpu);
937                 pci->nelems++;
938
939                 if (trace_rb_insert(t))
940                         return -1;
941
942                 traces += sizeof(*bit) + bit->pdu_len;
943                 nelems++;
944                 nr--;
945         }
946
947         return nelems;
948 }
949
950 static void free_entries_rb(void)
951 {
952         struct rb_node *n;
953
954         while ((n = rb_first(&rb_sort_root)) != NULL) {
955                 struct trace *t = rb_entry(n, struct trace, rb_node);
956
957                 rb_erase(&t->rb_node, &rb_sort_root);
958                 free(t);
959         }
960 }
961
962 static void show_entries_rb(void)
963 {
964         struct blk_io_trace *bit;
965         struct rb_node *n;
966         struct trace *t;
967         int cpu;
968
969         n = rb_first(&rb_sort_root);
970         if (!n)
971                 return;
972
973         do {
974                 t = rb_entry(n, struct trace, rb_node);
975                 bit = t->bit;
976
977                 cpu = bit->cpu;
978                 if (cpu > max_cpus) {
979                         fprintf(stderr, "CPU number too large (%d)\n", cpu);
980                         break;
981                 }
982
983                 if (genesis_time == 0)
984                         genesis_time = bit->time;
985                 bit->time -= genesis_time;
986
987                 check_time(bit);
988
989                 if (dump_trace(bit, &per_cpu_info[cpu]))
990                         break;
991
992         } while ((n = rb_next(n)) != NULL);
993 }
994
995 static int read_data(int fd, void *buffer, int bytes, int block)
996 {
997         int ret, bytes_left, fl;
998         void *p;
999
1000         fl = fcntl(fd, F_GETFL);
1001
1002         if (!block)
1003                 fcntl(fd, F_SETFL, fl | O_NONBLOCK);
1004         else
1005                 fcntl(fd, F_SETFL, fl & ~O_NONBLOCK);
1006
1007         bytes_left = bytes;
1008         p = buffer;
1009         while (bytes_left > 0) {
1010                 ret = read(fd, p, bytes_left);
1011                 if (!ret)
1012                         return 1;
1013                 else if (ret < 0) {
1014                         if (errno != EAGAIN)
1015                                 perror("read");
1016                         return -1;
1017                 } else {
1018                         p += ret;
1019                         bytes_left -= ret;
1020                 }
1021         }
1022
1023         return 0;
1024 }
1025
1026 static int do_file(void)
1027 {
1028         int i, nfiles;
1029
1030         for (i = 0, nfiles = 0;; i++, nfiles++) {
1031                 struct per_cpu_info *pci;
1032                 struct stat st;
1033                 void *tb;
1034
1035                 pci = get_cpu_info(i);
1036
1037                 snprintf(pci->fname, sizeof(pci->fname)-1,"%s_out.%d", dev, i);
1038                 if (stat(pci->fname, &st) < 0)
1039                         break;
1040                 if (!st.st_size)
1041                         continue;
1042
1043                 printf("Processing %s\n", pci->fname);
1044
1045                 tb = malloc(st.st_size);
1046
1047                 pci->fd = open(pci->fname, O_RDONLY);
1048                 if (pci->fd < 0) {
1049                         perror(pci->fname);
1050                         break;
1051                 }
1052
1053                 if (read_data(pci->fd, tb, st.st_size, 1))
1054                         break;
1055
1056                 if (sort_entries(tb, st.st_size, ~0U) == -1)
1057                         break;
1058
1059                 close(pci->fd);
1060                 printf("\t%2d %10s %15d\n", i, pci->fname, pci->nelems);
1061
1062         }
1063
1064         if (!nfiles) {
1065                 fprintf(stderr, "No files found\n");
1066                 return 1;
1067         }
1068
1069         show_entries_rb();
1070         return 0;
1071 }
1072
1073 static void resize_buffer(void **buffer, long *size, long offset)
1074 {
1075         long old_size = *size;
1076
1077         *size *= 2;
1078         *buffer = realloc(*buffer, *size);
1079         memset(*buffer + offset, 0, *size - old_size);
1080 }
1081
1082 static int read_sort_events(int fd, void **buffer, long *max_offset)
1083 {
1084         long offset;
1085         int events;
1086
1087         if (*max_offset == 0) {
1088                 *max_offset = 128 * sizeof(struct blk_io_trace);
1089                 *buffer = malloc(*max_offset);
1090         }
1091
1092         events = 0;
1093         offset = 0;
1094
1095         do {
1096                 struct blk_io_trace *t;
1097                 int pdu_len;
1098
1099                 if (*max_offset - offset < sizeof(*t))
1100                         resize_buffer(buffer, max_offset, offset);
1101
1102                 if (read_data(fd, *buffer + offset, sizeof(*t), !events)) {
1103                         if (events)
1104                                 break;
1105
1106                         usleep(1000);
1107                         continue;
1108                 }
1109
1110                 t = *buffer + offset;
1111                 offset += sizeof(*t);
1112
1113                 pdu_len = be16_to_cpu(t->pdu_len);
1114                 if (pdu_len) {
1115                         if (*max_offset - offset < pdu_len)
1116                                 resize_buffer(buffer, max_offset, offset);
1117
1118                         if (read_data(fd, *buffer + offset, pdu_len, 1))
1119                                 break;
1120
1121                         offset += pdu_len;
1122                 }
1123
1124                 events++;
1125         } while (!is_done() && events < rb_batch);
1126
1127         return events;
1128 }
1129
1130 static int do_stdin(void)
1131 {
1132         int fd;
1133         void *ptr;
1134         long max_offset;
1135
1136         fd = dup(STDIN_FILENO);
1137         max_offset = 0;
1138         do {
1139                 int events;
1140
1141                 events = read_sort_events(fd, &ptr, &max_offset);
1142                 if (!events)
1143                         break;
1144         
1145                 if (sort_entries(ptr, ~0UL, events) == -1)
1146                         break;
1147
1148                 show_entries_rb();
1149                 free_entries_rb();
1150         } while (1);
1151
1152         free(ptr);
1153         close(fd);
1154         return 0;
1155 }
1156
1157 static void flush_output(void)
1158 {
1159         fflush(ofp);
1160 }
1161
1162 static void handle_sigint(int sig)
1163 {
1164         done = 1;
1165         flush_output();
1166 }
1167
1168 static void usage(char *prog)
1169 {
1170         fprintf(stderr, "Usage: %s -i <name> [-o <output>][-s]\n", prog);
1171 }
1172
1173 int main(int argc, char *argv[])
1174 {
1175         char *ofp_buffer;
1176         int c, ret, mode;
1177
1178         while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) {
1179                 switch (c) {
1180                 case 'i':
1181                         dev = optarg;
1182                         break;
1183                 case 'o':
1184                         output_name = optarg;
1185                         break;
1186                 case 'b':
1187                         rb_batch = atoi(optarg);
1188                         if (rb_batch <= 0)
1189                                 rb_batch = RB_BATCH_DEFAULT;
1190                         break;
1191                 case 's':
1192                         per_process_stats = 1;
1193                         break;
1194                 case 't':
1195                         track_ios = 1;
1196                         break;
1197                 default:
1198                         usage(argv[0]);
1199                         return 1;
1200                 }
1201         }
1202
1203         if (!dev) {
1204                 usage(argv[0]);
1205                 return 1;
1206         }
1207
1208         memset(&rb_sort_root, 0, sizeof(rb_sort_root));
1209         memset(&rb_track_root, 0, sizeof(rb_track_root));
1210
1211         signal(SIGINT, handle_sigint);
1212         signal(SIGHUP, handle_sigint);
1213         signal(SIGTERM, handle_sigint);
1214
1215         setlocale(LC_NUMERIC, "en_US");
1216
1217         if (!output_name) {
1218                 ofp = fdopen(STDOUT_FILENO, "w");
1219                 mode = _IOLBF;
1220         } else {
1221                 char ofname[128];
1222
1223                 snprintf(ofname, sizeof(ofname) - 1, "%s.log", output_name);
1224                 ofp = fopen(ofname, "w");
1225                 mode = _IOFBF;
1226         }
1227
1228         if (!ofp) {
1229                 perror("fopen");
1230                 return 1;
1231         }
1232
1233         ofp_buffer = malloc(4096);      
1234         if (setvbuf(ofp, ofp_buffer, mode, 4096)) {
1235                 perror("setvbuf");
1236                 return 1;
1237         }
1238
1239         if (!strcmp(dev, "-"))
1240                 ret = do_stdin();
1241         else
1242                 ret = do_file();
1243
1244         if (per_process_stats)
1245                 show_process_stats();
1246
1247         show_cpu_stats();
1248
1249         flush_output();
1250         return ret;
1251 }