[PATCH] blkparse: split queue and insert into two operations
[blktrace.git] / blkparse.c
CommitLineData
d956a2cd
JA
1/*
2 * block queue tracing parse application
3 *
4 * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *
20 */
d0ca268b
JA
21#include <sys/types.h>
22#include <sys/stat.h>
23#include <unistd.h>
24#include <stdio.h>
25#include <fcntl.h>
26#include <stdlib.h>
8fc0abbc 27#include <string.h>
d5396421 28#include <getopt.h>
412819ce
JA
29#include <errno.h>
30#include <signal.h>
d69db225 31#include <locale.h>
46e6968b 32#include <limits.h>
ab197ca7 33#include <ctype.h>
d0ca268b 34
8fc0abbc
JA
35#include "blktrace.h"
36#include "rbtree.h"
d0ca268b 37
2e3e8ded
JA
38#define SECONDS(x) ((unsigned long long)(x) / 1000000000)
39#define NANO_SECONDS(x) ((unsigned long long)(x) % 1000000000)
46e6968b 40#define DOUBLE_TO_NANO_ULL(d) ((unsigned long long)((d) * 1000000000))
cfab07eb 41
e7c9f3ff
NS
42#define MINORBITS 20
43#define MINORMASK ((1U << MINORBITS) - 1)
44#define MAJOR(dev) ((unsigned int) ((dev) >> MINORBITS))
45#define MINOR(dev) ((unsigned int) ((dev) & MINORMASK))
46
47#define min(a, b) ((a) < (b) ? (a) : (b))
d5396421 48
ab197ca7
AB
49#define HEADER "%D %2c %8s %5T.%9t %5p %2a %3d "
50
152f6476
JA
51struct io_stats {
52 unsigned long qreads, qwrites, creads, cwrites, mreads, mwrites;
53 unsigned long ireads, iwrites;
54 unsigned long long qread_kb, qwrite_kb, cread_kb, cwrite_kb;
55 unsigned long long iread_kb, iwrite_kb;
06639b27 56 unsigned long io_unplugs, timer_unplugs;
152f6476
JA
57};
58
d5396421 59struct per_cpu_info {
d0ca268b
JA
60 int cpu;
61 int nelems;
d0ca268b
JA
62
63 int fd;
87b72777 64 char fname[128];
d0ca268b 65
152f6476
JA
66 struct io_stats io_stats;
67};
8fc0abbc 68
e7c9f3ff
NS
69struct per_dev_info {
70 dev_t id;
71 char *name;
72
73 int backwards;
74 unsigned long long events;
75 unsigned long long last_reported_time;
76 struct io_stats io_stats;
cb2a1a62 77 unsigned long last_sequence;
0aa8a62d 78 unsigned long skips;
e7c9f3ff
NS
79
80 int ncpus;
81 struct per_cpu_info *cpus;
82};
83
152f6476
JA
84struct per_process_info {
85 char name[16];
86 __u32 pid;
87 struct io_stats io_stats;
88 struct per_process_info *hash_next, *list_next;
50adc0ba
JA
89
90 /*
91 * individual io stats
92 */
b9d40d6f
JA
93 unsigned long long longest_allocation_wait[2];
94 unsigned long long longest_dispatch_wait[2];
95 unsigned long long longest_completion_wait[2];
d0ca268b
JA
96};
97
152f6476
JA
98#define PPI_HASH_SHIFT (8)
99static struct per_process_info *ppi_hash[1 << PPI_HASH_SHIFT];
100static struct per_process_info *ppi_list;
886ecf0e 101static int ppi_list_entries;
152f6476 102
ab197ca7 103#define S_OPTS "i:o:b:stqw:f:F:"
d5396421
JA
104static struct option l_opts[] = {
105 {
106 .name = "input",
428683db 107 .has_arg = required_argument,
d5396421
JA
108 .flag = NULL,
109 .val = 'i'
110 },
111 {
112 .name = "output",
428683db 113 .has_arg = required_argument,
d5396421
JA
114 .flag = NULL,
115 .val = 'o'
116 },
79f19470
JA
117 {
118 .name = "batch",
428683db 119 .has_arg = required_argument,
79f19470
JA
120 .flag = NULL,
121 .val = 'b'
122 },
152f6476
JA
123 {
124 .name = "per program stats",
428683db 125 .has_arg = no_argument,
152f6476
JA
126 .flag = NULL,
127 .val = 's'
128 },
7997c5b0
JA
129 {
130 .name = "track ios",
428683db 131 .has_arg = no_argument,
7997c5b0
JA
132 .flag = NULL,
133 .val = 't'
134 },
1e1c60f1
NS
135 {
136 .name = "quiet",
428683db 137 .has_arg = no_argument,
1e1c60f1
NS
138 .flag = NULL,
139 .val = 'q'
140 },
46e6968b
NS
141 {
142 .name = "stopwatch",
428683db 143 .has_arg = required_argument,
46e6968b
NS
144 .flag = NULL,
145 .val = 'w'
146 },
ab197ca7
AB
147 {
148 .name = "format",
428683db 149 .has_arg = required_argument,
ab197ca7
AB
150 .flag = NULL,
151 .val = 'f'
152 },
153 {
154 .name = "format-spec",
428683db 155 .has_arg = required_argument,
ab197ca7
AB
156 .flag = NULL,
157 .val = 'F'
158 },
d5396421
JA
159};
160
7997c5b0
JA
161/*
162 * for sorting the displayed output
163 */
8fc0abbc
JA
164struct trace {
165 struct blk_io_trace *bit;
166 struct rb_node rb_node;
cb2a1a62
JA
167 struct trace *next;
168 int skipped;
8fc0abbc
JA
169};
170
cb2a1a62
JA
171static struct rb_root rb_sort_root;
172static struct rb_root rb_track_root;
173
174static struct trace *trace_list;
175
7997c5b0
JA
176/*
177 * for tracking individual ios
178 */
179struct io_track {
180 struct rb_node rb_node;
181
e7c9f3ff 182 dev_t device;
7997c5b0
JA
183 __u64 sector;
184 __u32 pid;
95c15013 185 unsigned long long allocation_time;
7997c5b0
JA
186 unsigned long long queue_time;
187 unsigned long long dispatch_time;
188 unsigned long long completion_time;
189};
190
e7c9f3ff
NS
191static int ndevices;
192static struct per_dev_info *devices;
193static char *get_dev_name(struct per_dev_info *, char *, int);
d0ca268b 194
152f6476 195static FILE *ofp;
e7c9f3ff
NS
196static char *output_name;
197
198static unsigned long long genesis_time;
46e6968b
NS
199static unsigned long long stopwatch_start; /* start from zero by default */
200static unsigned long long stopwatch_end = ULONG_LONG_MAX; /* "infinity" */
152f6476
JA
201
202static int per_process_stats;
7997c5b0 203static int track_ios;
d0ca268b 204
79f19470
JA
205#define RB_BATCH_DEFAULT (1024)
206static int rb_batch = RB_BATCH_DEFAULT;
207
e7c9f3ff
NS
208static int pipeline;
209
412819ce
JA
210#define is_done() (*(volatile int *)(&done))
211static volatile int done;
212
152f6476
JA
213static inline unsigned long hash_long(unsigned long val)
214{
16ef714e
JA
215#if __WORDSIZE == 32
216 val *= 0x9e370001UL;
217#elif __WORDSIZE == 64
218 val *= 0x9e37fffffffc0001UL;
219#else
220#error unknown word size
221#endif
222
223 return val >> (__WORDSIZE - PPI_HASH_SHIFT);
152f6476
JA
224}
225
226static inline void add_process_to_hash(struct per_process_info *ppi)
227{
228 const int hash_idx = hash_long(ppi->pid);
229
230 ppi->hash_next = ppi_hash[hash_idx];
231 ppi_hash[hash_idx] = ppi;
232}
233
234static inline void add_process_to_list(struct per_process_info *ppi)
235{
236 ppi->list_next = ppi_list;
237 ppi_list = ppi;
886ecf0e 238 ppi_list_entries++;
152f6476
JA
239}
240
241static struct per_process_info *find_process_by_pid(__u32 pid)
242{
243 const int hash_idx = hash_long(pid);
244 struct per_process_info *ppi;
245
246 ppi = ppi_hash[hash_idx];
247 while (ppi) {
248 if (ppi->pid == pid)
249 return ppi;
250
251 ppi = ppi->hash_next;
252 }
253
254 return NULL;
255}
256
7997c5b0
JA
257static inline int trace_rb_insert(struct trace *t)
258{
259 struct rb_node **p = &rb_sort_root.rb_node;
260 struct rb_node *parent = NULL;
261 struct trace *__t;
262
e7c9f3ff
NS
263 if (genesis_time == 0 || t->bit->time < genesis_time)
264 genesis_time = t->bit->time;
265
7997c5b0
JA
266 while (*p) {
267 parent = *p;
268 __t = rb_entry(parent, struct trace, rb_node);
269
e7c9f3ff
NS
270 if (t->bit->time < __t->bit->time)
271 p = &(*p)->rb_left;
272 else if (t->bit->time > __t->bit->time)
273 p = &(*p)->rb_right;
274 else if (t->bit->device < __t->bit->device)
275 p = &(*p)->rb_left;
276 else if (t->bit->device > __t->bit->device)
277 p = &(*p)->rb_right;
278 else if (t->bit->sequence < __t->bit->sequence)
7997c5b0
JA
279 p = &(*p)->rb_left;
280 else if (t->bit->sequence > __t->bit->sequence)
281 p = &(*p)->rb_right;
e7c9f3ff
NS
282 else if (t->bit->device == __t->bit->device) {
283 fprintf(stderr,
284 "sequence alias (%d) on device %d,%d!\n",
285 t->bit->sequence,
286 MAJOR(t->bit->device), MINOR(t->bit->device));
7997c5b0
JA
287 return 1;
288 }
289 }
290
291 rb_link_node(&t->rb_node, parent, p);
292 rb_insert_color(&t->rb_node, &rb_sort_root);
293 return 0;
294}
295
296static inline int track_rb_insert(struct io_track *iot)
297{
298 struct rb_node **p = &rb_track_root.rb_node;
299 struct rb_node *parent = NULL;
300 struct io_track *__iot;
301
302 while (*p) {
303 parent = *p;
e7c9f3ff 304
7997c5b0
JA
305 __iot = rb_entry(parent, struct io_track, rb_node);
306
e7c9f3ff
NS
307 if (iot->device < __iot->device)
308 p = &(*p)->rb_left;
309 else if (iot->device > __iot->device)
310 p = &(*p)->rb_right;
311 else if (iot->sector < __iot->sector)
7997c5b0
JA
312 p = &(*p)->rb_left;
313 else if (iot->sector > __iot->sector)
314 p = &(*p)->rb_right;
315 else {
e7c9f3ff 316 fprintf(stderr,
ab197ca7
AB
317 "sector alias (%Lu) on device %d,%d!\n",
318 (unsigned long long) iot->sector,
e7c9f3ff 319 MAJOR(iot->device), MINOR(iot->device));
7997c5b0
JA
320 return 1;
321 }
322 }
323
324 rb_link_node(&iot->rb_node, parent, p);
325 rb_insert_color(&iot->rb_node, &rb_track_root);
326 return 0;
327}
328
e7c9f3ff 329static struct io_track *__find_track(dev_t device, __u64 sector)
7997c5b0
JA
330{
331 struct rb_node **p = &rb_track_root.rb_node;
332 struct rb_node *parent = NULL;
333 struct io_track *__iot;
334
335 while (*p) {
336 parent = *p;
337
338 __iot = rb_entry(parent, struct io_track, rb_node);
339
e7c9f3ff
NS
340 if (device < __iot->device)
341 p = &(*p)->rb_left;
342 else if (device > __iot->device)
343 p = &(*p)->rb_right;
344 else if (sector < __iot->sector)
7997c5b0
JA
345 p = &(*p)->rb_left;
346 else if (sector > __iot->sector)
347 p = &(*p)->rb_right;
348 else
349 return __iot;
350 }
351
352 return NULL;
353}
354
e7c9f3ff 355static struct io_track *find_track(__u32 pid, dev_t device, __u64 sector)
7997c5b0 356{
916b5501 357 struct io_track *iot;
7997c5b0 358
e7c9f3ff 359 iot = __find_track(device, sector);
7997c5b0
JA
360 if (!iot) {
361 iot = malloc(sizeof(*iot));
50adc0ba 362 iot->pid = pid;
e7c9f3ff 363 iot->device = device;
7997c5b0
JA
364 iot->sector = sector;
365 track_rb_insert(iot);
366 }
367
368 return iot;
369}
370
a01516de 371static void log_track_frontmerge(struct blk_io_trace *t)
2e3e8ded
JA
372{
373 struct io_track *iot;
374
375 if (!track_ios)
376 return;
2e3e8ded 377
a01516de 378 iot = __find_track(t->device, t->sector + (t->bytes >> 9));
cb2a1a62
JA
379 if (!iot) {
380 fprintf(stderr, "failed to find mergeable event\n");
381 return;
2e3e8ded 382 }
cb2a1a62
JA
383
384 rb_erase(&iot->rb_node, &rb_track_root);
a01516de 385 iot->sector -= t->bytes >> 9;
cb2a1a62 386 track_rb_insert(iot);
2e3e8ded
JA
387}
388
95c15013 389static void log_track_getrq(struct blk_io_trace *t)
2e3e8ded
JA
390{
391 struct io_track *iot;
392
393 if (!track_ios)
394 return;
395
e7c9f3ff 396 iot = find_track(t->pid, t->device, t->sector);
95c15013
JA
397 iot->allocation_time = t->time;
398}
399
400
401/*
b6076a9b 402 * return time between rq allocation and insertion
95c15013 403 */
b6076a9b 404static unsigned long long log_track_insert(struct blk_io_trace *t)
95c15013 405{
50adc0ba 406 unsigned long long elapsed;
95c15013
JA
407 struct io_track *iot;
408
409 if (!track_ios)
410 return -1;
411
e7c9f3ff 412 iot = find_track(t->pid, t->device, t->sector);
2e3e8ded 413 iot->queue_time = t->time;
50adc0ba
JA
414 elapsed = iot->queue_time - iot->allocation_time;
415
416 if (per_process_stats) {
417 struct per_process_info *ppi = find_process_by_pid(iot->pid);
b9d40d6f 418 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
50adc0ba 419
b9d40d6f
JA
420 if (ppi && elapsed > ppi->longest_allocation_wait[w])
421 ppi->longest_allocation_wait[w] = elapsed;
50adc0ba
JA
422 }
423
424 return elapsed;
2e3e8ded
JA
425}
426
427/*
428 * return time between queue and issue
429 */
430static unsigned long long log_track_issue(struct blk_io_trace *t)
431{
50adc0ba 432 unsigned long long elapsed;
2e3e8ded
JA
433 struct io_track *iot;
434
435 if (!track_ios)
436 return -1;
437 if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
438 return -1;
439
e7c9f3ff 440 iot = __find_track(t->device, t->sector);
cb2a1a62
JA
441 if (!iot) {
442 fprintf(stderr, "failed to find issue event\n");
2e3e8ded 443 return -1;
cb2a1a62 444 }
2e3e8ded
JA
445
446 iot->dispatch_time = t->time;
50adc0ba
JA
447 elapsed = iot->dispatch_time - iot->queue_time;
448
449 if (per_process_stats) {
450 struct per_process_info *ppi = find_process_by_pid(iot->pid);
b9d40d6f 451 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
50adc0ba 452
b9d40d6f
JA
453 if (ppi && elapsed > ppi->longest_dispatch_wait[w])
454 ppi->longest_dispatch_wait[w] = elapsed;
50adc0ba
JA
455 }
456
457 return elapsed;
2e3e8ded
JA
458}
459
460/*
461 * return time between dispatch and complete
462 */
463static unsigned long long log_track_complete(struct blk_io_trace *t)
464{
465 unsigned long long elapsed;
466 struct io_track *iot;
467
468 if (!track_ios)
469 return -1;
470 if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
471 return -1;
472
e7c9f3ff 473 iot = __find_track(t->device, t->sector);
cb2a1a62
JA
474 if (!iot) {
475 fprintf(stderr, "failed to find complete event\n");
2e3e8ded 476 return -1;
cb2a1a62 477 }
2e3e8ded
JA
478
479 iot->completion_time = t->time;
480 elapsed = iot->completion_time - iot->dispatch_time;
481
50adc0ba
JA
482 if (per_process_stats) {
483 struct per_process_info *ppi = find_process_by_pid(iot->pid);
b9d40d6f 484 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
50adc0ba 485
b9d40d6f
JA
486 if (ppi && elapsed > ppi->longest_completion_wait[w])
487 ppi->longest_completion_wait[w] = elapsed;
50adc0ba
JA
488 }
489
2e3e8ded
JA
490 /*
491 * kill the trace, we don't need it after completion
492 */
493 rb_erase(&iot->rb_node, &rb_track_root);
494 free(iot);
495
496 return elapsed;
497}
498
499
152f6476
JA
500static struct io_stats *find_process_io_stats(__u32 pid, char *name)
501{
502 struct per_process_info *ppi = find_process_by_pid(pid);
503
504 if (!ppi) {
505 ppi = malloc(sizeof(*ppi));
506 memset(ppi, 0, sizeof(*ppi));
507 strncpy(ppi->name, name, sizeof(ppi->name));
508 ppi->pid = pid;
509 add_process_to_hash(ppi);
510 add_process_to_list(ppi);
511 }
512
513 return &ppi->io_stats;
514}
515
e7c9f3ff
NS
516
517static void resize_cpu_info(struct per_dev_info *pdi, int cpu)
a718bd37 518{
e7c9f3ff
NS
519 struct per_cpu_info *cpus = pdi->cpus;
520 int ncpus = pdi->ncpus;
521 int new_count = cpu + 1;
522 int new_space, size;
a718bd37
NS
523 char *new_start;
524
e7c9f3ff
NS
525 size = new_count * sizeof(struct per_cpu_info);
526 cpus = realloc(cpus, size);
527 if (!cpus) {
528 char name[20];
529 fprintf(stderr, "Out of memory, CPU info for device %s (%d)\n",
530 get_dev_name(pdi, name, sizeof(name)), size);
a718bd37
NS
531 exit(1);
532 }
533
e7c9f3ff
NS
534 new_start = (char *)cpus + (ncpus * sizeof(struct per_cpu_info));
535 new_space = (new_count - ncpus) * sizeof(struct per_cpu_info);
a718bd37 536 memset(new_start, 0, new_space);
e7c9f3ff
NS
537
538 pdi->ncpus = new_count;
539 pdi->cpus = cpus;
540}
cb2a1a62 541
e7c9f3ff
NS
542static struct per_cpu_info *get_cpu_info(struct per_dev_info *pdi, int cpu)
543{
cb2a1a62
JA
544 struct per_cpu_info *pci;
545
e7c9f3ff
NS
546 if (cpu >= pdi->ncpus)
547 resize_cpu_info(pdi, cpu);
cb2a1a62
JA
548
549 pci = &pdi->cpus[cpu];
550 pci->cpu = cpu;
551 return pci;
a718bd37
NS
552}
553
e7c9f3ff
NS
554
555static int resize_devices(char *name)
a718bd37 556{
e7c9f3ff 557 int size = (ndevices + 1) * sizeof(struct per_dev_info);
c499bf38 558
e7c9f3ff
NS
559 devices = realloc(devices, size);
560 if (!devices) {
561 fprintf(stderr, "Out of memory, device %s (%d)\n", name, size);
562 return 1;
563 }
564 memset(&devices[ndevices], 0, sizeof(struct per_dev_info));
565 devices[ndevices].name = name;
566 ndevices++;
567 return 0;
568}
a718bd37 569
cb2a1a62 570static struct per_dev_info *get_dev_info(dev_t id)
e7c9f3ff 571{
cb2a1a62 572 struct per_dev_info *pdi;
e7c9f3ff 573 int i;
c499bf38 574
e7c9f3ff
NS
575 for (i = 0; i < ndevices; i++)
576 if (devices[i].id == id)
577 return &devices[i];
cb2a1a62 578
e7c9f3ff
NS
579 if (resize_devices(NULL) != 0)
580 return NULL;
cb2a1a62
JA
581
582 pdi = &devices[ndevices - 1];
583 pdi->id = id;
584 return pdi;
a718bd37
NS
585}
586
e7c9f3ff
NS
587static char *get_dev_name(struct per_dev_info *pdi, char *buffer, int size)
588{
589 if (pdi->name)
590 snprintf(buffer, size, "%s", pdi->name);
591 else
592 snprintf(buffer, size, "%d,%d", MAJOR(pdi->id), MINOR(pdi->id));
593 return buffer;
594}
595
e7c9f3ff 596static void check_time(struct per_dev_info *pdi, struct blk_io_trace *bit)
cfab07eb
AB
597{
598 unsigned long long this = bit->time;
e7c9f3ff 599 unsigned long long last = pdi->last_reported_time;
cfab07eb 600
e7c9f3ff
NS
601 pdi->backwards = (this < last) ? 'B' : ' ';
602 pdi->last_reported_time = this;
cfab07eb
AB
603}
604
152f6476
JA
605static inline void __account_m(struct io_stats *ios, struct blk_io_trace *t,
606 int rw)
d0ca268b
JA
607{
608 if (rw) {
152f6476
JA
609 ios->mwrites++;
610 ios->qwrite_kb += t->bytes >> 10;
d0ca268b 611 } else {
152f6476
JA
612 ios->mreads++;
613 ios->qread_kb += t->bytes >> 10;
614 }
615}
616
617static inline void account_m(struct blk_io_trace *t, struct per_cpu_info *pci,
618 int rw)
619{
620 __account_m(&pci->io_stats, t, rw);
621
622 if (per_process_stats) {
623 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
624
625 __account_m(ios, t, rw);
d0ca268b
JA
626 }
627}
628
b6076a9b
JA
629static inline void __account_queue(struct io_stats *ios, struct blk_io_trace *t,
630 int rw)
d0ca268b
JA
631{
632 if (rw) {
152f6476
JA
633 ios->qwrites++;
634 ios->qwrite_kb += t->bytes >> 10;
d0ca268b 635 } else {
152f6476
JA
636 ios->qreads++;
637 ios->qread_kb += t->bytes >> 10;
638 }
639}
640
b6076a9b
JA
641static inline void account_queue(struct blk_io_trace *t,
642 struct per_cpu_info *pci, int rw)
152f6476 643{
b6076a9b 644 __account_queue(&pci->io_stats, t, rw);
152f6476
JA
645
646 if (per_process_stats) {
647 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
648
b6076a9b 649 __account_queue(ios, t, rw);
d0ca268b
JA
650 }
651}
652
152f6476 653static inline void __account_c(struct io_stats *ios, int rw, unsigned int bytes)
d0ca268b
JA
654{
655 if (rw) {
152f6476
JA
656 ios->cwrites++;
657 ios->cwrite_kb += bytes >> 10;
d0ca268b 658 } else {
152f6476
JA
659 ios->creads++;
660 ios->cread_kb += bytes >> 10;
661 }
662}
663
664static inline void account_c(struct blk_io_trace *t, struct per_cpu_info *pci,
665 int rw, int bytes)
666{
667 __account_c(&pci->io_stats, rw, bytes);
668
669 if (per_process_stats) {
670 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
671
672 __account_c(ios, rw, bytes);
d0ca268b
JA
673 }
674}
675
b6076a9b
JA
676static inline void __account_issue(struct io_stats *ios, int rw,
677 unsigned int bytes)
afd2d7ad 678{
679 if (rw) {
152f6476
JA
680 ios->iwrites++;
681 ios->iwrite_kb += bytes >> 10;
afd2d7ad 682 } else {
152f6476
JA
683 ios->ireads++;
684 ios->iread_kb += bytes >> 10;
afd2d7ad 685 }
686}
687
b6076a9b
JA
688static inline void account_issue(struct blk_io_trace *t,
689 struct per_cpu_info *pci, int rw)
d0ca268b 690{
b6076a9b 691 __account_issue(&pci->io_stats, rw, t->bytes);
152f6476
JA
692
693 if (per_process_stats) {
694 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
d5396421 695
b6076a9b 696 __account_issue(ios, rw, t->bytes);
152f6476
JA
697 }
698}
699
06639b27
JA
700static inline void __account_unplug(struct io_stats *ios, int timer)
701{
702 if (timer)
703 ios->timer_unplugs++;
704 else
705 ios->io_unplugs++;
706}
707
708static inline void account_unplug(struct blk_io_trace *t,
709 struct per_cpu_info *pci, int timer)
710{
711 __account_unplug(&pci->io_stats, timer);
712
713 if (per_process_stats) {
714 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
715
716 __account_unplug(ios, timer);
717 }
718}
719
ab197ca7
AB
720#define VALID_SPECS "BCDFGMPQRSTU"
721static char *override_format[256];
722static inline int valid_spec(int spec)
152f6476 723{
ab197ca7 724 return strchr(VALID_SPECS, spec) != NULL;
d0ca268b
JA
725}
726
ab197ca7 727static void set_all_format_specs(char *optarg)
d0ca268b 728{
ab197ca7 729 char *p;
d0ca268b 730
ab197ca7
AB
731 for (p = VALID_SPECS; *p; p++)
732 if (override_format[(int)(*p)] == NULL)
733 override_format[(int)(*p)] = strdup(optarg);
734}
d0ca268b 735
ab197ca7
AB
736static int add_format_spec(char *optarg)
737{
738 int spec = optarg[0];
d0ca268b 739
ab197ca7
AB
740 if (!valid_spec(spec)) {
741 fprintf(stderr,"Bad format specifier %c\n", spec);
742 return 1;
743 }
744 if (optarg[1] != ',') {
745 fprintf(stderr,"Bad format specifier - need ',' %s\n", optarg);
746 return 1;
747 }
748 optarg += 2;
749 if (*optarg == '\0') {
750 fprintf(stderr,"Bad format specifier - need fmt %s\n", optarg);
751 return 1;
752 }
753
754 /*
755 * Set both merges (front and back)
756 */
757 if (spec == 'M') {
758 override_format['B'] = strdup(optarg);
759 override_format['M'] = strdup(optarg);
760 } else
761 override_format[spec] = strdup(optarg);
d0ca268b 762
ab197ca7 763 return 0;
d0ca268b
JA
764}
765
ab197ca7
AB
766static void print_field(char *act, struct per_cpu_info *pci,
767 struct blk_io_trace *t, unsigned long long elapsed,
768 int pdu_len, unsigned char *pdu_buf, char field,
769 int minus, int has_w, int width)
d0ca268b 770{
ab197ca7 771 char format[64];
2e3e8ded 772
ab197ca7
AB
773 if (has_w) {
774 if (minus)
775 sprintf(format, "%%-%d", width);
776 else
777 sprintf(format, "%%%d", width);
778 } else
779 sprintf(format, "%%");
780
781 switch (field) {
782 case 'a':
783 fprintf(ofp, strcat(format, "s"), act);
784 break;
785 case 'c':
786 fprintf(ofp, strcat(format, "d"), pci->cpu);
787 break;
788 case 'C':
789 fprintf(ofp, strcat(format, "s"), t->comm);
790 break;
791 case 'd': {
792 char rwbs[4];
793 int i = 0;
794 int w = t->action & BLK_TC_ACT(BLK_TC_WRITE);
795 int b = t->action & BLK_TC_ACT(BLK_TC_BARRIER);
796 int s = t->action & BLK_TC_ACT(BLK_TC_SYNC);
797 if (w)
798 rwbs[i++] = 'W';
799 else
800 rwbs[i++] = 'R';
801 if (b)
802 rwbs[i++] = 'B';
803 if (s)
804 rwbs[i++] = 'S';
805 rwbs[i] = '\0';
806 fprintf(ofp, strcat(format, "s"), rwbs);
807 break;
808 }
809 case 'D': /* format width ignored */
810 fprintf(ofp,"%3d,%-3d", MAJOR(t->device), MINOR(t->device));
811 break;
812 case 'e':
813 fprintf(ofp, strcat(format, "d"), t->error);
814 break;
815 case 'M':
816 fprintf(ofp, strcat(format, "d"), MAJOR(t->device));
817 break;
818 case 'm':
819 fprintf(ofp, strcat(format, "d"), MINOR(t->device));
820 break;
821 case 'n':
822 fprintf(ofp, strcat(format, "u"), t->bytes >> 9);
823 break;
824 case 'p':
825 fprintf(ofp, strcat(format, "u"), t->pid);
826 break;
827 case 'P': /* format width ignored */
828 if ((pdu_len > 0) && (pdu_buf != NULL)) {
829 int i;
830 unsigned char *p = pdu_buf;
831 for (i = 0; i < pdu_len; i++)
832 fprintf(ofp, "%02x ", *p++);
833 }
834 break;
835 case 's':
836 fprintf(ofp, strcat(format, "ld"), t->sequence);
837 break;
838 case 'S':
839 fprintf(ofp, strcat(format, "lu"), t->sector);
840 break;
841 case 't':
842 sprintf(format, "%%0%dlu", has_w ? width : 9);
843 fprintf(ofp, format, NANO_SECONDS(t->time));
844 break;
845 case 'T':
846 fprintf(ofp, strcat(format, "d"), SECONDS(t->time));
847 break;
848 case 'u':
849 if (elapsed == -1ULL) {
850 fprintf(stderr, "Expecting elapsed value\n");
851 exit(1);
852 }
853 fprintf(ofp, strcat(format, "llu"), elapsed / 1000);
854 break;
855 case 'U': {
856 __u64 *depth = (__u64 *) ((char *) t + sizeof(*t));
857 fprintf(ofp, strcat(format, "u"),
858 (unsigned int) be64_to_cpu(*depth));
859 break;
860 }
861 default:
862 fprintf(ofp,strcat(format, "c"), field);
863 break;
864 }
865}
2e3e8ded 866
ab197ca7
AB
867static char *parse_field(char *act, struct per_cpu_info *pci,
868 struct blk_io_trace *t, unsigned long long elapsed,
869 int pdu_len, unsigned char *pdu_buf,
870 char *master_format)
871{
872 int minus = 0;
873 int has_w = 0;
874 int width = 0;
875 char *p = master_format;
876
877 if (*p == '-') {
878 minus = 1;
879 p++;
2e3e8ded 880 }
ab197ca7
AB
881 if (isdigit(*p)) {
882 has_w = 1;
883 do {
884 width = (width * 10) + (*p++ - '0');
885 } while ((*p) && (isdigit(*p)));
886 }
887 if (*p) {
888 print_field(act, pci, t, elapsed, pdu_len, pdu_buf, *p++,
889 minus, has_w, width);
890 }
891 return p;
d0ca268b
JA
892}
893
ab197ca7
AB
894static char *fmt_select(int fmt_spec, struct blk_io_trace *t,
895 unsigned long long elapsed)
d0ca268b 896{
ab197ca7
AB
897 char *fmt;
898 char scratch_format[1024];
2e3e8ded 899
ab197ca7
AB
900 if (override_format[fmt_spec] != NULL)
901 return override_format[fmt_spec];
95c15013 902
ab197ca7
AB
903 switch (fmt_spec) {
904 case 'C': /* Complete */
905 if (t->action & BLK_TC_ACT(BLK_TC_PC)) {
906 strcpy(scratch_format, HEADER);
907 strcat(scratch_format, "%P");
908 } else {
909 strcpy(scratch_format, HEADER "%S + %n ");
910 if (elapsed != -1ULL)
911 strcat(scratch_format, "(%8u) ");
912 }
913 strcat(scratch_format, "[%e]\n");
914 fmt = scratch_format;
915 break;
916
917 case 'D': /* Issue */
918 if (t->action & BLK_TC_ACT(BLK_TC_PC)) {
919 strcpy(scratch_format, HEADER);
920 strcat(scratch_format, "%P");
921 } else {
922 strcpy(scratch_format, HEADER "%S + %n ");
923 if (elapsed != -1ULL)
924 strcat(scratch_format, "(%8u) ");
925 }
926 strcat(scratch_format,"[%C]\n");
927 fmt = scratch_format;
928 break;
929
b6076a9b
JA
930 case 'I': /* Insert */
931 if (t->action & BLK_TC_ACT(BLK_TC_PC)) {
932 strcpy(scratch_format, HEADER);
933 strcat(scratch_format, "%P");
934 } else {
935 strcpy(scratch_format, HEADER "%S + %n ");
936 if (elapsed != -1ULL)
937 strcat(scratch_format, "(%8u) ");
938 }
939 strcat(scratch_format,"[%C]\n");
940 fmt = scratch_format;
941 break;
942
ab197ca7
AB
943 case 'Q': /* Queue */
944 strcpy(scratch_format, HEADER "%S + %n ");
945 if (elapsed != -1ULL)
946 strcat(scratch_format, "(%8u) ");
947 strcat(scratch_format,"[%C]\n");
948 fmt = scratch_format;
949 break;
950
951 case 'B': /* Back merge */
952 case 'F': /* Front merge */
953 case 'M': /* Front or back merge */
954 fmt = HEADER "%S + %n [%C]\n";
955 break;
956
957 case 'P': /* Plug */
958 fmt = HEADER "[%C]\n";
959 break;
960
961 case 'G': /* Get request */
962 case 'S': /* Sleep request */
963 fmt = HEADER "%S + %n [%C]\n";
964 break;
965
966 case 'U': /* Unplug IO */
967 case 'T': /* Unplug timer */
968 fmt = HEADER "[%C] %U\n";
969 break;
970
971 default:
972 fprintf(stderr,"FATAL: Invalid format spec %c\n", fmt_spec);
973 exit(1);
974 /*NOTREACHED*/
95c15013 975 }
ab197ca7
AB
976
977 return fmt;
d0ca268b
JA
978}
979
ab197ca7
AB
980static void process_fmt(char *act, struct per_cpu_info *pci,
981 struct blk_io_trace *t, unsigned long long elapsed,
982 int pdu_len, unsigned char *pdu_buf)
d0ca268b 983{
ab197ca7 984 char *p = fmt_select(act[0], t, elapsed);
2e3e8ded 985
ab197ca7
AB
986 while (*p) {
987 switch (*p) {
988 case '%': /* Field specifier */
989 p++;
990 if (*p == '%')
991 fprintf(ofp, "%c", *p++);
992 else if (!*p)
993 fprintf(ofp, "%c", '%');
994 else
995 p = parse_field(act, pci, t, elapsed,
996 pdu_len, pdu_buf, p);
997 break;
998 case '\\': { /* escape */
999 switch (p[1]) {
1000 case 'b': fprintf(ofp, "\b"); break;
1001 case 'n': fprintf(ofp, "\n"); break;
1002 case 'r': fprintf(ofp, "\r"); break;
1003 case 't': fprintf(ofp, "\t"); break;
1004 default:
1005 fprintf(stderr,
1006 "Invalid escape char in format %c\n",
1007 p[1]);
1008 exit(1);
1009 /*NOTREACHED*/
1010 }
1011 p += 2;
1012 break;
1013 }
1014 default:
1015 fprintf(ofp, "%c", *p++);
1016 break;
1017 }
2e3e8ded 1018 }
ab197ca7
AB
1019}
1020
1021static void log_complete(struct per_cpu_info *pci, struct blk_io_trace *t,
1022 char *act)
1023{
1024 process_fmt(act, pci, t, log_track_complete(t), 0, NULL);
1025}
1026
b6076a9b
JA
1027static void log_insert(struct per_cpu_info *pci, struct blk_io_trace *t,
1028 char *act)
1029{
1030 process_fmt(act, pci, t, log_track_insert(t), 0, NULL);
1031}
1032
ab197ca7
AB
1033static void log_queue(struct per_cpu_info *pci, struct blk_io_trace *t,
1034 char *act)
1035{
b6076a9b 1036 process_fmt(act, pci, t, -1, 0, NULL);
ab197ca7 1037}
2e3e8ded 1038
ab197ca7
AB
1039static void log_issue(struct per_cpu_info *pci, struct blk_io_trace *t,
1040 char *act)
1041{
1042 process_fmt(act, pci, t, log_track_issue(t), 0, NULL);
d0ca268b
JA
1043}
1044
d5396421 1045static void log_merge(struct per_cpu_info *pci, struct blk_io_trace *t,
3639a11e 1046 char *act)
d0ca268b 1047{
a01516de
JA
1048 if (act[0] == 'F')
1049 log_track_frontmerge(t);
2e3e8ded 1050
ab197ca7 1051 process_fmt(act, pci, t, -1ULL, 0, NULL);
d0ca268b
JA
1052}
1053
dfe34da1 1054static void log_action(struct per_cpu_info *pci, struct blk_io_trace *t,
3639a11e 1055 char *act)
dfe34da1 1056{
ab197ca7 1057 process_fmt(act, pci, t, -1ULL, 0, NULL);
dfe34da1
JA
1058}
1059
d5396421 1060static void log_generic(struct per_cpu_info *pci, struct blk_io_trace *t,
3639a11e 1061 char *act)
d0ca268b 1062{
ab197ca7 1063 process_fmt(act, pci, t, -1ULL, 0, NULL);
d0ca268b
JA
1064}
1065
ab197ca7 1066static void log_unplug(struct per_cpu_info *pci, struct blk_io_trace *t,
3639a11e 1067 char *act)
67e14fdc 1068{
ab197ca7 1069 process_fmt(act, pci, t, -1ULL, 0, NULL);
67e14fdc
JA
1070}
1071
ab197ca7 1072static void log_pc(struct per_cpu_info *pci, struct blk_io_trace *t, char *act)
d0ca268b 1073{
ab197ca7 1074 unsigned char *buf = (unsigned char *) t + sizeof(*t);
d0ca268b 1075
ab197ca7 1076 process_fmt(act, pci, t, -1ULL, t->pdu_len, buf);
d0ca268b
JA
1077}
1078
d5396421 1079static int dump_trace_pc(struct blk_io_trace *t, struct per_cpu_info *pci)
d0ca268b 1080{
87b72777
JA
1081 int ret = 0;
1082
d0ca268b
JA
1083 switch (t->action & 0xffff) {
1084 case __BLK_TA_QUEUE:
3639a11e 1085 log_generic(pci, t, "Q");
d0ca268b
JA
1086 break;
1087 case __BLK_TA_GETRQ:
3639a11e 1088 log_generic(pci, t, "G");
d0ca268b
JA
1089 break;
1090 case __BLK_TA_SLEEPRQ:
3639a11e 1091 log_generic(pci, t, "S");
d0ca268b
JA
1092 break;
1093 case __BLK_TA_REQUEUE:
3639a11e 1094 log_generic(pci, t, "R");
d0ca268b
JA
1095 break;
1096 case __BLK_TA_ISSUE:
ab197ca7 1097 log_pc(pci, t, "D");
d0ca268b
JA
1098 break;
1099 case __BLK_TA_COMPLETE:
3639a11e 1100 log_pc(pci, t, "C");
d0ca268b
JA
1101 break;
1102 default:
1103 fprintf(stderr, "Bad pc action %x\n", t->action);
87b72777
JA
1104 ret = 1;
1105 break;
d0ca268b
JA
1106 }
1107
ab197ca7 1108 return 0;
d0ca268b
JA
1109}
1110
d5396421 1111static void dump_trace_fs(struct blk_io_trace *t, struct per_cpu_info *pci)
d0ca268b
JA
1112{
1113 int w = t->action & BLK_TC_ACT(BLK_TC_WRITE);
7997c5b0 1114 int act = t->action & 0xffff;
d0ca268b 1115
7997c5b0 1116 switch (act) {
d0ca268b 1117 case __BLK_TA_QUEUE:
b6076a9b 1118 account_queue(t, pci, w);
3639a11e 1119 log_queue(pci, t, "Q");
d0ca268b 1120 break;
b6076a9b
JA
1121 case __BLK_TA_INSERT:
1122 log_insert(pci, t, "I");
1123 break;
d0ca268b 1124 case __BLK_TA_BACKMERGE:
152f6476 1125 account_m(t, pci, w);
3639a11e 1126 log_merge(pci, t, "M");
d0ca268b
JA
1127 break;
1128 case __BLK_TA_FRONTMERGE:
152f6476 1129 account_m(t, pci, w);
3639a11e 1130 log_merge(pci, t, "F");
d0ca268b
JA
1131 break;
1132 case __BLK_TA_GETRQ:
95c15013 1133 log_track_getrq(t);
3639a11e 1134 log_generic(pci, t, "G");
d0ca268b
JA
1135 break;
1136 case __BLK_TA_SLEEPRQ:
3639a11e 1137 log_generic(pci, t, "S");
d0ca268b
JA
1138 break;
1139 case __BLK_TA_REQUEUE:
152f6476 1140 account_c(t, pci, w, -t->bytes);
3639a11e 1141 log_queue(pci, t, "R");
d0ca268b
JA
1142 break;
1143 case __BLK_TA_ISSUE:
b6076a9b 1144 account_issue(t, pci, w);
3639a11e 1145 log_issue(pci, t, "D");
d0ca268b
JA
1146 break;
1147 case __BLK_TA_COMPLETE:
152f6476 1148 account_c(t, pci, w, t->bytes);
3639a11e 1149 log_complete(pci, t, "C");
d0ca268b 1150 break;
88b1a526 1151 case __BLK_TA_PLUG:
3639a11e 1152 log_action(pci, t, "P");
88b1a526 1153 break;
3639a11e 1154 case __BLK_TA_UNPLUG_IO:
06639b27 1155 account_unplug(t, pci, 0);
3639a11e
JA
1156 log_unplug(pci, t, "U");
1157 break;
1158 case __BLK_TA_UNPLUG_TIMER:
06639b27 1159 account_unplug(t, pci, 1);
3639a11e 1160 log_unplug(pci, t, "UT");
88b1a526 1161 break;
d0ca268b
JA
1162 default:
1163 fprintf(stderr, "Bad fs action %x\n", t->action);
1f79c4a0 1164 break;
d0ca268b 1165 }
d0ca268b
JA
1166}
1167
e7c9f3ff
NS
1168static int dump_trace(struct blk_io_trace *t, struct per_cpu_info *pci,
1169 struct per_dev_info *pdi)
d0ca268b 1170{
87b72777
JA
1171 int ret = 0;
1172
d0ca268b 1173 if (t->action & BLK_TC_ACT(BLK_TC_PC))
d5396421 1174 ret = dump_trace_pc(t, pci);
d0ca268b 1175 else
d5396421 1176 dump_trace_fs(t, pci);
87b72777 1177
e7c9f3ff 1178 pdi->events++;
87b72777 1179 return ret;
d0ca268b
JA
1180}
1181
152f6476 1182static void dump_io_stats(struct io_stats *ios, char *msg)
5c017e4b 1183{
152f6476
JA
1184 fprintf(ofp, "%s\n", msg);
1185
1186 fprintf(ofp, " Reads Queued: %'8lu, %'8LuKiB\t", ios->qreads, ios->qread_kb);
1187 fprintf(ofp, " Writes Queued: %'8lu, %'8LuKiB\n", ios->qwrites,ios->qwrite_kb);
0a6b8fc4 1188
152f6476
JA
1189 fprintf(ofp, " Read Dispatches: %'8lu, %'8LuKiB\t", ios->ireads, ios->iread_kb);
1190 fprintf(ofp, " Write Dispatches: %'8lu, %'8LuKiB\n", ios->iwrites,ios->iwrite_kb);
1191 fprintf(ofp, " Reads Completed: %'8lu, %'8LuKiB\t", ios->creads, ios->cread_kb);
1192 fprintf(ofp, " Writes Completed: %'8lu, %'8LuKiB\n", ios->cwrites,ios->cwrite_kb);
1193 fprintf(ofp, " Read Merges: %'8lu%8c\t", ios->mreads, ' ');
152f6476 1194 fprintf(ofp, " Write Merges: %'8lu\n", ios->mwrites);
06639b27
JA
1195 fprintf(ofp, " IO unplugs: %'8lu%8c\t", ios->io_unplugs, ' ');
1196 fprintf(ofp, " Timer unplugs: %'8lu\n", ios->timer_unplugs);
5c017e4b
JA
1197}
1198
50adc0ba
JA
1199static void dump_wait_stats(struct per_process_info *ppi)
1200{
b9d40d6f
JA
1201 unsigned long rawait = ppi->longest_allocation_wait[0] / 1000;
1202 unsigned long rdwait = ppi->longest_dispatch_wait[0] / 1000;
1203 unsigned long rcwait = ppi->longest_completion_wait[0] / 1000;
1204 unsigned long wawait = ppi->longest_allocation_wait[1] / 1000;
1205 unsigned long wdwait = ppi->longest_dispatch_wait[1] / 1000;
1206 unsigned long wcwait = ppi->longest_completion_wait[1] / 1000;
1207
1208 fprintf(ofp, " Allocation wait: %'8lu%8c\t", rawait, ' ');
1209 fprintf(ofp, " Allocation wait: %'8lu\n", wawait);
1210 fprintf(ofp, " Dispatch wait: %'8lu%8c\t", rdwait, ' ');
1211 fprintf(ofp, " Dispatch wait: %'8lu\n", wdwait);
1212 fprintf(ofp, " Completion wait: %'8lu%8c\t", rcwait, ' ');
1213 fprintf(ofp, " Completion wait: %'8lu\n", wcwait);
50adc0ba
JA
1214}
1215
886ecf0e
JA
1216static int ppi_name_compare(const void *p1, const void *p2)
1217{
1218 struct per_process_info *ppi1 = *((struct per_process_info **) p1);
1219 struct per_process_info *ppi2 = *((struct per_process_info **) p2);
1220 int res;
1221
1222 res = strverscmp(ppi1->name, ppi2->name);
1223 if (!res)
06e6f286 1224 res = ppi1->pid > ppi2->pid;
886ecf0e
JA
1225
1226 return res;
1227}
1228
1229static void sort_process_list(void)
1230{
1231 struct per_process_info **ppis;
1232 struct per_process_info *ppi;
1233 int i = 0;
1234
1235 ppis = malloc(ppi_list_entries * sizeof(struct per_process_info *));
1236
1237 ppi = ppi_list;
1238 while (ppi) {
06e6f286 1239 ppis[i++] = ppi;
886ecf0e
JA
1240 ppi = ppi->list_next;
1241 }
1242
06e6f286 1243 qsort(ppis, ppi_list_entries, sizeof(ppi), ppi_name_compare);
886ecf0e
JA
1244
1245 i = ppi_list_entries - 1;
1246 ppi_list = NULL;
1247 while (i >= 0) {
1248 ppi = ppis[i];
1249
1250 ppi->list_next = ppi_list;
1251 ppi_list = ppi;
1252 i--;
1253 }
1254}
1255
152f6476
JA
1256static void show_process_stats(void)
1257{
1258 struct per_process_info *ppi;
1259
886ecf0e
JA
1260 sort_process_list();
1261
152f6476
JA
1262 ppi = ppi_list;
1263 while (ppi) {
ce8b6b4f
JA
1264 char name[64];
1265
1266 snprintf(name, sizeof(name)-1, "%s (%u)", ppi->name, ppi->pid);
1267 dump_io_stats(&ppi->io_stats, name);
50adc0ba 1268 dump_wait_stats(ppi);
152f6476
JA
1269 ppi = ppi->list_next;
1270 }
1271
1272 fprintf(ofp, "\n");
1273}
1274
e7c9f3ff 1275static void show_device_and_cpu_stats(void)
d0ca268b 1276{
e7c9f3ff
NS
1277 struct per_dev_info *pdi;
1278 struct per_cpu_info *pci;
1279 struct io_stats total, *ios;
1280 int i, j, pci_events;
1281 char line[3 + 8/*cpu*/ + 2 + 32/*dev*/ + 3];
1282 char name[32];
1283
1284 for (pdi = devices, i = 0; i < ndevices; i++, pdi++) {
1285
1286 memset(&total, 0, sizeof(total));
1287 pci_events = 0;
1288
1289 if (i > 0)
1290 fprintf(ofp, "\n");
1291
1292 for (pci = pdi->cpus, j = 0; j < pdi->ncpus; j++, pci++) {
1293 if (!pci->nelems)
1294 continue;
1295
1296 ios = &pci->io_stats;
1297 total.qreads += ios->qreads;
1298 total.qwrites += ios->qwrites;
1299 total.creads += ios->creads;
1300 total.cwrites += ios->cwrites;
1301 total.mreads += ios->mreads;
1302 total.mwrites += ios->mwrites;
1303 total.ireads += ios->ireads;
1304 total.iwrites += ios->iwrites;
1305 total.qread_kb += ios->qread_kb;
1306 total.qwrite_kb += ios->qwrite_kb;
1307 total.cread_kb += ios->cread_kb;
1308 total.cwrite_kb += ios->cwrite_kb;
1309 total.iread_kb += ios->iread_kb;
1310 total.iwrite_kb += ios->iwrite_kb;
06639b27
JA
1311 total.timer_unplugs += ios->timer_unplugs;
1312 total.io_unplugs += ios->io_unplugs;
e7c9f3ff
NS
1313
1314 snprintf(line, sizeof(line) - 1, "CPU%d (%s):",
1315 j, get_dev_name(pdi, name, sizeof(name)));
1316 dump_io_stats(ios, line);
1317 pci_events++;
1318 }
5c017e4b 1319
e7c9f3ff
NS
1320 if (pci_events > 1) {
1321 fprintf(ofp, "\n");
1322 snprintf(line, sizeof(line) - 1, "Total (%s):",
1323 get_dev_name(pdi, name, sizeof(name)));
1324 dump_io_stats(&total, line);
1325 }
d0ca268b 1326
0aa8a62d
JA
1327 fprintf(ofp, "\nEvents (%s): %'Lu entries, %'lu skips\n",
1328 get_dev_name(pdi, line, sizeof(line)), pdi->events,
1329 pdi->skips);
e7c9f3ff 1330 }
d0ca268b
JA
1331}
1332
cb2a1a62 1333static struct blk_io_trace *find_trace(void *p, unsigned long offset)
2ff323b0 1334{
cb2a1a62 1335 unsigned long max_offset = offset;
2ff323b0
JA
1336 unsigned long off;
1337 struct blk_io_trace *bit;
1338 __u32 magic;
1339
1340 for (off = 0; off < max_offset; off++) {
1341 bit = p + off;
1342
1343 magic = be32_to_cpu(bit->magic);
1344 if ((magic & 0xffffff00) == BLK_IO_TRACE_MAGIC)
1345 return bit;
1346 }
1347
1348 return NULL;
1349}
1350
cb2a1a62 1351static int sort_entries(void)
8fc0abbc 1352{
e7c9f3ff 1353 struct per_dev_info *pdi;
412819ce 1354 struct per_cpu_info *pci;
8fc0abbc
JA
1355 struct blk_io_trace *bit;
1356 struct trace *t;
cb2a1a62 1357 int nr = 0;
8fc0abbc 1358
cb2a1a62 1359 while ((t = trace_list) != NULL) {
412819ce 1360
cb2a1a62
JA
1361 trace_list = t->next;
1362 bit = t->bit;
6fe4709e 1363
8fc0abbc
JA
1364 memset(&t->rb_node, 0, sizeof(t->rb_node));
1365
cb2a1a62 1366 if (verify_trace(bit))
66fa7233 1367 break;
cb2a1a62
JA
1368 if (trace_rb_insert(t))
1369 return -1;
66fa7233 1370
cb2a1a62
JA
1371 pdi = get_dev_info(bit->device);
1372 pci = get_cpu_info(pdi, bit->cpu);
412819ce
JA
1373 pci->nelems++;
1374
cb2a1a62 1375 nr++;
6fe4709e 1376 }
8fc0abbc 1377
cb2a1a62 1378 return nr;
412819ce
JA
1379}
1380
f1327cc5 1381static void show_entries_rb(int piped)
8fc0abbc 1382{
e7c9f3ff 1383 struct per_dev_info *pdi;
8fc0abbc 1384 struct blk_io_trace *bit;
3aabcd89 1385 struct rb_node *n;
8fc0abbc
JA
1386 struct trace *t;
1387 int cpu;
1388
cb2a1a62 1389 while ((n = rb_first(&rb_sort_root)) != NULL) {
8fc0abbc
JA
1390
1391 t = rb_entry(n, struct trace, rb_node);
1392 bit = t->bit;
1393
cb2a1a62 1394 pdi = get_dev_info(bit->device);
e7c9f3ff
NS
1395 if (!pdi) {
1396 fprintf(stderr, "Unknown device ID? (%d,%d)\n",
1397 MAJOR(bit->device), MINOR(bit->device));
1398 break;
1399 }
d5396421 1400 cpu = bit->cpu;
e7c9f3ff
NS
1401 if (cpu > pdi->ncpus) {
1402 fprintf(stderr, "Unknown CPU ID? (%d, device %d,%d)\n",
1403 cpu, MAJOR(bit->device), MINOR(bit->device));
87b72777 1404 break;
8fc0abbc
JA
1405 }
1406
cb2a1a62
JA
1407 /*
1408 * back off displaying more info if we are out of sync
1409 * on SMP systems. to prevent stalling on lost events,
0aa8a62d 1410 * only allow an event to us a few times
cb2a1a62 1411 */
a3983763
JA
1412 if (bit->sequence != (pdi->last_sequence + 1)) {
1413 if (piped && t->skipped < 5) {
0aa8a62d 1414 t->skipped++;
cb2a1a62 1415 break;
0aa8a62d
JA
1416 } else {
1417 fprintf(stderr, "skipping from %lu to %u\n", pdi->last_sequence, bit->sequence);
1418 pdi->skips++;
cb2a1a62
JA
1419 }
1420 }
1421
1422 pdi->last_sequence = bit->sequence;
1423
cfab07eb 1424 bit->time -= genesis_time;
46e6968b
NS
1425 if (bit->time < stopwatch_start)
1426 continue;
1427 if (bit->time >= stopwatch_end)
1428 break;
8fc0abbc 1429
e7c9f3ff 1430 check_time(pdi, bit);
8fc0abbc 1431
e7c9f3ff 1432 if (dump_trace(bit, &pdi->cpus[cpu], pdi))
87b72777
JA
1433 break;
1434
cb2a1a62 1435 rb_erase(&t->rb_node, &rb_sort_root);
e8741a4a 1436
f1327cc5 1437 if (piped) {
e8741a4a
JA
1438 free(bit);
1439 free(t);
1440 }
cb2a1a62 1441 }
8fc0abbc
JA
1442}
1443
1f79c4a0
JA
1444static int read_data(int fd, void *buffer, int bytes, int block)
1445{
1446 int ret, bytes_left, fl;
1447 void *p;
1448
1449 fl = fcntl(fd, F_GETFL);
1450
1451 if (!block)
1452 fcntl(fd, F_SETFL, fl | O_NONBLOCK);
1453 else
1454 fcntl(fd, F_SETFL, fl & ~O_NONBLOCK);
1455
1456 bytes_left = bytes;
1457 p = buffer;
1458 while (bytes_left > 0) {
1459 ret = read(fd, p, bytes_left);
1460 if (!ret)
1461 return 1;
1462 else if (ret < 0) {
1463 if (errno != EAGAIN)
1464 perror("read");
1465 return -1;
1466 } else {
1467 p += ret;
1468 bytes_left -= ret;
1469 }
1470 }
1471
1472 return 0;
1473}
1474
cb2a1a62
JA
1475/*
1476 * Find the traces in 'tb' and add them to the list for sorting and
1477 * displaying
1478 */
1479static int find_entries(void *tb, unsigned long size)
1480{
1481 struct blk_io_trace *bit;
1482 struct trace *t;
1483 void *start = tb;
1484
1485 while (tb - start <= size - sizeof(*bit)) {
1486 bit = find_trace(tb, size - (tb - start));
1487 if (!bit)
1488 break;
1489
1490 t = malloc(sizeof(*t));
1491 memset(t, 0, sizeof(*t));
1492 t->bit = bit;
1493
f1327cc5 1494 trace_to_cpu(bit);
cb2a1a62
JA
1495 t->next = trace_list;
1496 trace_list = t;
1497
1498 tb += sizeof(*bit) + bit->pdu_len;
1499 }
1500
1501 return 0;
1502}
1503
d5396421 1504static int do_file(void)
d0ca268b 1505{
cb2a1a62 1506 int i, j, nfiles = 0, nelems;
d0ca268b 1507
e8741a4a 1508 for (i = 0; i < ndevices; i++) {
e7c9f3ff 1509 for (j = 0;; j++, nfiles++) {
e8741a4a 1510 struct per_dev_info *pdi;
e7c9f3ff
NS
1511 struct per_cpu_info *pci;
1512 struct stat st;
1513 void *tb;
87b72777 1514
e8741a4a 1515 pdi = &devices[i];
e7c9f3ff
NS
1516 pci = get_cpu_info(pdi, j);
1517 pci->cpu = j;
d0ca268b 1518
e7c9f3ff 1519 snprintf(pci->fname, sizeof(pci->fname)-1,
9f6486bd 1520 "%s.blktrace.%d", pdi->name, j);
e7c9f3ff
NS
1521 if (stat(pci->fname, &st) < 0)
1522 break;
1523 if (!st.st_size)
1524 continue;
1525
1526 printf("Processing %s\n", pci->fname);
1527
1528 tb = malloc(st.st_size);
1529 if (!tb) {
1530 fprintf(stderr, "Out of memory, skip file %s\n",
1531 pci->fname);
1532 continue;
1533 }
1534
1535 pci->fd = open(pci->fname, O_RDONLY);
1536 if (pci->fd < 0) {
1537 perror(pci->fname);
1538 free(tb);
1539 continue;
1540 }
1541
1542 if (read_data(pci->fd, tb, st.st_size, 1)) {
1543 close(pci->fd);
1544 free(tb);
1545 continue;
1546 }
1547
cb2a1a62
JA
1548 if (find_entries(tb, st.st_size)) {
1549 close(pci->fd);
1550 free(tb);
1551 }
1552
1553 nelems = sort_entries();
1554 if (nelems == -1) {
e7c9f3ff
NS
1555 close(pci->fd);
1556 free(tb);
1557 continue;
1558 }
1559
1560 printf("Completed %s (CPU%d %d, entries)\n",
cb2a1a62 1561 pci->fname, j, nelems);
e7c9f3ff 1562 close(pci->fd);
d0ca268b 1563 }
d5396421
JA
1564 }
1565
1566 if (!nfiles) {
1567 fprintf(stderr, "No files found\n");
1568 return 1;
1569 }
1570
e8741a4a 1571 show_entries_rb(0);
d5396421
JA
1572 return 0;
1573}
1574
cb2a1a62 1575static int read_sort_events(int fd)
d5396421 1576{
cb2a1a62 1577 int events = 0;
d5396421 1578
412819ce 1579 do {
cb2a1a62
JA
1580 struct blk_io_trace *bit;
1581 struct trace *t;
412819ce 1582 int pdu_len;
51128a28 1583 __u32 magic;
d5396421 1584
cb2a1a62 1585 bit = malloc(sizeof(*bit));
d5396421 1586
cb2a1a62 1587 if (read_data(fd, bit, sizeof(*bit), !events))
c80c18a7 1588 break;
d5396421 1589
cb2a1a62 1590 magic = be32_to_cpu(bit->magic);
51128a28
JA
1591 if ((magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
1592 fprintf(stderr, "Bad magic %x\n", magic);
1593 break;
1594 }
1595
cb2a1a62 1596 pdu_len = be16_to_cpu(bit->pdu_len);
2ff323b0 1597 if (pdu_len) {
cb2a1a62 1598 void *ptr = realloc(bit, sizeof(*bit) + pdu_len);
d5396421 1599
cb2a1a62 1600 if (read_data(fd, ptr + sizeof(*bit), pdu_len, 1))
2ff323b0 1601 break;
d5396421 1602
cb2a1a62 1603 bit = ptr;
2ff323b0 1604 }
d5396421 1605
cb2a1a62
JA
1606 t = malloc(sizeof(*t));
1607 memset(t, 0, sizeof(*t));
1608 t->bit = bit;
f1327cc5
JA
1609
1610 trace_to_cpu(bit);
cb2a1a62
JA
1611 t->next = trace_list;
1612 trace_list = t;
1613
412819ce 1614 events++;
79f19470 1615 } while (!is_done() && events < rb_batch);
d5396421 1616
412819ce
JA
1617 return events;
1618}
d5396421 1619
412819ce
JA
1620static int do_stdin(void)
1621{
1622 int fd;
d5396421 1623
1f79c4a0 1624 fd = dup(STDIN_FILENO);
412819ce
JA
1625 do {
1626 int events;
d5396421 1627
cb2a1a62 1628 events = read_sort_events(fd);
412819ce
JA
1629 if (!events)
1630 break;
1631
cb2a1a62 1632 if (sort_entries() == -1)
2ff323b0
JA
1633 break;
1634
e8741a4a 1635 show_entries_rb(1);
d5396421
JA
1636 } while (1);
1637
1638 close(fd);
d5396421
JA
1639 return 0;
1640}
d0ca268b 1641
1f79c4a0 1642static void flush_output(void)
412819ce 1643{
152f6476 1644 fflush(ofp);
412819ce
JA
1645}
1646
1f79c4a0 1647static void handle_sigint(int sig)
412819ce
JA
1648{
1649 done = 1;
1650 flush_output();
1651}
1652
46e6968b
NS
1653/*
1654 * Extract start and duration times from a string, allowing
1655 * us to specify a time interval of interest within a trace.
1656 * Format: "duration" (start is zero) or "start:duration".
1657 */
1658static int find_stopwatch_interval(char *string)
1659{
1660 double value;
1661 char *sp;
1662
1663 value = strtod(string, &sp);
1664 if (sp == string) {
1665 fprintf(stderr,"Invalid stopwatch timer: %s\n", string);
1666 return 1;
1667 }
1668 if (*sp == ':') {
1669 stopwatch_start = DOUBLE_TO_NANO_ULL(value);
1670 string = sp + 1;
1671 value = strtod(string, &sp);
1672 if (sp == string || *sp != '\0') {
1673 fprintf(stderr,"Invalid stopwatch duration time: %s\n",
1674 string);
1675 return 1;
1676 }
1677 } else if (*sp != '\0') {
1678 fprintf(stderr,"Invalid stopwatch start timer: %s\n", string);
1679 return 1;
1680 }
1681 stopwatch_end = stopwatch_start + DOUBLE_TO_NANO_ULL(value);
1682 return 0;
1683}
1684
1f79c4a0
JA
1685static void usage(char *prog)
1686{
46e6968b
NS
1687 fprintf(stderr, "Usage: %s "
1688 "[-i <name>] [-o <output>] [-s] [-w N[:n]] <name>...\n",
1689 prog);
1f79c4a0
JA
1690}
1691
d5396421
JA
1692int main(int argc, char *argv[])
1693{
152f6476 1694 char *ofp_buffer;
a66877e6 1695 int c, ret, mode;
1e1c60f1 1696 int per_device_and_cpu_stats = 1;
d5396421
JA
1697
1698 while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) {
1699 switch (c) {
1700 case 'i':
e7c9f3ff
NS
1701 if (!strcmp(optarg, "-") && !pipeline)
1702 pipeline = 1;
1703 else if (resize_devices(optarg) != 0)
1704 return 1;
d5396421
JA
1705 break;
1706 case 'o':
66efebf8 1707 output_name = optarg;
d5396421 1708 break;
79f19470
JA
1709 case 'b':
1710 rb_batch = atoi(optarg);
1711 if (rb_batch <= 0)
1712 rb_batch = RB_BATCH_DEFAULT;
1713 break;
152f6476
JA
1714 case 's':
1715 per_process_stats = 1;
1716 break;
7997c5b0
JA
1717 case 't':
1718 track_ios = 1;
1719 break;
1e1c60f1
NS
1720 case 'q':
1721 per_device_and_cpu_stats = 0;
1722 break;
46e6968b
NS
1723 case 'w':
1724 if (find_stopwatch_interval(optarg) != 0)
1725 return 1;
1726 break;
ab197ca7
AB
1727 case 'f':
1728 set_all_format_specs(optarg);
1729 break;
1730 case 'F':
1731 if (add_format_spec(optarg) != 0)
1732 return 1;
1733 break;
d5396421 1734 default:
1f79c4a0 1735 usage(argv[0]);
d5396421
JA
1736 return 1;
1737 }
d0ca268b
JA
1738 }
1739
e7c9f3ff
NS
1740 while (optind < argc) {
1741 if (!strcmp(argv[optind], "-") && !pipeline)
1742 pipeline = 1;
1743 else if (resize_devices(argv[optind]) != 0)
1744 return 1;
1745 optind++;
1746 }
1747
1748 if (!pipeline && !ndevices) {
1f79c4a0 1749 usage(argv[0]);
d5396421
JA
1750 return 1;
1751 }
1752
7997c5b0
JA
1753 memset(&rb_sort_root, 0, sizeof(rb_sort_root));
1754 memset(&rb_track_root, 0, sizeof(rb_track_root));
412819ce
JA
1755
1756 signal(SIGINT, handle_sigint);
1757 signal(SIGHUP, handle_sigint);
1758 signal(SIGTERM, handle_sigint);
d5396421 1759
d69db225
JA
1760 setlocale(LC_NUMERIC, "en_US");
1761
a66877e6 1762 if (!output_name) {
152f6476 1763 ofp = fdopen(STDOUT_FILENO, "w");
a66877e6
JA
1764 mode = _IOLBF;
1765 } else {
152f6476
JA
1766 char ofname[128];
1767
1768 snprintf(ofname, sizeof(ofname) - 1, "%s.log", output_name);
1769 ofp = fopen(ofname, "w");
a66877e6 1770 mode = _IOFBF;
152f6476
JA
1771 }
1772
1773 if (!ofp) {
1774 perror("fopen");
1775 return 1;
1776 }
1777
1778 ofp_buffer = malloc(4096);
a66877e6 1779 if (setvbuf(ofp, ofp_buffer, mode, 4096)) {
152f6476
JA
1780 perror("setvbuf");
1781 return 1;
1782 }
1783
e7c9f3ff 1784 if (pipeline)
d5396421
JA
1785 ret = do_stdin();
1786 else
1787 ret = do_file();
1788
152f6476
JA
1789 if (per_process_stats)
1790 show_process_stats();
1791
1e1c60f1
NS
1792 if (per_device_and_cpu_stats)
1793 show_device_and_cpu_stats();
152f6476 1794
412819ce 1795 flush_output();
d5396421 1796 return ret;
d0ca268b 1797}