[PATCH] kernel: move the plug/unplug logging around a little
[blktrace.git] / blkparse.c
CommitLineData
d956a2cd
JA
1/*
2 * block queue tracing parse application
3 *
4 * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *
20 */
d0ca268b
JA
21#include <sys/types.h>
22#include <sys/stat.h>
23#include <unistd.h>
24#include <stdio.h>
25#include <fcntl.h>
26#include <stdlib.h>
8fc0abbc 27#include <string.h>
d5396421 28#include <getopt.h>
412819ce
JA
29#include <errno.h>
30#include <signal.h>
d69db225 31#include <locale.h>
46e6968b 32#include <limits.h>
d0ca268b 33
8fc0abbc
JA
34#include "blktrace.h"
35#include "rbtree.h"
bf0720af 36#include "jhash.h"
d0ca268b 37
52724a0e
JA
38static char blkparse_version[] = "0.90";
39
e7c9f3ff
NS
40struct per_dev_info {
41 dev_t id;
42 char *name;
43
44 int backwards;
45 unsigned long long events;
46 unsigned long long last_reported_time;
287fa3d6 47 unsigned long long last_read_time;
e7c9f3ff 48 struct io_stats io_stats;
cb2a1a62 49 unsigned long last_sequence;
dcf0f7ed 50 unsigned long skips;
e7c9f3ff 51
2a1b3424
JA
52 struct rb_root rb_last;
53 unsigned long rb_last_entries;
54
73877e12 55 int nfiles;
e7c9f3ff
NS
56 int ncpus;
57 struct per_cpu_info *cpus;
58};
59
152f6476
JA
60struct per_process_info {
61 char name[16];
62 __u32 pid;
63 struct io_stats io_stats;
64 struct per_process_info *hash_next, *list_next;
715d8021 65 int more_than_one;
50adc0ba
JA
66
67 /*
68 * individual io stats
69 */
b9d40d6f
JA
70 unsigned long long longest_allocation_wait[2];
71 unsigned long long longest_dispatch_wait[2];
72 unsigned long long longest_completion_wait[2];
d0ca268b
JA
73};
74
152f6476 75#define PPI_HASH_SHIFT (8)
bf0720af
JA
76#define PPI_HASH_SIZE (1 << PPI_HASH_SHIFT)
77#define PPI_HASH_MASK (PPI_HASH_SIZE - 1)
78static struct per_process_info *ppi_hash_table[PPI_HASH_SIZE];
152f6476 79static struct per_process_info *ppi_list;
886ecf0e 80static int ppi_list_entries;
152f6476 81
bf0720af 82#define S_OPTS "i:o:b:stqw:f:F:vn"
d5396421
JA
83static struct option l_opts[] = {
84 {
85 .name = "input",
428683db 86 .has_arg = required_argument,
d5396421
JA
87 .flag = NULL,
88 .val = 'i'
89 },
90 {
91 .name = "output",
428683db 92 .has_arg = required_argument,
d5396421
JA
93 .flag = NULL,
94 .val = 'o'
95 },
79f19470
JA
96 {
97 .name = "batch",
428683db 98 .has_arg = required_argument,
79f19470
JA
99 .flag = NULL,
100 .val = 'b'
101 },
152f6476
JA
102 {
103 .name = "per program stats",
428683db 104 .has_arg = no_argument,
152f6476
JA
105 .flag = NULL,
106 .val = 's'
107 },
7997c5b0
JA
108 {
109 .name = "track ios",
428683db 110 .has_arg = no_argument,
7997c5b0
JA
111 .flag = NULL,
112 .val = 't'
113 },
1e1c60f1
NS
114 {
115 .name = "quiet",
428683db 116 .has_arg = no_argument,
1e1c60f1
NS
117 .flag = NULL,
118 .val = 'q'
119 },
46e6968b
NS
120 {
121 .name = "stopwatch",
428683db 122 .has_arg = required_argument,
46e6968b
NS
123 .flag = NULL,
124 .val = 'w'
125 },
ab197ca7
AB
126 {
127 .name = "format",
428683db 128 .has_arg = required_argument,
ab197ca7
AB
129 .flag = NULL,
130 .val = 'f'
131 },
132 {
133 .name = "format-spec",
428683db 134 .has_arg = required_argument,
ab197ca7
AB
135 .flag = NULL,
136 .val = 'F'
137 },
bf0720af
JA
138 {
139 .name = "hash by name",
140 .has_arg = no_argument,
141 .flag = NULL,
142 .val = 'n'
143 },
52724a0e
JA
144 {
145 .name = "version",
146 .has_arg = no_argument,
147 .flag = NULL,
148 .val = 'v'
149 },
d5396421
JA
150};
151
7997c5b0
JA
152/*
153 * for sorting the displayed output
154 */
8fc0abbc
JA
155struct trace {
156 struct blk_io_trace *bit;
157 struct rb_node rb_node;
cb2a1a62 158 struct trace *next;
8fc0abbc
JA
159};
160
cb2a1a62 161static struct rb_root rb_sort_root;
a649216c
JA
162static unsigned long rb_sort_entries;
163
cb2a1a62
JA
164static struct rb_root rb_track_root;
165
166static struct trace *trace_list;
167
d36421e4
JA
168/*
169 * allocation cache
170 */
171static struct blk_io_trace *bit_alloc_list;
172static struct trace *t_alloc_list;
173
7997c5b0
JA
174/*
175 * for tracking individual ios
176 */
177struct io_track {
178 struct rb_node rb_node;
179
e7c9f3ff 180 dev_t device;
7997c5b0
JA
181 __u64 sector;
182 __u32 pid;
bf0720af 183 char comm[16];
95c15013 184 unsigned long long allocation_time;
7997c5b0
JA
185 unsigned long long queue_time;
186 unsigned long long dispatch_time;
187 unsigned long long completion_time;
188};
189
e7c9f3ff
NS
190static int ndevices;
191static struct per_dev_info *devices;
192static char *get_dev_name(struct per_dev_info *, char *, int);
d0ca268b 193
71d5d4c9 194FILE *ofp = NULL;
e7c9f3ff
NS
195static char *output_name;
196
197static unsigned long long genesis_time;
287fa3d6 198static unsigned long long last_allowed_time;
46e6968b
NS
199static unsigned long long stopwatch_start; /* start from zero by default */
200static unsigned long long stopwatch_end = ULONG_LONG_MAX; /* "infinity" */
152f6476
JA
201
202static int per_process_stats;
7997c5b0 203static int track_ios;
bf0720af 204static int ppi_hash_by_pid = 1;
d0ca268b 205
7d747d22 206#define RB_BATCH_DEFAULT (512)
79f19470
JA
207static int rb_batch = RB_BATCH_DEFAULT;
208
e7c9f3ff
NS
209static int pipeline;
210
412819ce
JA
211#define is_done() (*(volatile int *)(&done))
212static volatile int done;
213
bf0720af
JA
214#define JHASH_RANDOM (0x3af5f2ee)
215
216static inline int ppi_hash_pid(__u32 pid)
217{
218 return jhash_1word(pid, JHASH_RANDOM) & PPI_HASH_MASK;
219}
220
221static inline int ppi_hash_name(const char *name)
152f6476 222{
bf0720af
JA
223 return jhash(name, 16, JHASH_RANDOM) & PPI_HASH_MASK;
224}
225
226static inline int ppi_hash(struct per_process_info *ppi)
227{
228 if (ppi_hash_by_pid)
229 return ppi_hash_pid(ppi->pid);
230
bf0720af 231 return ppi_hash_name(ppi->name);
152f6476
JA
232}
233
234static inline void add_process_to_hash(struct per_process_info *ppi)
235{
bf0720af 236 const int hash_idx = ppi_hash(ppi);
152f6476 237
bf0720af
JA
238 ppi->hash_next = ppi_hash_table[hash_idx];
239 ppi_hash_table[hash_idx] = ppi;
152f6476
JA
240}
241
242static inline void add_process_to_list(struct per_process_info *ppi)
243{
244 ppi->list_next = ppi_list;
245 ppi_list = ppi;
886ecf0e 246 ppi_list_entries++;
152f6476
JA
247}
248
bf0720af
JA
249static struct per_process_info *find_process_by_name(char *name)
250{
251 const int hash_idx = ppi_hash_name(name);
252 struct per_process_info *ppi;
253
254 ppi = ppi_hash_table[hash_idx];
255 while (ppi) {
256 if (!strcmp(ppi->name, name))
257 return ppi;
258
259 ppi = ppi->hash_next;
260 }
261
262 return NULL;
263}
264
152f6476
JA
265static struct per_process_info *find_process_by_pid(__u32 pid)
266{
bf0720af 267 const int hash_idx = ppi_hash_pid(pid);
152f6476
JA
268 struct per_process_info *ppi;
269
bf0720af 270 ppi = ppi_hash_table[hash_idx];
152f6476
JA
271 while (ppi) {
272 if (ppi->pid == pid)
273 return ppi;
274
275 ppi = ppi->hash_next;
276 }
277
278 return NULL;
279}
280
bf0720af
JA
281static struct per_process_info *find_process(__u32 pid, char *name)
282{
715d8021
JA
283 struct per_process_info *ppi;
284
bf0720af
JA
285 if (ppi_hash_by_pid)
286 return find_process_by_pid(pid);
287
715d8021
JA
288 ppi = find_process_by_name(name);
289 if (ppi && ppi->pid != pid)
290 ppi->more_than_one = 1;
291
292 return ppi;
bf0720af
JA
293}
294
2a1b3424 295static inline int trace_rb_insert(struct trace *t, struct rb_root *root)
7997c5b0 296{
2a1b3424 297 struct rb_node **p = &root->rb_node;
7997c5b0
JA
298 struct rb_node *parent = NULL;
299 struct trace *__t;
300
301 while (*p) {
302 parent = *p;
2a1b3424 303
7997c5b0
JA
304 __t = rb_entry(parent, struct trace, rb_node);
305
dcf0f7ed 306 if (t->bit->time < __t->bit->time)
e7c9f3ff
NS
307 p = &(*p)->rb_left;
308 else if (t->bit->time > __t->bit->time)
309 p = &(*p)->rb_right;
310 else if (t->bit->device < __t->bit->device)
311 p = &(*p)->rb_left;
312 else if (t->bit->device > __t->bit->device)
313 p = &(*p)->rb_right;
dcf0f7ed
JA
314 else if (t->bit->sequence < __t->bit->sequence)
315 p = &(*p)->rb_left;
316 else if (t->bit->sequence > __t->bit->sequence)
317 p = &(*p)->rb_right;
e7c9f3ff
NS
318 else if (t->bit->device == __t->bit->device) {
319 fprintf(stderr,
320 "sequence alias (%d) on device %d,%d!\n",
321 t->bit->sequence,
322 MAJOR(t->bit->device), MINOR(t->bit->device));
7997c5b0
JA
323 return 1;
324 }
325 }
326
327 rb_link_node(&t->rb_node, parent, p);
2a1b3424 328 rb_insert_color(&t->rb_node, root);
7997c5b0
JA
329 return 0;
330}
331
2a1b3424 332static inline int trace_rb_insert_sort(struct trace *t)
e3556946 333{
2a1b3424
JA
334 if (!trace_rb_insert(t, &rb_sort_root)) {
335 rb_sort_entries++;
336 return 0;
337 }
338
339 return 1;
340}
341
342static inline int trace_rb_insert_last(struct per_dev_info *pdi,struct trace *t)
343{
344 if (!trace_rb_insert(t, &pdi->rb_last)) {
345 pdi->rb_last_entries++;
346 return 0;
347 }
348
349 return 1;
350}
351
352static struct trace *trace_rb_find(dev_t device, unsigned long sequence,
353 struct rb_root *root, int order)
354{
355 struct rb_node *n = root->rb_node;
356 struct rb_node *prev = NULL;
e3556946
JA
357 struct trace *__t;
358
2a1b3424
JA
359 while (n) {
360 __t = rb_entry(n, struct trace, rb_node);
361 prev = n;
e3556946 362
0583b6a2 363 if (device < __t->bit->device)
2a1b3424 364 n = n->rb_left;
0583b6a2 365 else if (device > __t->bit->device)
2a1b3424 366 n = n->rb_right;
0583b6a2 367 else if (sequence < __t->bit->sequence)
2a1b3424 368 n = n->rb_left;
e3556946 369 else if (sequence > __t->bit->sequence)
2a1b3424 370 n = n->rb_right;
e3556946
JA
371 else
372 return __t;
373 }
374
2a1b3424
JA
375 /*
376 * hack - the list may not be sequence ordered because some
377 * events don't have sequence and time matched. so we end up
378 * being a little off in the rb lookup here, because we don't
379 * know the time we are looking for. compensate by browsing
380 * a little ahead from the last entry to find the match
381 */
382 if (order && prev) {
383 int max = 5;
384
385 while (((n = rb_next(prev)) != NULL) && max--) {
386 __t = rb_entry(n, struct trace, rb_node);
387
388 if (__t->bit->device == device &&
389 __t->bit->sequence == sequence)
390 return __t;
391
392 prev = n;
393 }
394 }
395
e3556946
JA
396 return NULL;
397}
398
2a1b3424
JA
399static inline struct trace *trace_rb_find_sort(dev_t dev, unsigned long seq)
400{
401 return trace_rb_find(dev, seq, &rb_sort_root, 1);
402}
403
404static inline struct trace *trace_rb_find_last(struct per_dev_info *pdi,
405 unsigned long seq)
406{
407 return trace_rb_find(pdi->id, seq, &pdi->rb_last, 0);
408}
409
7997c5b0
JA
410static inline int track_rb_insert(struct io_track *iot)
411{
412 struct rb_node **p = &rb_track_root.rb_node;
413 struct rb_node *parent = NULL;
414 struct io_track *__iot;
415
416 while (*p) {
417 parent = *p;
7997c5b0
JA
418 __iot = rb_entry(parent, struct io_track, rb_node);
419
e7c9f3ff
NS
420 if (iot->device < __iot->device)
421 p = &(*p)->rb_left;
422 else if (iot->device > __iot->device)
423 p = &(*p)->rb_right;
424 else if (iot->sector < __iot->sector)
7997c5b0
JA
425 p = &(*p)->rb_left;
426 else if (iot->sector > __iot->sector)
427 p = &(*p)->rb_right;
428 else {
e7c9f3ff 429 fprintf(stderr,
ab197ca7
AB
430 "sector alias (%Lu) on device %d,%d!\n",
431 (unsigned long long) iot->sector,
e7c9f3ff 432 MAJOR(iot->device), MINOR(iot->device));
7997c5b0
JA
433 return 1;
434 }
435 }
436
437 rb_link_node(&iot->rb_node, parent, p);
438 rb_insert_color(&iot->rb_node, &rb_track_root);
439 return 0;
440}
441
e7c9f3ff 442static struct io_track *__find_track(dev_t device, __u64 sector)
7997c5b0 443{
2a1b3424 444 struct rb_node *n = rb_track_root.rb_node;
7997c5b0
JA
445 struct io_track *__iot;
446
2a1b3424
JA
447 while (n) {
448 __iot = rb_entry(n, struct io_track, rb_node);
7997c5b0 449
e7c9f3ff 450 if (device < __iot->device)
2a1b3424 451 n = n->rb_left;
e7c9f3ff 452 else if (device > __iot->device)
2a1b3424 453 n = n->rb_right;
e7c9f3ff 454 else if (sector < __iot->sector)
2a1b3424 455 n = n->rb_left;
7997c5b0 456 else if (sector > __iot->sector)
2a1b3424 457 n = n->rb_right;
7997c5b0
JA
458 else
459 return __iot;
460 }
461
462 return NULL;
463}
464
bf0720af
JA
465static struct io_track *find_track(__u32 pid, char *comm, dev_t device,
466 __u64 sector)
7997c5b0 467{
916b5501 468 struct io_track *iot;
7997c5b0 469
e7c9f3ff 470 iot = __find_track(device, sector);
7997c5b0
JA
471 if (!iot) {
472 iot = malloc(sizeof(*iot));
50adc0ba 473 iot->pid = pid;
bf0720af 474 memcpy(iot->comm, comm, sizeof(iot->comm));
e7c9f3ff 475 iot->device = device;
7997c5b0
JA
476 iot->sector = sector;
477 track_rb_insert(iot);
478 }
479
480 return iot;
481}
482
a01516de 483static void log_track_frontmerge(struct blk_io_trace *t)
2e3e8ded
JA
484{
485 struct io_track *iot;
486
487 if (!track_ios)
488 return;
2e3e8ded 489
a01516de 490 iot = __find_track(t->device, t->sector + (t->bytes >> 9));
cb2a1a62 491 if (!iot) {
acd70d21
JA
492 fprintf(stderr, "merge not found for (%d,%d): %llu\n",
493 MAJOR(t->device), MINOR(t->device),
494 t->sector + (t->bytes >> 9));
cb2a1a62 495 return;
2e3e8ded 496 }
cb2a1a62
JA
497
498 rb_erase(&iot->rb_node, &rb_track_root);
a01516de 499 iot->sector -= t->bytes >> 9;
cb2a1a62 500 track_rb_insert(iot);
2e3e8ded
JA
501}
502
95c15013 503static void log_track_getrq(struct blk_io_trace *t)
2e3e8ded
JA
504{
505 struct io_track *iot;
506
507 if (!track_ios)
508 return;
509
bf0720af 510 iot = find_track(t->pid, t->comm, t->device, t->sector);
95c15013
JA
511 iot->allocation_time = t->time;
512}
513
95c15013 514/*
b6076a9b 515 * return time between rq allocation and insertion
95c15013 516 */
b6076a9b 517static unsigned long long log_track_insert(struct blk_io_trace *t)
95c15013 518{
50adc0ba 519 unsigned long long elapsed;
95c15013
JA
520 struct io_track *iot;
521
522 if (!track_ios)
523 return -1;
524
bf0720af 525 iot = find_track(t->pid, t->comm, t->device, t->sector);
2e3e8ded 526 iot->queue_time = t->time;
acd70d21
JA
527
528 if (!iot->allocation_time)
529 return -1;
530
50adc0ba
JA
531 elapsed = iot->queue_time - iot->allocation_time;
532
533 if (per_process_stats) {
bf0720af 534 struct per_process_info *ppi = find_process(iot->pid,iot->comm);
b9d40d6f 535 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
50adc0ba 536
b9d40d6f
JA
537 if (ppi && elapsed > ppi->longest_allocation_wait[w])
538 ppi->longest_allocation_wait[w] = elapsed;
50adc0ba
JA
539 }
540
541 return elapsed;
2e3e8ded
JA
542}
543
544/*
545 * return time between queue and issue
546 */
547static unsigned long long log_track_issue(struct blk_io_trace *t)
548{
50adc0ba 549 unsigned long long elapsed;
2e3e8ded
JA
550 struct io_track *iot;
551
552 if (!track_ios)
553 return -1;
554 if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
555 return -1;
556
e7c9f3ff 557 iot = __find_track(t->device, t->sector);
cb2a1a62 558 if (!iot) {
acd70d21
JA
559 fprintf(stderr, "issue not found for (%d,%d): %llu\n",
560 MAJOR(t->device), MINOR(t->device), t->sector);
2e3e8ded 561 return -1;
cb2a1a62 562 }
2e3e8ded
JA
563
564 iot->dispatch_time = t->time;
50adc0ba
JA
565 elapsed = iot->dispatch_time - iot->queue_time;
566
567 if (per_process_stats) {
bf0720af 568 struct per_process_info *ppi = find_process(iot->pid,iot->comm);
b9d40d6f 569 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
50adc0ba 570
b9d40d6f
JA
571 if (ppi && elapsed > ppi->longest_dispatch_wait[w])
572 ppi->longest_dispatch_wait[w] = elapsed;
50adc0ba
JA
573 }
574
575 return elapsed;
2e3e8ded
JA
576}
577
578/*
579 * return time between dispatch and complete
580 */
581static unsigned long long log_track_complete(struct blk_io_trace *t)
582{
583 unsigned long long elapsed;
584 struct io_track *iot;
585
586 if (!track_ios)
587 return -1;
588 if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0)
589 return -1;
590
e7c9f3ff 591 iot = __find_track(t->device, t->sector);
cb2a1a62 592 if (!iot) {
acd70d21
JA
593 fprintf(stderr, "complete not found for (%d,%d): %llu\n",
594 MAJOR(t->device), MINOR(t->device), t->sector);
2e3e8ded 595 return -1;
cb2a1a62 596 }
2e3e8ded
JA
597
598 iot->completion_time = t->time;
599 elapsed = iot->completion_time - iot->dispatch_time;
600
50adc0ba 601 if (per_process_stats) {
bf0720af 602 struct per_process_info *ppi = find_process(iot->pid,iot->comm);
b9d40d6f 603 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
50adc0ba 604
b9d40d6f
JA
605 if (ppi && elapsed > ppi->longest_completion_wait[w])
606 ppi->longest_completion_wait[w] = elapsed;
50adc0ba
JA
607 }
608
2e3e8ded
JA
609 /*
610 * kill the trace, we don't need it after completion
611 */
612 rb_erase(&iot->rb_node, &rb_track_root);
613 free(iot);
614
615 return elapsed;
616}
617
618
152f6476
JA
619static struct io_stats *find_process_io_stats(__u32 pid, char *name)
620{
bf0720af 621 struct per_process_info *ppi = find_process(pid, name);
152f6476
JA
622
623 if (!ppi) {
624 ppi = malloc(sizeof(*ppi));
625 memset(ppi, 0, sizeof(*ppi));
bf0720af 626 memcpy(ppi->name, name, 16);
152f6476
JA
627 ppi->pid = pid;
628 add_process_to_hash(ppi);
629 add_process_to_list(ppi);
630 }
631
632 return &ppi->io_stats;
633}
634
e7c9f3ff 635static void resize_cpu_info(struct per_dev_info *pdi, int cpu)
a718bd37 636{
e7c9f3ff
NS
637 struct per_cpu_info *cpus = pdi->cpus;
638 int ncpus = pdi->ncpus;
639 int new_count = cpu + 1;
640 int new_space, size;
a718bd37
NS
641 char *new_start;
642
e7c9f3ff
NS
643 size = new_count * sizeof(struct per_cpu_info);
644 cpus = realloc(cpus, size);
645 if (!cpus) {
646 char name[20];
647 fprintf(stderr, "Out of memory, CPU info for device %s (%d)\n",
648 get_dev_name(pdi, name, sizeof(name)), size);
a718bd37
NS
649 exit(1);
650 }
651
e7c9f3ff
NS
652 new_start = (char *)cpus + (ncpus * sizeof(struct per_cpu_info));
653 new_space = (new_count - ncpus) * sizeof(struct per_cpu_info);
a718bd37 654 memset(new_start, 0, new_space);
e7c9f3ff
NS
655
656 pdi->ncpus = new_count;
657 pdi->cpus = cpus;
658}
cb2a1a62 659
e7c9f3ff
NS
660static struct per_cpu_info *get_cpu_info(struct per_dev_info *pdi, int cpu)
661{
cb2a1a62
JA
662 struct per_cpu_info *pci;
663
e7c9f3ff
NS
664 if (cpu >= pdi->ncpus)
665 resize_cpu_info(pdi, cpu);
cb2a1a62
JA
666
667 pci = &pdi->cpus[cpu];
668 pci->cpu = cpu;
669 return pci;
a718bd37
NS
670}
671
e7c9f3ff
NS
672
673static int resize_devices(char *name)
a718bd37 674{
e7c9f3ff 675 int size = (ndevices + 1) * sizeof(struct per_dev_info);
c499bf38 676
e7c9f3ff
NS
677 devices = realloc(devices, size);
678 if (!devices) {
679 fprintf(stderr, "Out of memory, device %s (%d)\n", name, size);
680 return 1;
681 }
682 memset(&devices[ndevices], 0, sizeof(struct per_dev_info));
683 devices[ndevices].name = name;
684 ndevices++;
685 return 0;
686}
a718bd37 687
cb2a1a62 688static struct per_dev_info *get_dev_info(dev_t id)
e7c9f3ff 689{
cb2a1a62 690 struct per_dev_info *pdi;
e7c9f3ff 691 int i;
c499bf38 692
7d747d22
JA
693 for (i = 0; i < ndevices; i++) {
694 if (!devices[i].id)
695 devices[i].id = id;
e7c9f3ff
NS
696 if (devices[i].id == id)
697 return &devices[i];
7d747d22 698 }
cb2a1a62 699
2a1b3424 700 if (resize_devices(NULL))
e7c9f3ff 701 return NULL;
cb2a1a62
JA
702
703 pdi = &devices[ndevices - 1];
704 pdi->id = id;
0583b6a2 705 pdi->last_sequence = 0;
287fa3d6 706 pdi->last_read_time = 0;
2a1b3424
JA
707 memset(&pdi->rb_last, 0, sizeof(pdi->rb_last));
708 pdi->rb_last_entries = 0;
cb2a1a62 709 return pdi;
a718bd37
NS
710}
711
e7c9f3ff
NS
712static char *get_dev_name(struct per_dev_info *pdi, char *buffer, int size)
713{
714 if (pdi->name)
715 snprintf(buffer, size, "%s", pdi->name);
716 else
717 snprintf(buffer, size, "%d,%d", MAJOR(pdi->id), MINOR(pdi->id));
718 return buffer;
719}
720
e7c9f3ff 721static void check_time(struct per_dev_info *pdi, struct blk_io_trace *bit)
cfab07eb
AB
722{
723 unsigned long long this = bit->time;
e7c9f3ff 724 unsigned long long last = pdi->last_reported_time;
cfab07eb 725
e7c9f3ff
NS
726 pdi->backwards = (this < last) ? 'B' : ' ';
727 pdi->last_reported_time = this;
cfab07eb
AB
728}
729
152f6476
JA
730static inline void __account_m(struct io_stats *ios, struct blk_io_trace *t,
731 int rw)
d0ca268b
JA
732{
733 if (rw) {
152f6476
JA
734 ios->mwrites++;
735 ios->qwrite_kb += t->bytes >> 10;
d0ca268b 736 } else {
152f6476
JA
737 ios->mreads++;
738 ios->qread_kb += t->bytes >> 10;
739 }
740}
741
742static inline void account_m(struct blk_io_trace *t, struct per_cpu_info *pci,
743 int rw)
744{
745 __account_m(&pci->io_stats, t, rw);
746
747 if (per_process_stats) {
748 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
749
750 __account_m(ios, t, rw);
d0ca268b
JA
751 }
752}
753
b6076a9b
JA
754static inline void __account_queue(struct io_stats *ios, struct blk_io_trace *t,
755 int rw)
d0ca268b
JA
756{
757 if (rw) {
152f6476
JA
758 ios->qwrites++;
759 ios->qwrite_kb += t->bytes >> 10;
d0ca268b 760 } else {
152f6476
JA
761 ios->qreads++;
762 ios->qread_kb += t->bytes >> 10;
763 }
764}
765
b6076a9b
JA
766static inline void account_queue(struct blk_io_trace *t,
767 struct per_cpu_info *pci, int rw)
152f6476 768{
b6076a9b 769 __account_queue(&pci->io_stats, t, rw);
152f6476
JA
770
771 if (per_process_stats) {
772 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
773
b6076a9b 774 __account_queue(ios, t, rw);
d0ca268b
JA
775 }
776}
777
152f6476 778static inline void __account_c(struct io_stats *ios, int rw, unsigned int bytes)
d0ca268b
JA
779{
780 if (rw) {
152f6476
JA
781 ios->cwrites++;
782 ios->cwrite_kb += bytes >> 10;
d0ca268b 783 } else {
152f6476
JA
784 ios->creads++;
785 ios->cread_kb += bytes >> 10;
786 }
787}
788
789static inline void account_c(struct blk_io_trace *t, struct per_cpu_info *pci,
790 int rw, int bytes)
791{
792 __account_c(&pci->io_stats, rw, bytes);
793
794 if (per_process_stats) {
795 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
796
797 __account_c(ios, rw, bytes);
d0ca268b
JA
798 }
799}
800
b6076a9b
JA
801static inline void __account_issue(struct io_stats *ios, int rw,
802 unsigned int bytes)
afd2d7ad 803{
804 if (rw) {
152f6476
JA
805 ios->iwrites++;
806 ios->iwrite_kb += bytes >> 10;
afd2d7ad 807 } else {
152f6476
JA
808 ios->ireads++;
809 ios->iread_kb += bytes >> 10;
afd2d7ad 810 }
811}
812
b6076a9b
JA
813static inline void account_issue(struct blk_io_trace *t,
814 struct per_cpu_info *pci, int rw)
d0ca268b 815{
b6076a9b 816 __account_issue(&pci->io_stats, rw, t->bytes);
152f6476
JA
817
818 if (per_process_stats) {
819 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
d5396421 820
b6076a9b 821 __account_issue(ios, rw, t->bytes);
152f6476
JA
822 }
823}
824
06639b27
JA
825static inline void __account_unplug(struct io_stats *ios, int timer)
826{
827 if (timer)
828 ios->timer_unplugs++;
829 else
830 ios->io_unplugs++;
831}
832
833static inline void account_unplug(struct blk_io_trace *t,
834 struct per_cpu_info *pci, int timer)
835{
836 __account_unplug(&pci->io_stats, timer);
837
838 if (per_process_stats) {
839 struct io_stats *ios = find_process_io_stats(t->pid, t->comm);
840
841 __account_unplug(ios, timer);
842 }
843}
844
ab197ca7
AB
845static void log_complete(struct per_cpu_info *pci, struct blk_io_trace *t,
846 char *act)
847{
848 process_fmt(act, pci, t, log_track_complete(t), 0, NULL);
849}
850
b6076a9b
JA
851static void log_insert(struct per_cpu_info *pci, struct blk_io_trace *t,
852 char *act)
853{
854 process_fmt(act, pci, t, log_track_insert(t), 0, NULL);
855}
856
ab197ca7
AB
857static void log_queue(struct per_cpu_info *pci, struct blk_io_trace *t,
858 char *act)
859{
b6076a9b 860 process_fmt(act, pci, t, -1, 0, NULL);
ab197ca7 861}
2e3e8ded 862
ab197ca7
AB
863static void log_issue(struct per_cpu_info *pci, struct blk_io_trace *t,
864 char *act)
865{
866 process_fmt(act, pci, t, log_track_issue(t), 0, NULL);
d0ca268b
JA
867}
868
d5396421 869static void log_merge(struct per_cpu_info *pci, struct blk_io_trace *t,
3639a11e 870 char *act)
d0ca268b 871{
a01516de
JA
872 if (act[0] == 'F')
873 log_track_frontmerge(t);
2e3e8ded 874
ab197ca7 875 process_fmt(act, pci, t, -1ULL, 0, NULL);
d0ca268b
JA
876}
877
dfe34da1 878static void log_action(struct per_cpu_info *pci, struct blk_io_trace *t,
3639a11e 879 char *act)
dfe34da1 880{
ab197ca7 881 process_fmt(act, pci, t, -1ULL, 0, NULL);
dfe34da1
JA
882}
883
d5396421 884static void log_generic(struct per_cpu_info *pci, struct blk_io_trace *t,
3639a11e 885 char *act)
d0ca268b 886{
ab197ca7 887 process_fmt(act, pci, t, -1ULL, 0, NULL);
d0ca268b
JA
888}
889
ab197ca7 890static void log_unplug(struct per_cpu_info *pci, struct blk_io_trace *t,
3639a11e 891 char *act)
67e14fdc 892{
ab197ca7 893 process_fmt(act, pci, t, -1ULL, 0, NULL);
67e14fdc
JA
894}
895
93f1c611
JA
896static void log_split(struct per_cpu_info *pci, struct blk_io_trace *t,
897 char *act)
898{
899 process_fmt(act, pci, t, -1ULL, 0, NULL);
900}
901
ab197ca7 902static void log_pc(struct per_cpu_info *pci, struct blk_io_trace *t, char *act)
d0ca268b 903{
ab197ca7 904 unsigned char *buf = (unsigned char *) t + sizeof(*t);
d0ca268b 905
ab197ca7 906 process_fmt(act, pci, t, -1ULL, t->pdu_len, buf);
d0ca268b
JA
907}
908
ff3a732c 909static void dump_trace_pc(struct blk_io_trace *t, struct per_cpu_info *pci)
d0ca268b 910{
56f2af81
JA
911 int act = t->action & 0xffff;
912
913 switch (act) {
d0ca268b 914 case __BLK_TA_QUEUE:
3639a11e 915 log_generic(pci, t, "Q");
d0ca268b
JA
916 break;
917 case __BLK_TA_GETRQ:
3639a11e 918 log_generic(pci, t, "G");
d0ca268b
JA
919 break;
920 case __BLK_TA_SLEEPRQ:
3639a11e 921 log_generic(pci, t, "S");
d0ca268b
JA
922 break;
923 case __BLK_TA_REQUEUE:
3639a11e 924 log_generic(pci, t, "R");
d0ca268b
JA
925 break;
926 case __BLK_TA_ISSUE:
ab197ca7 927 log_pc(pci, t, "D");
d0ca268b
JA
928 break;
929 case __BLK_TA_COMPLETE:
3639a11e 930 log_pc(pci, t, "C");
d0ca268b 931 break;
56f2af81
JA
932 case __BLK_TA_INSERT:
933 log_pc(pci, t, "I");
934 break;
d0ca268b 935 default:
56f2af81 936 fprintf(stderr, "Bad pc action %x\n", act);
87b72777 937 break;
d0ca268b 938 }
d0ca268b
JA
939}
940
d5396421 941static void dump_trace_fs(struct blk_io_trace *t, struct per_cpu_info *pci)
d0ca268b
JA
942{
943 int w = t->action & BLK_TC_ACT(BLK_TC_WRITE);
7997c5b0 944 int act = t->action & 0xffff;
d0ca268b 945
7997c5b0 946 switch (act) {
d0ca268b 947 case __BLK_TA_QUEUE:
b6076a9b 948 account_queue(t, pci, w);
3639a11e 949 log_queue(pci, t, "Q");
d0ca268b 950 break;
b6076a9b
JA
951 case __BLK_TA_INSERT:
952 log_insert(pci, t, "I");
953 break;
d0ca268b 954 case __BLK_TA_BACKMERGE:
152f6476 955 account_m(t, pci, w);
3639a11e 956 log_merge(pci, t, "M");
d0ca268b
JA
957 break;
958 case __BLK_TA_FRONTMERGE:
152f6476 959 account_m(t, pci, w);
3639a11e 960 log_merge(pci, t, "F");
d0ca268b
JA
961 break;
962 case __BLK_TA_GETRQ:
95c15013 963 log_track_getrq(t);
3639a11e 964 log_generic(pci, t, "G");
d0ca268b
JA
965 break;
966 case __BLK_TA_SLEEPRQ:
3639a11e 967 log_generic(pci, t, "S");
d0ca268b
JA
968 break;
969 case __BLK_TA_REQUEUE:
152f6476 970 account_c(t, pci, w, -t->bytes);
3639a11e 971 log_queue(pci, t, "R");
d0ca268b
JA
972 break;
973 case __BLK_TA_ISSUE:
b6076a9b 974 account_issue(t, pci, w);
3639a11e 975 log_issue(pci, t, "D");
d0ca268b
JA
976 break;
977 case __BLK_TA_COMPLETE:
152f6476 978 account_c(t, pci, w, t->bytes);
3639a11e 979 log_complete(pci, t, "C");
d0ca268b 980 break;
88b1a526 981 case __BLK_TA_PLUG:
3639a11e 982 log_action(pci, t, "P");
88b1a526 983 break;
3639a11e 984 case __BLK_TA_UNPLUG_IO:
06639b27 985 account_unplug(t, pci, 0);
3639a11e
JA
986 log_unplug(pci, t, "U");
987 break;
988 case __BLK_TA_UNPLUG_TIMER:
06639b27 989 account_unplug(t, pci, 1);
3639a11e 990 log_unplug(pci, t, "UT");
88b1a526 991 break;
93f1c611
JA
992 case __BLK_TA_SPLIT:
993 log_split(pci, t, "X");
994 break;
995 case __BLK_TA_BOUNCE:
996 log_generic(pci, t, "B");
997 break;
d0ca268b
JA
998 default:
999 fprintf(stderr, "Bad fs action %x\n", t->action);
1f79c4a0 1000 break;
d0ca268b 1001 }
d0ca268b
JA
1002}
1003
ff3a732c
JA
1004static void dump_trace(struct blk_io_trace *t, struct per_cpu_info *pci,
1005 struct per_dev_info *pdi)
d0ca268b
JA
1006{
1007 if (t->action & BLK_TC_ACT(BLK_TC_PC))
ff3a732c 1008 dump_trace_pc(t, pci);
d0ca268b 1009 else
d5396421 1010 dump_trace_fs(t, pci);
87b72777 1011
e7c9f3ff 1012 pdi->events++;
d0ca268b
JA
1013}
1014
152f6476 1015static void dump_io_stats(struct io_stats *ios, char *msg)
5c017e4b 1016{
152f6476
JA
1017 fprintf(ofp, "%s\n", msg);
1018
1019 fprintf(ofp, " Reads Queued: %'8lu, %'8LuKiB\t", ios->qreads, ios->qread_kb);
1020 fprintf(ofp, " Writes Queued: %'8lu, %'8LuKiB\n", ios->qwrites,ios->qwrite_kb);
0a6b8fc4 1021
152f6476
JA
1022 fprintf(ofp, " Read Dispatches: %'8lu, %'8LuKiB\t", ios->ireads, ios->iread_kb);
1023 fprintf(ofp, " Write Dispatches: %'8lu, %'8LuKiB\n", ios->iwrites,ios->iwrite_kb);
1024 fprintf(ofp, " Reads Completed: %'8lu, %'8LuKiB\t", ios->creads, ios->cread_kb);
1025 fprintf(ofp, " Writes Completed: %'8lu, %'8LuKiB\n", ios->cwrites,ios->cwrite_kb);
1026 fprintf(ofp, " Read Merges: %'8lu%8c\t", ios->mreads, ' ');
152f6476 1027 fprintf(ofp, " Write Merges: %'8lu\n", ios->mwrites);
06639b27
JA
1028 fprintf(ofp, " IO unplugs: %'8lu%8c\t", ios->io_unplugs, ' ');
1029 fprintf(ofp, " Timer unplugs: %'8lu\n", ios->timer_unplugs);
5c017e4b
JA
1030}
1031
50adc0ba
JA
1032static void dump_wait_stats(struct per_process_info *ppi)
1033{
b9d40d6f
JA
1034 unsigned long rawait = ppi->longest_allocation_wait[0] / 1000;
1035 unsigned long rdwait = ppi->longest_dispatch_wait[0] / 1000;
1036 unsigned long rcwait = ppi->longest_completion_wait[0] / 1000;
1037 unsigned long wawait = ppi->longest_allocation_wait[1] / 1000;
1038 unsigned long wdwait = ppi->longest_dispatch_wait[1] / 1000;
1039 unsigned long wcwait = ppi->longest_completion_wait[1] / 1000;
1040
1041 fprintf(ofp, " Allocation wait: %'8lu%8c\t", rawait, ' ');
1042 fprintf(ofp, " Allocation wait: %'8lu\n", wawait);
1043 fprintf(ofp, " Dispatch wait: %'8lu%8c\t", rdwait, ' ');
1044 fprintf(ofp, " Dispatch wait: %'8lu\n", wdwait);
1045 fprintf(ofp, " Completion wait: %'8lu%8c\t", rcwait, ' ');
1046 fprintf(ofp, " Completion wait: %'8lu\n", wcwait);
50adc0ba
JA
1047}
1048
886ecf0e
JA
1049static int ppi_name_compare(const void *p1, const void *p2)
1050{
1051 struct per_process_info *ppi1 = *((struct per_process_info **) p1);
1052 struct per_process_info *ppi2 = *((struct per_process_info **) p2);
1053 int res;
1054
1055 res = strverscmp(ppi1->name, ppi2->name);
1056 if (!res)
06e6f286 1057 res = ppi1->pid > ppi2->pid;
886ecf0e
JA
1058
1059 return res;
1060}
1061
1062static void sort_process_list(void)
1063{
1064 struct per_process_info **ppis;
1065 struct per_process_info *ppi;
1066 int i = 0;
1067
1068 ppis = malloc(ppi_list_entries * sizeof(struct per_process_info *));
1069
1070 ppi = ppi_list;
1071 while (ppi) {
06e6f286 1072 ppis[i++] = ppi;
886ecf0e
JA
1073 ppi = ppi->list_next;
1074 }
1075
06e6f286 1076 qsort(ppis, ppi_list_entries, sizeof(ppi), ppi_name_compare);
886ecf0e
JA
1077
1078 i = ppi_list_entries - 1;
1079 ppi_list = NULL;
1080 while (i >= 0) {
1081 ppi = ppis[i];
1082
1083 ppi->list_next = ppi_list;
1084 ppi_list = ppi;
1085 i--;
1086 }
50c38702
JA
1087
1088 free(ppis);
886ecf0e
JA
1089}
1090
152f6476
JA
1091static void show_process_stats(void)
1092{
1093 struct per_process_info *ppi;
1094
886ecf0e
JA
1095 sort_process_list();
1096
152f6476
JA
1097 ppi = ppi_list;
1098 while (ppi) {
ce8b6b4f
JA
1099 char name[64];
1100
715d8021 1101 if (ppi->more_than_one)
bf0720af 1102 sprintf(name, "%s (%u, ...)", ppi->name, ppi->pid);
715d8021
JA
1103 else
1104 sprintf(name, "%s (%u)", ppi->name, ppi->pid);
bf0720af 1105
ce8b6b4f 1106 dump_io_stats(&ppi->io_stats, name);
50adc0ba 1107 dump_wait_stats(ppi);
152f6476
JA
1108 ppi = ppi->list_next;
1109 }
1110
1111 fprintf(ofp, "\n");
1112}
1113
e7c9f3ff 1114static void show_device_and_cpu_stats(void)
d0ca268b 1115{
e7c9f3ff
NS
1116 struct per_dev_info *pdi;
1117 struct per_cpu_info *pci;
1118 struct io_stats total, *ios;
1119 int i, j, pci_events;
1120 char line[3 + 8/*cpu*/ + 2 + 32/*dev*/ + 3];
1121 char name[32];
1122
1123 for (pdi = devices, i = 0; i < ndevices; i++, pdi++) {
1124
1125 memset(&total, 0, sizeof(total));
1126 pci_events = 0;
1127
1128 if (i > 0)
1129 fprintf(ofp, "\n");
1130
1131 for (pci = pdi->cpus, j = 0; j < pdi->ncpus; j++, pci++) {
1132 if (!pci->nelems)
1133 continue;
1134
1135 ios = &pci->io_stats;
1136 total.qreads += ios->qreads;
1137 total.qwrites += ios->qwrites;
1138 total.creads += ios->creads;
1139 total.cwrites += ios->cwrites;
1140 total.mreads += ios->mreads;
1141 total.mwrites += ios->mwrites;
1142 total.ireads += ios->ireads;
1143 total.iwrites += ios->iwrites;
1144 total.qread_kb += ios->qread_kb;
1145 total.qwrite_kb += ios->qwrite_kb;
1146 total.cread_kb += ios->cread_kb;
1147 total.cwrite_kb += ios->cwrite_kb;
1148 total.iread_kb += ios->iread_kb;
1149 total.iwrite_kb += ios->iwrite_kb;
06639b27
JA
1150 total.timer_unplugs += ios->timer_unplugs;
1151 total.io_unplugs += ios->io_unplugs;
e7c9f3ff
NS
1152
1153 snprintf(line, sizeof(line) - 1, "CPU%d (%s):",
1154 j, get_dev_name(pdi, name, sizeof(name)));
1155 dump_io_stats(ios, line);
1156 pci_events++;
1157 }
5c017e4b 1158
e7c9f3ff
NS
1159 if (pci_events > 1) {
1160 fprintf(ofp, "\n");
1161 snprintf(line, sizeof(line) - 1, "Total (%s):",
1162 get_dev_name(pdi, name, sizeof(name)));
1163 dump_io_stats(&total, line);
1164 }
d0ca268b 1165
dcf0f7ed 1166 fprintf(ofp, "\nEvents (%s): %'Lu entries, %'lu skips\n",
0aa8a62d 1167 get_dev_name(pdi, line, sizeof(line)), pdi->events,
dcf0f7ed 1168 pdi->skips);
e7c9f3ff 1169 }
d0ca268b
JA
1170}
1171
d36421e4
JA
1172/*
1173 * struct trace and blktrace allocation cache, we do potentially
1174 * millions of mallocs for these structures while only using at most
1175 * a few thousand at the time
1176 */
1177static inline void t_free(struct trace *t)
1178{
1179 t->next = t_alloc_list;
1180 t_alloc_list = t;
1181}
1182
1183static inline struct trace *t_alloc(void)
1184{
1185 struct trace *t = t_alloc_list;
1186
1187 if (t) {
1188 t_alloc_list = t->next;
1189 return t;
1190 }
1191
1192 return malloc(sizeof(*t));
1193}
1194
1195static inline void bit_free(struct blk_io_trace *bit)
1196{
1197 /*
1198 * abuse a 64-bit field for a next pointer for the free item
1199 */
0c39425c 1200 bit->time = (__u64) (unsigned long) bit_alloc_list;
d36421e4
JA
1201 bit_alloc_list = (struct blk_io_trace *) bit;
1202}
1203
1204static inline struct blk_io_trace *bit_alloc(void)
1205{
1206 struct blk_io_trace *bit = bit_alloc_list;
1207
1208 if (bit) {
0c39425c
NS
1209 bit_alloc_list = (struct blk_io_trace *) (unsigned long) \
1210 bit->time;
d36421e4
JA
1211 return bit;
1212 }
1213
1214 return malloc(sizeof(*bit));
1215}
1216
4f0ae44f
JA
1217static void find_genesis(void)
1218{
1219 struct trace *t = trace_list;
1220
1221 genesis_time = -1ULL;
1222 while (t != NULL) {
1223 if (t->bit->time < genesis_time)
1224 genesis_time = t->bit->time;
1225
1226 t = t->next;
1227 }
1228}
1229
7f4d89e6 1230static inline int check_stopwatch(struct blk_io_trace *bit)
4f0ae44f 1231{
7f4d89e6
JA
1232 if (bit->time < stopwatch_end &&
1233 bit->time >= stopwatch_start)
4f0ae44f
JA
1234 return 0;
1235
1236 return 1;
1237}
1238
53c68c88
JA
1239/*
1240 * return youngest entry read
1241 */
1242static int sort_entries(unsigned long long *youngest)
4f0ae44f
JA
1243{
1244 struct trace *t;
4f0ae44f
JA
1245
1246 if (!genesis_time)
1247 find_genesis();
1248
53c68c88 1249 *youngest = -1ULL;
4f0ae44f
JA
1250 while ((t = trace_list) != NULL) {
1251 struct blk_io_trace *bit = t->bit;
1252
1253 trace_list = t->next;
1254
1255 if (verify_trace(bit))
1256 continue;
1257
7f4d89e6 1258 bit->time -= genesis_time;
4f0ae44f 1259
7f4d89e6 1260 if (check_stopwatch(bit)) {
4f0ae44f
JA
1261 bit_free(bit);
1262 t_free(t);
1263 continue;
1264 }
1265
2a1b3424 1266 if (trace_rb_insert_sort(t))
53c68c88 1267 return -1;
4f0ae44f 1268
53c68c88
JA
1269 if (bit->time < *youngest)
1270 *youngest = bit->time;
4f0ae44f
JA
1271 }
1272
53c68c88 1273 return 0;
4f0ae44f
JA
1274}
1275
2a1b3424
JA
1276static inline void put_trace(struct per_dev_info *pdi, struct trace *t)
1277{
1278 rb_erase(&t->rb_node, &rb_sort_root);
1279 rb_sort_entries--;
1280
1281 trace_rb_insert_last(pdi, t);
1282
1283 if (pdi->rb_last_entries > 1024) {
1284 struct rb_node *n = rb_first(&pdi->rb_last);
1285
1286 t = rb_entry(n, struct trace, rb_node);
1287 rb_erase(n, &pdi->rb_last);
1288 pdi->rb_last_entries--;
1289
1290 bit_free(t->bit);
1291 t_free(t);
1292 }
1293}
1294
1295static int check_sequence(struct per_dev_info *pdi, struct blk_io_trace *bit,
1296 int force)
1297{
1298 unsigned long expected_sequence = pdi->last_sequence + 1;
1299 struct trace *t;
1300
1301 if (bit->sequence == expected_sequence)
1302 return 0;
1303
1304 if (bit->sequence < expected_sequence &&
1305 bit->time > pdi->last_reported_time)
1306 return 0;
1307
1308 /*
1309 * the wanted sequence is really there, continue
1310 * because this means that the log time is earlier
1311 * on the trace we have now */
1312 t = trace_rb_find_sort(pdi->id, expected_sequence);
1313 if (t)
1314 return 0;
1315
1316 t = trace_rb_find_last(pdi, expected_sequence);
1317 if (t)
1318 return 0;
1319
1320 /*
1321 * unless this is the last run, break and wait for more entries
1322 */
1323 if (!force)
1324 return 1;
1325
1326 fprintf(stderr, "(%d,%d): skipping %lu -> %u\n", MAJOR(pdi->id),
1327 MINOR(pdi->id), pdi->last_sequence, bit->sequence);
1328 pdi->skips++;
1329 return 0;
1330}
1331
a649216c 1332static void show_entries_rb(int force)
8fc0abbc 1333{
1f7afa72
JA
1334 struct per_dev_info *pdi = NULL;
1335 struct per_cpu_info *pci = NULL;
8fc0abbc 1336 struct blk_io_trace *bit;
3aabcd89 1337 struct rb_node *n;
8fc0abbc 1338 struct trace *t;
1f7afa72 1339
7d747d22 1340 while ((n = rb_first(&rb_sort_root)) != NULL) {
1f7afa72
JA
1341 if (done)
1342 break;
8fc0abbc
JA
1343
1344 t = rb_entry(n, struct trace, rb_node);
1345 bit = t->bit;
1346
287fa3d6
JA
1347 if (!pdi || pdi->id != bit->device)
1348 pdi = get_dev_info(bit->device);
1f7afa72 1349
e7c9f3ff
NS
1350 if (!pdi) {
1351 fprintf(stderr, "Unknown device ID? (%d,%d)\n",
1352 MAJOR(bit->device), MINOR(bit->device));
1353 break;
1354 }
1f7afa72
JA
1355
1356 if (bit->cpu > pdi->ncpus) {
e7c9f3ff 1357 fprintf(stderr, "Unknown CPU ID? (%d, device %d,%d)\n",
7d747d22
JA
1358 bit->cpu, MAJOR(bit->device),
1359 MINOR(bit->device));
87b72777 1360 break;
8fc0abbc
JA
1361 }
1362
2a1b3424
JA
1363 if (check_sequence(pdi, bit, force))
1364 break;
cb2a1a62 1365
a649216c 1366 if (!force && bit->time > last_allowed_time)
46e6968b 1367 break;
8fc0abbc 1368
be925321
JA
1369 pdi->last_sequence = bit->sequence;
1370
4f0ae44f 1371 check_time(pdi, bit);
8fc0abbc 1372
4f0ae44f
JA
1373 if (!pci || pci->cpu != bit->cpu)
1374 pci = get_cpu_info(pdi, bit->cpu);
287fa3d6 1375
4f0ae44f 1376 dump_trace(bit, pci, pdi);
87b72777 1377
2a1b3424 1378 put_trace(pdi, t);
cb2a1a62 1379 }
8fc0abbc
JA
1380}
1381
1f79c4a0
JA
1382static int read_data(int fd, void *buffer, int bytes, int block)
1383{
1384 int ret, bytes_left, fl;
1385 void *p;
1386
1387 fl = fcntl(fd, F_GETFL);
1388
1389 if (!block)
1390 fcntl(fd, F_SETFL, fl | O_NONBLOCK);
1391 else
1392 fcntl(fd, F_SETFL, fl & ~O_NONBLOCK);
1393
1394 bytes_left = bytes;
1395 p = buffer;
1396 while (bytes_left > 0) {
1397 ret = read(fd, p, bytes_left);
1398 if (!ret)
1399 return 1;
1400 else if (ret < 0) {
1401 if (errno != EAGAIN)
1402 perror("read");
a649216c 1403
1f79c4a0
JA
1404 return -1;
1405 } else {
1406 p += ret;
1407 bytes_left -= ret;
1408 }
1409 }
1410
1411 return 0;
1412}
1413
a649216c 1414static int read_events(int fd, int always_block)
cb2a1a62 1415{
287fa3d6 1416 struct per_dev_info *pdi = NULL;
7d747d22
JA
1417 int events = 0;
1418
1419 while (!is_done() && events < rb_batch) {
1420 struct blk_io_trace *bit;
1421 struct trace *t;
1422 int pdu_len;
1423 __u32 magic;
1424
d36421e4 1425 bit = bit_alloc();
cb2a1a62 1426
a649216c 1427 if (read_data(fd, bit, sizeof(*bit), !events || always_block))
cb2a1a62
JA
1428 break;
1429
7d747d22
JA
1430 magic = be32_to_cpu(bit->magic);
1431 if ((magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
1432 fprintf(stderr, "Bad magic %x\n", magic);
1433 break;
1434 }
1435
1436 pdu_len = be16_to_cpu(bit->pdu_len);
1437 if (pdu_len) {
1438 void *ptr = realloc(bit, sizeof(*bit) + pdu_len);
1439
1440 if (read_data(fd, ptr + sizeof(*bit), pdu_len, 1))
1441 break;
1442
1443 bit = ptr;
1444 }
1445
d36421e4 1446 t = t_alloc();
cb2a1a62
JA
1447 memset(t, 0, sizeof(*t));
1448 t->bit = bit;
1449
f1327cc5 1450 trace_to_cpu(bit);
4f0ae44f 1451
7d747d22
JA
1452 t->next = trace_list;
1453 trace_list = t;
1f7afa72 1454
287fa3d6
JA
1455 if (!pdi || pdi->id != bit->device)
1456 pdi = get_dev_info(bit->device);
1457
1458 if (bit->time > pdi->last_read_time)
1459 pdi->last_read_time = bit->time;
1460
7d747d22 1461 events++;
cb2a1a62
JA
1462 }
1463
7d747d22 1464 return events;
cb2a1a62
JA
1465}
1466
d5396421 1467static int do_file(void)
d0ca268b 1468{
7d747d22 1469 struct per_cpu_info *pci;
73877e12
NS
1470 struct per_dev_info *pdi;
1471 int i, j, events, events_added;
d0ca268b 1472
7d747d22
JA
1473 /*
1474 * first prepare all files for reading
1475 */
e8741a4a 1476 for (i = 0; i < ndevices; i++) {
73877e12
NS
1477 pdi = &devices[i];
1478 pdi->nfiles = 0;
0583b6a2 1479 pdi->last_sequence = 0;
73877e12 1480
74feaeb5 1481 for (j = 0;; j++) {
e7c9f3ff 1482 struct stat st;
87b72777 1483
e7c9f3ff
NS
1484 pci = get_cpu_info(pdi, j);
1485 pci->cpu = j;
7d747d22 1486 pci->fd = -1;
d0ca268b 1487
e7c9f3ff 1488 snprintf(pci->fname, sizeof(pci->fname)-1,
7d747d22 1489 "%s.blktrace.%d", pdi->name, pci->cpu);
e7c9f3ff
NS
1490 if (stat(pci->fname, &st) < 0)
1491 break;
74feaeb5
AB
1492 if (st.st_size) {
1493 pci->fd = open(pci->fname, O_RDONLY);
1494 if (pci->fd < 0) {
1495 perror(pci->fname);
1496 continue;
1497 }
e7c9f3ff
NS
1498 }
1499
7d747d22 1500 printf("Input file %s added\n", pci->fname);
73877e12 1501 pdi->nfiles++;
d0ca268b 1502 }
d5396421
JA
1503 }
1504
7d747d22
JA
1505 /*
1506 * now loop over the files reading in the data
1507 */
412819ce 1508 do {
53c68c88
JA
1509 unsigned long long youngest;
1510
7d747d22 1511 events_added = 0;
287fa3d6 1512 last_allowed_time = -1ULL;
d5396421 1513
7d747d22 1514 for (i = 0; i < ndevices; i++) {
73877e12
NS
1515 pdi = &devices[i];
1516
1517 for (j = 0; j < pdi->nfiles; j++) {
d5396421 1518
73877e12 1519 pci = get_cpu_info(pdi, j);
d5396421 1520
7d747d22
JA
1521 if (pci->fd == -1)
1522 continue;
51128a28 1523
a649216c 1524 events = read_events(pci->fd, 1);
7d747d22
JA
1525 if (!events) {
1526 close(pci->fd);
1527 pci->fd = -1;
1528 continue;
1529 }
d5396421 1530
287fa3d6
JA
1531 if (pdi->last_read_time < last_allowed_time)
1532 last_allowed_time = pdi->last_read_time;
d5396421 1533
7d747d22
JA
1534 events_added += events;
1535 }
2ff323b0 1536 }
d5396421 1537
53c68c88
JA
1538 if (sort_entries(&youngest))
1539 break;
1540
1541 if (youngest > stopwatch_end)
287fa3d6
JA
1542 break;
1543
a649216c 1544 show_entries_rb(0);
cb2a1a62 1545
7d747d22 1546 } while (events_added);
d5396421 1547
a649216c
JA
1548 if (rb_sort_entries)
1549 show_entries_rb(1);
1550
7d747d22 1551 return 0;
412819ce 1552}
d5396421 1553
412819ce
JA
1554static int do_stdin(void)
1555{
53c68c88 1556 unsigned long long youngest;
412819ce 1557 int fd;
d5396421 1558
be925321 1559 last_allowed_time = -1ULL;
1f79c4a0 1560 fd = dup(STDIN_FILENO);
412819ce
JA
1561 do {
1562 int events;
d5396421 1563
a649216c 1564 events = read_events(fd, 0);
412819ce
JA
1565 if (!events)
1566 break;
1567
53c68c88
JA
1568 if (sort_entries(&youngest))
1569 break;
1570
1571 if (youngest > stopwatch_end)
2ff323b0
JA
1572 break;
1573
a649216c 1574 show_entries_rb(0);
d5396421
JA
1575 } while (1);
1576
a649216c
JA
1577 if (rb_sort_entries)
1578 show_entries_rb(1);
1579
d5396421 1580 close(fd);
d5396421
JA
1581 return 0;
1582}
d0ca268b 1583
1f79c4a0 1584static void flush_output(void)
412819ce 1585{
152f6476 1586 fflush(ofp);
412819ce
JA
1587}
1588
1f79c4a0 1589static void handle_sigint(int sig)
412819ce
JA
1590{
1591 done = 1;
1592 flush_output();
1593}
1594
46e6968b
NS
1595/*
1596 * Extract start and duration times from a string, allowing
1597 * us to specify a time interval of interest within a trace.
1598 * Format: "duration" (start is zero) or "start:duration".
1599 */
1600static int find_stopwatch_interval(char *string)
1601{
1602 double value;
1603 char *sp;
1604
1605 value = strtod(string, &sp);
1606 if (sp == string) {
1607 fprintf(stderr,"Invalid stopwatch timer: %s\n", string);
1608 return 1;
1609 }
1610 if (*sp == ':') {
1611 stopwatch_start = DOUBLE_TO_NANO_ULL(value);
1612 string = sp + 1;
1613 value = strtod(string, &sp);
1614 if (sp == string || *sp != '\0') {
1615 fprintf(stderr,"Invalid stopwatch duration time: %s\n",
1616 string);
1617 return 1;
1618 }
1619 } else if (*sp != '\0') {
1620 fprintf(stderr,"Invalid stopwatch start timer: %s\n", string);
1621 return 1;
1622 }
1b928247
JA
1623 stopwatch_end = DOUBLE_TO_NANO_ULL(value);
1624 if (stopwatch_end <= stopwatch_start) {
1625 fprintf(stderr, "Invalid stopwatch interval: %Lu -> %Lu\n",
1626 stopwatch_start, stopwatch_end);
1627 return 1;
1628 }
1629
46e6968b
NS
1630 return 0;
1631}
1632
52724a0e
JA
1633static char usage_str[] = \
1634 "[ -i <input name> ] [-o <output name> [ -s ] [ -t ] [ -q ]\n" \
1635 "[ -w start:stop ] [ -f output format ] [ -F format spec ] [ -v] \n\n" \
1636 "\t-i Input file containing trace data, or '-' for stdin\n" \
1637 "\t-o Output file. If not given, output is stdout\n" \
1638 "\t-b stdin read batching\n" \
1639 "\t-s Show per-program io statistics\n" \
bf0720af 1640 "\t-n Hash processes by name, not pid\n" \
52724a0e
JA
1641 "\t-t Track individual ios. Will tell you the time a request took\n" \
1642 "\t to get queued, to get dispatched, and to get completed\n" \
1643 "\t-q Quiet. Don't display any stats at the end of the trace\n" \
1644 "\t-w Only parse data between the given time interval in seconds.\n" \
1645 "\t If 'start' isn't given, blkparse defaults the start time to 0\n" \
1646 "\t -f Output format. Customize the output format. The format field\n" \
1647 "\t identifies can be found in the documentation\n" \
1648 "\t-F Format specification. Can be found in the documentation\n" \
1649 "\t-v Print program version info\n\n";
1650
1f79c4a0
JA
1651static void usage(char *prog)
1652{
52724a0e 1653 fprintf(stderr, "Usage: %s %s %s", prog, blkparse_version, usage_str);
1f79c4a0
JA
1654}
1655
d5396421
JA
1656int main(int argc, char *argv[])
1657{
152f6476 1658 char *ofp_buffer;
a66877e6 1659 int c, ret, mode;
1e1c60f1 1660 int per_device_and_cpu_stats = 1;
d5396421
JA
1661
1662 while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) {
1663 switch (c) {
1664 case 'i':
e7c9f3ff
NS
1665 if (!strcmp(optarg, "-") && !pipeline)
1666 pipeline = 1;
1667 else if (resize_devices(optarg) != 0)
1668 return 1;
d5396421
JA
1669 break;
1670 case 'o':
66efebf8 1671 output_name = optarg;
d5396421 1672 break;
79f19470
JA
1673 case 'b':
1674 rb_batch = atoi(optarg);
1675 if (rb_batch <= 0)
1676 rb_batch = RB_BATCH_DEFAULT;
1677 break;
152f6476
JA
1678 case 's':
1679 per_process_stats = 1;
1680 break;
7997c5b0
JA
1681 case 't':
1682 track_ios = 1;
1683 break;
1e1c60f1
NS
1684 case 'q':
1685 per_device_and_cpu_stats = 0;
1686 break;
46e6968b
NS
1687 case 'w':
1688 if (find_stopwatch_interval(optarg) != 0)
1689 return 1;
1690 break;
ab197ca7
AB
1691 case 'f':
1692 set_all_format_specs(optarg);
1693 break;
1694 case 'F':
1695 if (add_format_spec(optarg) != 0)
1696 return 1;
1697 break;
bf0720af 1698 case 'n':
715d8021 1699 ppi_hash_by_pid = 0;
bf0720af 1700 break;
52724a0e
JA
1701 case 'v':
1702 printf("%s version %s\n", argv[0], blkparse_version);
1703 return 0;
d5396421 1704 default:
1f79c4a0 1705 usage(argv[0]);
d5396421
JA
1706 return 1;
1707 }
d0ca268b
JA
1708 }
1709
e7c9f3ff
NS
1710 while (optind < argc) {
1711 if (!strcmp(argv[optind], "-") && !pipeline)
1712 pipeline = 1;
1713 else if (resize_devices(argv[optind]) != 0)
1714 return 1;
1715 optind++;
1716 }
1717
1718 if (!pipeline && !ndevices) {
1f79c4a0 1719 usage(argv[0]);
d5396421
JA
1720 return 1;
1721 }
1722
7997c5b0
JA
1723 memset(&rb_sort_root, 0, sizeof(rb_sort_root));
1724 memset(&rb_track_root, 0, sizeof(rb_track_root));
412819ce
JA
1725
1726 signal(SIGINT, handle_sigint);
1727 signal(SIGHUP, handle_sigint);
1728 signal(SIGTERM, handle_sigint);
d5396421 1729
d69db225
JA
1730 setlocale(LC_NUMERIC, "en_US");
1731
a66877e6 1732 if (!output_name) {
152f6476 1733 ofp = fdopen(STDOUT_FILENO, "w");
a66877e6
JA
1734 mode = _IOLBF;
1735 } else {
152f6476
JA
1736 char ofname[128];
1737
7a9690c0 1738 snprintf(ofname, sizeof(ofname) - 1, "%s", output_name);
152f6476 1739 ofp = fopen(ofname, "w");
a66877e6 1740 mode = _IOFBF;
152f6476
JA
1741 }
1742
1743 if (!ofp) {
1744 perror("fopen");
1745 return 1;
1746 }
1747
1748 ofp_buffer = malloc(4096);
a66877e6 1749 if (setvbuf(ofp, ofp_buffer, mode, 4096)) {
152f6476
JA
1750 perror("setvbuf");
1751 return 1;
1752 }
1753
e7c9f3ff 1754 if (pipeline)
d5396421
JA
1755 ret = do_stdin();
1756 else
1757 ret = do_file();
1758
152f6476
JA
1759 if (per_process_stats)
1760 show_process_stats();
1761
1e1c60f1
NS
1762 if (per_device_and_cpu_stats)
1763 show_device_and_cpu_stats();
152f6476 1764
412819ce 1765 flush_output();
d5396421 1766 return ret;
d0ca268b 1767}