blkiomon: I/O monitor
[blktrace.git] / blkiomon.c
CommitLineData
cc19ddd6
MP
1/*
2 * I/O monitor based on block queue trace data
3 *
4 * Copyright IBM Corp. 2008
5 *
6 * Author(s): Martin Peschke <mp3@de.ibm.com>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23#include <sys/types.h>
24#include <sys/stat.h>
25#include <fcntl.h>
26#include <unistd.h>
27#include <stdio.h>
28#include <stdlib.h>
29#include <string.h>
30#include <signal.h>
31#include <getopt.h>
32#include <errno.h>
33#include <locale.h>
34#include <libgen.h>
35#include <sys/msg.h>
36#include <pthread.h>
37#include <time.h>
38
39#include "blktrace.h"
40#include "rbtree.h"
41#include "jhash.h"
42#include "blkiomon.h"
43
44struct trace {
45 struct blk_io_trace bit;
46 struct rb_node node;
47 struct trace *next;
48 long sequence;
49};
50
/*
 * Insertion point produced by a failed red-black tree lookup: where the
 * new node has to be linked and which node becomes its parent.
 */
struct rb_search {
	/* link slot the new node must be stored in */
	struct rb_node **node_ptr;
	/* node owning that slot, NULL for an empty tree */
	struct rb_node *parent;
};
55
56struct dstat_msg {
57 long mtype;
58 struct blkiomon_stat stat;
59};
60
61struct dstat {
62 struct dstat_msg msg;
63 struct rb_node node;
64 struct dstat *next;
65};
66
/* State of one output channel (human-readable, binary or debug). */
struct output {
	/* file name from the command line, NULL if the channel is unused */
	char *fn;
	/* stream opened by blkiomon_open_output() */
	FILE *fp;
	/* stdio buffer handed to setvbuf() */
	char *buf;
	/* non-zero when writing to stdout ("-") */
	int pipe;
};
73
74static char blkiomon_version[] = "0.2";
75
76static FILE *ifp;
77static int interval = -1;
78
79static struct trace *vacant_traces_list = NULL;
80static int vacant_traces = 0;
81static struct rb_root trace_tree = RB_ROOT;
82
83#define TRACE_HASH_SIZE 128
84struct trace *thash[TRACE_HASH_SIZE] = {};
85
86static struct dstat *vacant_dstats_list = NULL;
87static struct rb_root dstat_tree[2] = { RB_ROOT, RB_ROOT };
88static struct dstat *dstat_list[2] = {};
89static int dstat_curr = 0;
90
91static struct output human, binary, debug;
92
93static char *msg_q_name = NULL;
94static int msg_q_id = -1, msg_q = -1;
95static long msg_id = -1;
96
97static pthread_t interval_thread;
98static pthread_mutex_t dstat_mutex = PTHREAD_MUTEX_INITIALIZER;
99
100int data_is_native = -1;
101
102static int up = 1;
103
104/* debugging */
105static long leftover = 0, driverdata = 0, match = 0, mismatch = 0, sequence = 0;
106
107static void dump_bit(struct trace *t, const char *descr)
108{
109 struct blk_io_trace *bit = &t->bit;
110
111 if (!debug.fn)
112 return;
113
114 fprintf(debug.fp, "--- %s ---\n", descr);
115 fprintf(debug.fp, "magic %16d\n", bit->magic);
116 fprintf(debug.fp, "sequence %16d\n", bit->sequence);
117 fprintf(debug.fp, "time %16ld\n", (unsigned long)bit->time);
118 fprintf(debug.fp, "sector %16ld\n", (unsigned long)bit->sector);
119 fprintf(debug.fp, "bytes %16d\n", bit->bytes);
120 fprintf(debug.fp, "action %16x\n", bit->action);
121 fprintf(debug.fp, "pid %16d\n", bit->pid);
122 fprintf(debug.fp, "device %16d\n", bit->device);
123 fprintf(debug.fp, "cpu %16d\n", bit->cpu);
124 fprintf(debug.fp, "error %16d\n", bit->error);
125 fprintf(debug.fp, "pdu_len %16d\n", bit->pdu_len);
126
127 fprintf(debug.fp, "order %16ld\n", t->sequence);
128}
129
130static void dump_bits(struct trace *t1, struct trace *t2, const char *descr)
131{
132 struct blk_io_trace *bit1 = &t1->bit;
133 struct blk_io_trace *bit2 = &t2->bit;
134
135 if (!debug.fn)
136 return;
137
138 fprintf(debug.fp, "--- %s ---\n", descr);
139 fprintf(debug.fp, "magic %16d %16d\n", bit1->magic, bit2->magic);
140 fprintf(debug.fp, "sequence %16d %16d\n",
141 bit1->sequence, bit2->sequence);
142 fprintf(debug.fp, "time %16ld %16ld\n",
143 (unsigned long)bit1->time, (unsigned long)bit2->time);
144 fprintf(debug.fp, "sector %16ld %16ld\n",
145 (unsigned long)bit1->sector, (unsigned long)bit2->sector);
146 fprintf(debug.fp, "bytes %16d %16d\n", bit1->bytes, bit2->bytes);
147 fprintf(debug.fp, "action %16x %16x\n", bit1->action, bit2->action);
148 fprintf(debug.fp, "pid %16d %16d\n", bit1->pid, bit2->pid);
149 fprintf(debug.fp, "device %16d %16d\n", bit1->device, bit2->device);
150 fprintf(debug.fp, "cpu %16d %16d\n", bit1->cpu, bit2->cpu);
151 fprintf(debug.fp, "error %16d %16d\n", bit1->error, bit2->error);
152 fprintf(debug.fp, "pdu_len %16d %16d\n", bit1->pdu_len, bit2->pdu_len);
153
154 fprintf(debug.fp, "order %16ld %16ld\n", t1->sequence, t2->sequence);
155}
156
157static struct dstat *blkiomon_alloc_dstat(void)
158{
159 struct dstat *dstat;
160
161 if (vacant_dstats_list) {
162 dstat = vacant_dstats_list;
163 vacant_dstats_list = dstat->next;
164 } else
165 dstat = malloc(sizeof(*dstat));
166 if (!dstat) {
167 perror("blkiomon: could not allocate device statistic");
168 return NULL;
169 }
170
171 memset(dstat, 0, sizeof(*dstat));
172 return dstat;
173}
174
175static struct dstat *blkiomon_find_dstat(struct rb_search *search, __u32 device)
176{
177 struct rb_node **p = &(dstat_tree[dstat_curr].rb_node);
178 struct rb_node *parent = NULL;
179 struct dstat *dstat;
180
181 while (*p) {
182 parent = *p;
183
184 dstat = rb_entry(parent, struct dstat, node);
185
186 if (dstat->msg.stat.device < device)
187 p = &(*p)->rb_left;
188 else if (dstat->msg.stat.device > device)
189 p = &(*p)->rb_right;
190 else
191 return dstat;
192 }
193 search->node_ptr = p;
194 search->parent = parent;
195 return NULL;
196}
197
198static struct dstat *blkiomon_get_dstat(__u32 device)
199{
200 struct dstat *dstat;
201 struct rb_search search;
202
203 pthread_mutex_lock(&dstat_mutex);
204
205 dstat = blkiomon_find_dstat(&search, device);
206 if (dstat)
207 goto out;
208
209 dstat = blkiomon_alloc_dstat();
210 if (!dstat)
211 goto out;
212
213 dstat->msg.stat.device = device;
214 dstat->msg.stat.size_mm.min = -1ULL;
215 dstat->msg.stat.d2c_mm.min = -1ULL;
216
217 rb_link_node(&dstat->node, search.parent, search.node_ptr);
218 rb_insert_color(&dstat->node, &dstat_tree[dstat_curr]);
219
220 dstat->next = dstat_list[dstat_curr];
221 dstat_list[dstat_curr] = dstat;
222
223out:
224 pthread_mutex_unlock(&dstat_mutex);
225 return dstat;
226}
227
228static int blkiomon_output_msg_q(struct dstat *dstat)
229{
230 if (!msg_q_name)
231 return 0;
232
233 dstat->msg.mtype = msg_id;
234 return msgsnd(msg_q, &dstat->msg, sizeof(struct blkiomon_stat), 0);
235}
236
237static int blkiomon_output_binary(struct dstat *dstat)
238{
239 struct blkiomon_stat *p = &dstat->msg.stat;
240
241 if (!binary.fn)
242 return 0;
243
244 if (fwrite(p, sizeof(*p), 1, binary.fp) != 1)
245 goto failed;
246 if (binary.pipe && fflush(binary.fp))
247 goto failed;
248 return 0;
249
250failed:
251 fprintf(stderr, "blkiomon: could not write to %s\n", binary.fn);
252 fclose(binary.fp);
253 binary.fn = NULL;
254 return 1;
255}
256
257static struct dstat *blkiomon_output(struct dstat *head, struct timespec *ts)
258{
259 struct dstat *dstat, *tail = NULL;
260
261 for (dstat = head; dstat; dstat = dstat->next) {
262 dstat->msg.stat.time = ts->tv_sec;
263 blkiomon_stat_print(human.fp, &dstat->msg.stat);
264 blkiomon_stat_to_be(&dstat->msg.stat);
265 blkiomon_output_binary(dstat);
266 blkiomon_output_msg_q(dstat);
267 tail = dstat;
268 }
269 return tail;
270}
271
272static void *blkiomon_interval(void *data)
273{
274 struct timespec wake, r;
275 struct dstat *head, *tail;
276 int finished;
277
278 clock_gettime(CLOCK_REALTIME, &wake);
279
280 while (1) {
281 wake.tv_sec += interval;
282 if (clock_nanosleep(CLOCK_REALTIME, TIMER_ABSTIME, &wake, &r)) {
283 perror("blkiomon: interrupted sleep");
284 continue;
285 }
286
287 /* grab tree and make data gatherer build up another tree */
288 pthread_mutex_lock(&dstat_mutex);
289 finished = dstat_curr;
290 dstat_curr = dstat_curr ? 0 : 1;
291 pthread_mutex_unlock(&dstat_mutex);
292
293 head = dstat_list[finished];
294 if (!head)
295 continue;
296 dstat_list[finished] = NULL;
297 dstat_tree[finished] = RB_ROOT;
298 tail = blkiomon_output(head, &wake);
299
300 pthread_mutex_lock(&dstat_mutex);
301 tail->next = vacant_dstats_list;
302 vacant_dstats_list = head;
303 pthread_mutex_unlock(&dstat_mutex);
304 }
305 return data;
306}
307
308#define BLK_DATADIR(a) (((a) >> BLK_TC_SHIFT) & (BLK_TC_READ | BLK_TC_WRITE))
309
310static int blkiomon_account(struct blk_io_trace *bit_d,
311 struct blk_io_trace *bit_c)
312{
313 struct dstat *dstat;
314 struct blkiomon_stat *p;
315 __u64 d2c = (bit_c->time - bit_d->time) / 1000; /* ns -> us */
316 __u32 size = bit_d->bytes;
317
318 dstat = blkiomon_get_dstat(bit_d->device);
319 if (!dstat)
320 return 1;
321 p = &dstat->msg.stat;
322
323 if (BLK_DATADIR(bit_c->action) & BLK_TC_READ)
324 p->read++;
325 else if (BLK_DATADIR(bit_c->action) & BLK_TC_WRITE)
326 p->write++;
327 else
328 p->bidir++;
329
330 histlog2_account(p->size_hist, size, &size_hist);
331 histlog2_account(p->d2c_hist, d2c, &d2c_hist);
332 minmax_account(&p->size_mm, size);
333 minmax_account(&p->d2c_mm, d2c);
334 return 0;
335}
336
337static struct trace *blkiomon_alloc_trace(void)
338{
339 struct trace *t = vacant_traces_list;
340 if (t) {
341 vacant_traces_list = t->next;
342 vacant_traces--;
343 } else
344 t = malloc(sizeof(*t));
345 memset(t, 0, sizeof(*t));
346 return t;
347}
348
349static void blkiomon_free_trace(struct trace *t)
350{
351 if (vacant_traces < 256) {
352 t->next = vacant_traces_list;
353 vacant_traces_list = t;
354 vacant_traces++;
355 } else
356 free(t);
357}
358
359static int action(int a)
360{
361 int bits = BLK_TC_WRITE | BLK_TC_READ | BLK_TC_FS | BLK_TC_PC;
362 return a & (BLK_TC_ACT(bits));
363}
364
365static void blkiomon_store_trace(struct trace *t)
366{
367 int i = t->bit.sector % TRACE_HASH_SIZE;
368
369 t->next = thash[i];
370 thash[i] = t;
371}
372
373static struct trace *blkiomon_fetch_trace(struct blk_io_trace *bit)
374{
375 int i = bit->sector % TRACE_HASH_SIZE;
376 struct trace *t, *prev = NULL;
377
378 for (t = thash[i]; t; t = t->next) {
379 if (t->bit.device == bit->device &&
380 t->bit.sector == bit->sector &&
381 action(t->bit.action) == action(bit->action)) {
382 if (prev)
383 prev->next = t->next;
384 else
385 thash[i] = t->next;
386 return t;
387 }
388 prev = t;
389 }
390 return NULL;
391}
392
393static struct trace *blkiomon_do_trace(struct trace *t)
394{
395 struct trace *t_stored, *t_old, *t_young;
396
397 /* store trace if there is no match yet */
398 t_stored = blkiomon_fetch_trace(&t->bit);
399 if (!t_stored) {
400 blkiomon_store_trace(t);
401 return blkiomon_alloc_trace();
402 }
403
404 /* figure out older trace and younger trace */
405 if (t_stored->bit.time < t->bit.time) {
406 t_old = t_stored;
407 t_young = t;
408 } else {
409 t_old = t;
410 t_young = t_stored;
411 }
412
413 /* we need an older D trace and a younger C trace */
414 if (t_old->bit.action & BLK_TC_ACT(BLK_TC_ISSUE) &&
415 t_young->bit.action & BLK_TC_ACT(BLK_TC_COMPLETE)) {
416 /* matching D and C traces - update statistics */
417 match++;
418 blkiomon_account(&t_old->bit, &t_young->bit);
419 blkiomon_free_trace(t_stored);
420 return t;
421 }
422
423 /* no matching D and C traces - keep more recent trace */
424 dump_bits(t_old, t_young, "mismatch");
425 mismatch++;
426 blkiomon_store_trace(t_young);
427 return t_old;
428}
429
430static int blkiomon_do_fifo(void)
431{
432 struct trace *t;
433 struct blk_io_trace *bit;
434 void *pdu_buf = NULL;
435
436 t = blkiomon_alloc_trace();
437 if (!t)
438 return 1;
439 bit = &t->bit;
440
441 while (up) {
442 if (fread(bit, sizeof(*bit), 1, ifp) != 1) {
443 if (!feof(ifp))
444 fprintf(stderr,
445 "blkiomon: could not read trace");
446 break;
447 }
448 if (ferror(ifp)) {
449 clearerr(ifp);
450 perror("blkiomon: error while reading trace");
451 break;
452 }
453
454 if (data_is_native == -1 && check_data_endianness(bit->magic))
455 break;
456
457 /* endianess */
458 trace_to_cpu(bit);
459 if (verify_trace(bit)) {
460 perror("blkiomon: bad trace");
461 break;
462 }
463
464 /* read additional trace payload */
465 if (bit->pdu_len) {
466 pdu_buf = realloc(pdu_buf, bit->pdu_len);
467 if (fread(pdu_buf, bit->pdu_len, 1, ifp) != 1) {
468 clearerr(ifp);
469 perror("blkiomon: could not read payload");
470 break;
471 }
472 }
473
474 t->sequence = sequence++;
475
476 if (!(bit->action & BLK_TC_ACT(BLK_TC_ISSUE | BLK_TC_COMPLETE)))
477 continue;
478
479 /* try to find matching trace and update statistics */
480 t = blkiomon_do_trace(t);
481 if (!t)
482 break;
483 bit = &t->bit;
484 /* t and bit will be recycled for next incoming trace */
485 }
486 blkiomon_free_trace(t);
487 free(pdu_buf);
488 return 0;
489}
490
491static int blkiomon_open_output(struct output *out)
492{
493 int mode, vbuf_size;
494
495 if (!out->fn)
496 return 0;
497
498 if (!strcmp(out->fn, "-")) {
499 out->fp = fdopen(STDOUT_FILENO, "w");
500 mode = _IOLBF;
501 vbuf_size = 4096;
502 out->pipe = 1;
503 } else {
504 out->fp = fopen(out->fn, "w");
505 mode = _IOFBF;
506 vbuf_size = 128 * 1024;
507 out->pipe = 0;
508 }
509 if (!out->fp)
510 goto failed;
511 out->buf = malloc(128 * 1024);
512 if (setvbuf(out->fp, out->buf, mode, vbuf_size))
513 goto failed;
514 return 0;
515
516failed:
517 fprintf(stderr, "blkiomon: could not write to %s\n", out->fn);
518 out->fn = NULL;
519 free(out->buf);
520 return 1;
521}
522
523static int blkiomon_open_msg_q(void)
524{
525 key_t key;
526
527 if (!msg_q_name)
528 return 0;
529 if (!msg_q_id || msg_id <= 0)
530 return 1;
531 key = ftok(msg_q_name, msg_q_id);
532 if (key == -1)
533 return 1;
534 while (up) {
535 msg_q = msgget(key, S_IRWXU);
536 if (msg_q >= 0)
537 break;
538 }
539 return (msg_q >= 0 ? 0 : -1);
540}
541
542static void blkiomon_debug(void)
543{
544 struct rb_node *n;
545 struct trace *t;
546
547 if (!debug.fn)
548 return;
549
550 for (n = rb_first(&trace_tree); n; n = rb_next(n)) {
551 t = rb_entry(n, struct trace, node);
552 dump_bit(t, "leftover");
553 leftover++;
554 }
555 fprintf(debug.fp, "%ld leftover, %ld match, %ld mismatch, "
556 "%ld driverdata, %ld overall\n",
557 leftover, match, mismatch, driverdata, sequence);
558}
559
560#define S_OPTS "b:D:h:I:Q:q:m:V"
561
562static char usage_str[] = "\n\nblkiomon " \
563 "-I <interval> | --interval=<interval>\n" \
564 "[ -h <file> | --human-readable=<file> ]\n" \
565 "[ -b <file> | --binary=<file> ]\n" \
566 "[ -D <file> | --debug=<file> ]\n" \
567 "[ -Q <path name> | --msg-queue-name=<path name>]\n" \
568 "[ -q <msg queue id> | --msg-queue-id=<msg queue id>]\n" \
569 "[ -m <msg id> | --msg-id=<msg id>]\n" \
570 "[ -V | --version ]\n\n" \
571 "\t-I Sample interval.\n" \
572 "\t-h Human-readable output file.\n" \
573 "\t-b Binary output file.\n" \
574 "\t-D Output file for debugging data.\n" \
575 "\t-Qqm Output to message queue using given ID for messages.\n" \
576 "\t-V Print program version.\n\n";
577
/*
 * Long options for getopt_long(); each one maps to the short option of
 * the same meaning.
 */
static struct option l_opts[] = {
	{
		.name = "human-readable",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'h'
	},
	{
		.name = "binary",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'b'
	},
	{
		.name = "debug",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'D'
	},
	{
		.name = "interval",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'I'
	},
	{
		.name = "msg-queue",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'Q'
	},
	{
		/*
		 * This is the spelling the usage text documents; the shorter
		 * "msg-queue" above is kept for existing callers.
		 */
		.name = "msg-queue-name",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'Q'
	},
	{
		.name = "msg-queue-id",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'q'
	},
	{
		.name = "msg-id",
		.has_arg = required_argument,
		.flag = NULL,
		.val = 'm'
	},
	{
		.name = "version",
		.has_arg = no_argument,
		.flag = NULL,
		.val = 'V'
	},
	{
		/* end of table */
		.name = NULL,
	}
};
631
632static void blkiomon_signal(int signal)
633{
634 fprintf(stderr, "blkiomon: terminated by signal\n");
635 up = signal & 0;
636}
637
638int main(int argc, char *argv[])
639{
640 int c;
641
642 signal(SIGALRM, blkiomon_signal);
643 signal(SIGINT, blkiomon_signal);
644 signal(SIGTERM, blkiomon_signal);
645 signal(SIGQUIT, blkiomon_signal);
646
647 while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) {
648 switch (c) {
649 case 'h':
650 human.fn = optarg;
651 break;
652 case 'b':
653 binary.fn = optarg;
654 break;
655 case 'D':
656 debug.fn = optarg;
657 break;
658 case 'I':
659 interval = atoi(optarg);
660 break;
661 case 'Q':
662 msg_q_name = optarg;
663 break;
664 case 'q':
665 msg_q_id = atoi(optarg);
666 break;
667 case 'm':
668 msg_id = atoi(optarg);
669 break;
670 case 'V':
671 printf("%s version %s\n", argv[0], blkiomon_version);
672 return 0;
673 default:
674 fprintf(stderr, "Usage: %s", usage_str);
675 return 1;
676 }
677 }
678
679 if (interval <= 0) {
680 fprintf(stderr, "Usage: %s", usage_str);
681 return 1;
682 }
683
684 ifp = fdopen(STDIN_FILENO, "r");
685 if (!ifp) {
686 perror("blkiomon: could not open stdin for reading");
687 return 1;
688 }
689
690 if (blkiomon_open_output(&human))
691 return 1;
692 if (blkiomon_open_output(&binary))
693 return 1;
694 if (blkiomon_open_output(&debug))
695 return 1;
696 if (blkiomon_open_msg_q())
697 return 1;
698
699 if (pthread_create(&interval_thread, NULL, blkiomon_interval, NULL)) {
700 perror("blkiomon: could not create thread");
701 return 1;
702 }
703
704 blkiomon_do_fifo();
705
706 blkiomon_debug();
707 return 0;
708}