2 * I/O monitor based on block queue trace data
4 * Copyright IBM Corp. 2008
6 * Author(s): Martin Peschke <mp3@de.ibm.com>
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 #include <sys/types.h>
/*
 * NOTE(review): the lines below are members of several struct declarations
 * whose opening and closing lines are not visible in this view.
 */
/* raw block trace record; read elsewhere in this file as t->bit */
45 struct blk_io_trace bit;
/* link slot and parent node recorded by a tree lookup (search->node_ptr,
 * search->parent) and later handed to rb_link_node() */
52 struct rb_node **node_ptr;
53 struct rb_node *parent;
/* per-device statistics payload; read elsewhere as dstat->msg.stat */
58 struct blkiomon_stat stat;
/* version string printed by main() in its "version" message */
74 static char blkiomon_version[] = "0.2";
/* sample interval; added to wake.tv_sec by the interval thread, i.e. seconds */
77 static int interval = -1;
/* free list of recycled trace buffers, and the counter blkiomon_free_trace()
 * compares against its cap */
79 static struct trace *vacant_traces_list = NULL;
80 static int vacant_traces = 0;
/* hash table of stored traces; the bucket index is sector % TRACE_HASH_SIZE */
82 #define TRACE_HASH_SIZE 128
83 struct trace *thash[TRACE_HASH_SIZE] = {};
/* free list of recycled per-device statistics entries */
85 static struct dstat *vacant_dstats_list = NULL;
/* two statistics sets (a tree plus a list each); dstat_curr selects the set
 * currently being filled and is flipped by the interval thread */
86 static struct rb_root dstat_tree[2] = { RB_ROOT, RB_ROOT };
87 static struct dstat *dstat_list[2] = {};
88 static int dstat_curr = 0;
/* output channels, each opened in main() through blkiomon_open_output() */
90 static struct output drvdata, human, binary, debug;
/* message queue settings: msg_q_name and msg_q_id are the ftok() arguments,
 * msg_q holds the msgget() result, msg_id becomes the mtype of sent messages */
92 static char *msg_q_name = NULL;
93 static int msg_q_id = -1, msg_q = -1;
94 static long msg_id = -1;
/* periodic output thread, and the mutex taken around dstat lookup/insertion,
 * the dstat_curr flip and updates of the vacant dstat list */
96 static pthread_t interval_thread;
97 static pthread_mutex_t dstat_mutex = PTHREAD_MUTEX_INITIALIZER;
/* stays -1 until blkiomon_do_fifo() has run check_data_endianness();
 * presumably that helper assigns the final value -- confirm */
99 int data_is_native = -1;
/* counters reported by blkiomon_debug(); sequence is also stamped on every
 * trace read by blkiomon_do_fifo() */
104 static long leftover = 0, driverdata = 0, match = 0, mismatch = 0, sequence = 0;
/*
 * Print the fields of one trace record to the debug output, below a
 * "--- <descr> ---" heading.
 *
 * t:     trace whose blk_io_trace record and sequence number are printed
 * descr: label used in the heading line
 *
 * NOTE(review): time and sector are cast to unsigned long but formatted with
 * %ld (signed); the other fields use %d/%x. Confirm the conversions match the
 * field types declared for struct blk_io_trace.
 */
106 static void dump_bit(struct trace *t, const char *descr)
108 struct blk_io_trace *bit = &t->bit;
113 fprintf(debug.fp, "--- %s ---\n", descr);
114 fprintf(debug.fp, "magic %16d\n", bit->magic);
115 fprintf(debug.fp, "sequence %16d\n", bit->sequence);
116 fprintf(debug.fp, "time %16ld\n", (unsigned long)bit->time);
117 fprintf(debug.fp, "sector %16ld\n", (unsigned long)bit->sector);
118 fprintf(debug.fp, "bytes %16d\n", bit->bytes);
119 fprintf(debug.fp, "action %16x\n", bit->action);
120 fprintf(debug.fp, "pid %16d\n", bit->pid);
121 fprintf(debug.fp, "device %16d\n", bit->device);
122 fprintf(debug.fp, "cpu %16d\n", bit->cpu);
123 fprintf(debug.fp, "error %16d\n", bit->error);
124 fprintf(debug.fp, "pdu_len %16d\n", bit->pdu_len);
/* position of this trace in the input stream, assigned in blkiomon_do_fifo() */
126 fprintf(debug.fp, "order %16ld\n", t->sequence);
/*
 * Print the fields of two trace records side by side to the debug output,
 * below a "--- <descr> ---" heading; t1 fills the left column, t2 the right.
 *
 * t1, t2: traces to compare
 * descr:  label used in the heading line
 *
 * NOTE(review): as in dump_bit(), time and sector are cast to unsigned long
 * but formatted with %ld.
 */
129 static void dump_bits(struct trace *t1, struct trace *t2, const char *descr)
131 struct blk_io_trace *bit1 = &t1->bit;
132 struct blk_io_trace *bit2 = &t2->bit;
137 fprintf(debug.fp, "--- %s ---\n", descr);
138 fprintf(debug.fp, "magic %16d %16d\n", bit1->magic, bit2->magic);
139 fprintf(debug.fp, "sequence %16d %16d\n",
140 bit1->sequence, bit2->sequence);
141 fprintf(debug.fp, "time %16ld %16ld\n",
142 (unsigned long)bit1->time, (unsigned long)bit2->time);
143 fprintf(debug.fp, "sector %16ld %16ld\n",
144 (unsigned long)bit1->sector, (unsigned long)bit2->sector);
145 fprintf(debug.fp, "bytes %16d %16d\n", bit1->bytes, bit2->bytes);
146 fprintf(debug.fp, "action %16x %16x\n", bit1->action, bit2->action);
147 fprintf(debug.fp, "pid %16d %16d\n", bit1->pid, bit2->pid);
148 fprintf(debug.fp, "device %16d %16d\n", bit1->device, bit2->device);
149 fprintf(debug.fp, "cpu %16d %16d\n", bit1->cpu, bit2->cpu);
150 fprintf(debug.fp, "error %16d %16d\n", bit1->error, bit2->error);
151 fprintf(debug.fp, "pdu_len %16d %16d\n", bit1->pdu_len, bit2->pdu_len);
/* input-stream positions of the two traces */
153 fprintf(debug.fp, "order %16ld %16ld\n", t1->sequence, t2->sequence);
/*
 * Obtain a zero-filled struct dstat.
 *
 * When vacant_dstats_list is non-empty its head entry is unlinked and reused;
 * the malloc() call is presumably the fallback branch. The entry is cleared
 * with memset() before use.
 *
 * NOTE(review): the allocation-failure check and the return statements are
 * not visible here; the error message suggests a failure path exists --
 * confirm that memset() is not reached with a NULL pointer.
 */
156 static struct dstat *blkiomon_alloc_dstat(void)
160 if (vacant_dstats_list) {
/* pop the head of the free list */
161 dstat = vacant_dstats_list;
162 vacant_dstats_list = dstat->next;
164 dstat = malloc(sizeof(*dstat));
167 "blkiomon: could not allocate device statistic");
171 memset(dstat, 0, sizeof(*dstat));
/*
 * Search the currently active statistics tree (dstat_tree[dstat_curr]) for
 * the entry belonging to `device`, comparing against msg.stat.device.
 *
 * search: receives the link pointer and parent node reached by the descent;
 *         blkiomon_get_dstat() passes both to rb_link_node() for insertion
 * device: device number to look for
 *
 * NOTE(review): the loop, the branch bodies and the return statements are not
 * visible here; presumably the matching entry is returned, or NULL when the
 * device is not in the tree -- confirm.
 */
175 static struct dstat *blkiomon_find_dstat(struct rb_search *search, __u32 device)
177 struct rb_node **p = &(dstat_tree[dstat_curr].rb_node);
178 struct rb_node *parent = NULL;
184 dstat = rb_entry(parent, struct dstat, node);
186 if (dstat->msg.stat.device < device)
188 else if (dstat->msg.stat.device > device)
/* remember where a new node for this device would have to be linked */
193 search->node_ptr = p;
194 search->parent = parent;
/*
 * Look up the statistics entry for `device` in the active set and, on the
 * visible insertion path, allocate, initialise and link a new one.
 *
 * dstat_mutex is taken before the lookup and released after the insertion,
 * so the tree and list are not modified while the interval thread flips
 * dstat_curr.
 *
 * NOTE(review): the conditions selecting the insertion path and the return
 * statements are not visible here.
 */
198 static struct dstat *blkiomon_get_dstat(__u32 device)
201 struct rb_search search;
203 pthread_mutex_lock(&dstat_mutex);
205 dstat = blkiomon_find_dstat(&search, device);
209 dstat = blkiomon_alloc_dstat();
213 dstat->msg.stat.device = device;
/* minima start at the largest unsigned 64-bit value; presumably so the first
 * accounted sample replaces them -- confirm against minmax_account() */
214 dstat->msg.stat.size_r.min = -1ULL;
215 dstat->msg.stat.size_w.min = -1ULL;
216 dstat->msg.stat.d2c_r.min = -1ULL;
217 dstat->msg.stat.d2c_w.min = -1ULL;
/* link into the tree at the slot recorded by blkiomon_find_dstat() */
219 rb_link_node(&dstat->node, search.parent, search.node_ptr);
220 rb_insert_color(&dstat->node, &dstat_tree[dstat_curr]);
/* also push onto the list that the interval thread walks for output */
222 dstat->next = dstat_list[dstat_curr];
223 dstat_list[dstat_curr] = dstat;
226 pthread_mutex_unlock(&dstat_mutex);
/*
 * Send one device's statistics to the message queue msg_q, using msg_id as
 * the message type and sizeof(struct blkiomon_stat) as the payload size.
 * Returns the msgsnd() result.
 *
 * NOTE(review): msgsnd() is called with flags 0, i.e. without IPC_NOWAIT.
 * Any guard for an unconfigured queue is not visible here.
 */
230 static int blkiomon_output_msg_q(struct dstat *dstat)
235 dstat->msg.mtype = msg_id;
236 return msgsnd(msg_q, &dstat->msg, sizeof(struct blkiomon_stat), 0);
/*
 * Write one device's statistics record, as raw bytes, to the binary output
 * stream; the stream is flushed after the write when binary.pipe is set.
 * A message naming binary.fn is printed to stderr on the failure path.
 *
 * NOTE(review): the guard for a disabled binary output, the jump targets and
 * the return values are not visible here.
 */
239 static int blkiomon_output_binary(struct dstat *dstat)
241 struct blkiomon_stat *p = &dstat->msg.stat;
246 if (fwrite(p, sizeof(*p), 1, binary.fp) != 1)
248 if (binary.pipe && fflush(binary.fp))
253 fprintf(stderr, "blkiomon: could not write to %s\n", binary.fn);
/*
 * Emit every statistics entry of the list starting at `head`.
 *
 * head: first entry of the finished statistics list
 * ts:   timestamp of this interval; its tv_sec is stored in each entry
 *
 * For each entry: stamp the time, print it to the human-readable stream,
 * convert it in place with blkiomon_stat_to_be(), then pass it to the binary
 * and message queue outputs. Both of those therefore see the converted data.
 *
 * NOTE(review): the return values of blkiomon_output_binary() and
 * blkiomon_output_msg_q() are ignored. The handling of `tail` and the return
 * statement are not visible; the caller uses the result as the last list
 * entry.
 */
259 static struct dstat *blkiomon_output(struct dstat *head, struct timespec *ts)
261 struct dstat *dstat, *tail = NULL;
263 for (dstat = head; dstat; dstat = dstat->next) {
264 dstat->msg.stat.time = ts->tv_sec;
265 blkiomon_stat_print(human.fp, &dstat->msg.stat);
266 blkiomon_stat_to_be(&dstat->msg.stat);
267 blkiomon_output_binary(dstat);
268 blkiomon_output_msg_q(dstat);
/*
 * Thread body started from main(): sleeps until the next absolute wake-up
 * time (previous wake-up plus `interval` seconds, CLOCK_REALTIME), then swaps
 * the active statistics set, outputs the finished one and returns its entries
 * to the vacant list.
 *
 * data: unused in the visible lines
 *
 * NOTE(review): the "interrupted sleep" message has no trailing newline.
 * NOTE(review): `tail` is dereferenced after blkiomon_output(); the guard for
 * an empty list is not visible here -- confirm one exists.
 * The enclosing loop and the return statement are not visible either.
 */
274 static void *blkiomon_interval(void *data)
276 struct timespec wake, r;
277 struct dstat *head, *tail;
280 clock_gettime(CLOCK_REALTIME, &wake);
283 wake.tv_sec += interval;
284 if (clock_nanosleep(CLOCK_REALTIME, TIMER_ABSTIME, &wake, &r)) {
285 fprintf(stderr, "blkiomon: interrupted sleep");
289 /* grab tree and make data gatherer build up another tree */
290 pthread_mutex_lock(&dstat_mutex);
291 finished = dstat_curr;
292 dstat_curr = dstat_curr ? 0 : 1;
293 pthread_mutex_unlock(&dstat_mutex);
/* the finished set is handled without the mutex; after the flip, the lookup
 * and insertion code only touches the set selected by dstat_curr */
295 head = dstat_list[finished];
298 dstat_list[finished] = NULL;
299 dstat_tree[finished] = RB_ROOT;
300 tail = blkiomon_output(head, &wake);
/* splice the whole finished list onto the front of the vacant list */
302 pthread_mutex_lock(&dstat_mutex);
303 tail->next = vacant_dstats_list;
304 vacant_dstats_list = head;
305 pthread_mutex_unlock(&dstat_mutex);
/*
 * Extract the data direction bits from a trace action word: shift the action
 * right by BLK_TC_SHIFT and keep only BLK_TC_READ | BLK_TC_WRITE.
 */
310 #define BLK_DATADIR(a) (((a) >> BLK_TC_SHIFT) & (BLK_TC_READ | BLK_TC_WRITE))
/*
 * Account one matched pair of traces in the statistics of the issuing device.
 *
 * bit_d: older trace; supplies the request size, device and start time
 * bit_c: younger trace; supplies the end time and the data direction
 *
 * Latency (d2c), size and throughput go into the read or write min/max
 * statistics depending on the direction bits of bit_c->action; size and
 * latency are also added to the histograms.
 *
 * NOTE(review): d2c is 0 when the two timestamps are less than 1000 apart,
 * which makes the thrput initialiser divide by zero -- no guard is visible.
 * NOTE(review): size * 1000 is evaluated in the width of __u32 before being
 * widened; assuming __u32 is a 32-bit unsigned type, it wraps for sizes above
 * roughly 4 MB.
 * The handling of a failed blkiomon_get_dstat(), of actions with neither
 * direction bit, and the return statements are not visible here.
 */
312 static int blkiomon_account(struct blk_io_trace *bit_d,
313 struct blk_io_trace *bit_c)
316 struct blkiomon_stat *p;
317 __u64 d2c = (bit_c->time - bit_d->time) / 1000; /* ns -> us */
318 __u32 size = bit_d->bytes;
319 __u64 thrput = size * 1000 / d2c;
321 dstat = blkiomon_get_dstat(bit_d->device);
324 p = &dstat->msg.stat;
326 if (BLK_DATADIR(bit_c->action) & BLK_TC_READ) {
327 minmax_account(&p->thrput_r, thrput);
328 minmax_account(&p->size_r, size);
329 minmax_account(&p->d2c_r, d2c);
330 } else if (BLK_DATADIR(bit_c->action) & BLK_TC_WRITE) {
331 minmax_account(&p->thrput_w, thrput);
332 minmax_account(&p->size_w, size);
333 minmax_account(&p->d2c_w, d2c);
337 histlog2_account(p->size_hist, size, &size_hist);
338 histlog2_account(p->d2c_hist, d2c, &d2c_hist);
/*
 * Obtain a zero-filled struct trace, starting from the head of
 * vacant_traces_list and using malloc() as the other source.
 *
 * NOTE(review): no NULL check is visible between malloc() and memset(); if
 * none exists in the elided lines, an allocation failure reaches memset()
 * with a NULL pointer. The conditions around the list pop and the return
 * statement are not visible here.
 */
342 static struct trace *blkiomon_alloc_trace(void)
344 struct trace *t = vacant_traces_list;
346 vacant_traces_list = t->next;
349 t = malloc(sizeof(*t));
350 memset(t, 0, sizeof(*t));
/*
 * Give back a trace buffer. While fewer than 256 buffers are cached, the
 * buffer is pushed onto vacant_traces_list for reuse.
 *
 * NOTE(review): the update of vacant_traces and the branch taken once the cap
 * is reached are not visible here; presumably the buffer is freed then.
 */
354 static void blkiomon_free_trace(struct trace *t)
356 if (vacant_traces < 256) {
357 t->next = vacant_traces_list;
358 vacant_traces_list = t;
/*
 * Reduce a trace action word to the bits used for matching two traces: the
 * BLK_TC_ACT() form of the WRITE, READ, FS and PC categories.
 *
 * a: action word of a trace
 * Returns `a` masked to those category bits.
 */
364 static int action(int a)
366 int bits = BLK_TC_WRITE | BLK_TC_READ | BLK_TC_FS | BLK_TC_PC;
367 return a & (BLK_TC_ACT(bits));
/*
 * Store a trace in the hash table; the bucket index is the trace's sector
 * modulo TRACE_HASH_SIZE.
 *
 * NOTE(review): the statements that link the trace into thash[i] are not
 * visible here.
 */
370 static void blkiomon_store_trace(struct trace *t)
372 int i = t->bit.sector % TRACE_HASH_SIZE;
/*
 * Search the hash bucket selected by bit->sector for a stored trace that has
 * the same device, the same sector and the same masked action (see action())
 * as `bit`, and unlink it from the bucket chain.
 *
 * NOTE(review): the handling of a match at the head of the chain (prev still
 * NULL), the update of prev and the return statements are not visible here.
 */
378 static struct trace *blkiomon_fetch_trace(struct blk_io_trace *bit)
380 int i = bit->sector % TRACE_HASH_SIZE;
381 struct trace *t, *prev = NULL;
383 for (t = thash[i]; t; t = t->next) {
384 if (t->bit.device == bit->device &&
385 t->bit.sector == bit->sector &&
386 action(t->bit.action) == action(bit->action)) {
388 prev->next = t->next;
/*
 * Process one incoming trace: pair it with a stored trace for the same
 * request, or keep it for a later match.
 *
 * t: trace just read from the input
 *
 * A previously stored matching trace is fetched first. On the visible
 * store path, t is put into the hash table and a fresh buffer from
 * blkiomon_alloc_trace() is returned for the next read. Otherwise the two
 * traces are ordered by timestamp; an older ISSUE trace with a younger
 * COMPLETE trace is accounted and the stored trace is released, while any
 * other combination is dumped as "mismatch" and the younger trace is stored.
 *
 * NOTE(review): the assignments of t_old/t_young, the counter updates and the
 * remaining return statements are not visible here.
 */
398 static struct trace *blkiomon_do_trace(struct trace *t)
400 struct trace *t_stored, *t_old, *t_young;
402 /* store trace if there is no match yet */
403 t_stored = blkiomon_fetch_trace(&t->bit);
405 blkiomon_store_trace(t);
406 return blkiomon_alloc_trace();
409 /* figure out older trace and younger trace */
410 if (t_stored->bit.time < t->bit.time) {
418 /* we need an older D trace and a younger C trace */
419 if (t_old->bit.action & BLK_TC_ACT(BLK_TC_ISSUE) &&
420 t_young->bit.action & BLK_TC_ACT(BLK_TC_COMPLETE)) {
421 /* matching D and C traces - update statistics */
423 blkiomon_account(&t_old->bit, &t_young->bit);
424 blkiomon_free_trace(t_stored);
428 /* no matching D and C traces - keep more recent trace */
429 dump_bits(t_old, t_young, "mismatch");
431 blkiomon_store_trace(t_young);
/*
 * Forward a driver-data trace to the drvdata output stream: the trace header
 * is written first, then bit->pdu_len bytes of payload, and the stream is
 * flushed when drvdata.pipe is set. A message naming drvdata.fn is printed to
 * stderr on the failure path.
 *
 * bit:     trace header to write
 * pdu_buf: payload belonging to the header
 *
 * NOTE(review): fwrite() with a size of 0 returns 0, so a trace with
 * pdu_len == 0 would take the failure path unless the elided lines guard it.
 * The guard for a disabled output and the return values are not visible.
 */
435 static int blkiomon_dump_drvdata(struct blk_io_trace *bit, void *pdu_buf)
440 if (fwrite(bit, sizeof(*bit), 1, drvdata.fp) != 1)
442 if (fwrite(pdu_buf, bit->pdu_len, 1, drvdata.fp) != 1)
444 if (drvdata.pipe && fflush(drvdata.fp))
449 fprintf(stderr, "blkiomon: could not write to %s\n", drvdata.fn);
/*
 * Main input processing: read trace records from ifp one at a time, validate
 * them, read any payload, forward driver-data traces, and pass ISSUE and
 * COMPLETE traces to blkiomon_do_trace().
 *
 * NOTE(review): the realloc() result is assigned straight back to pdu_buf and
 * no check is visible; on failure the old buffer would leak and the following
 * fread() would receive NULL.
 * NOTE(review): the "error while reading trace" message has no trailing
 * newline.
 * The loop structure, the byte-order conversion of the record, the condition
 * guarding the payload read, and the return statements are not visible here.
 */
455 static int blkiomon_do_fifo(void)
458 struct blk_io_trace *bit;
459 void *pdu_buf = NULL;
461 t = blkiomon_alloc_trace();
467 if (fread(bit, sizeof(*bit), 1, ifp) != 1) {
470 "blkiomon: could not read trace");
475 fprintf(stderr, "blkiomon: error while reading trace");
/* the byte order check only runs while data_is_native is still -1 */
479 if (data_is_native == -1 && check_data_endianness(bit->magic)) {
480 fprintf(stderr, "blkiomon: endianess problem\n");
486 if (verify_trace(bit)) {
487 fprintf(stderr, "blkiomon: bad trace\n");
491 /* read additional trace payload */
493 pdu_buf = realloc(pdu_buf, bit->pdu_len);
494 if (fread(pdu_buf, bit->pdu_len, 1, ifp) != 1) {
496 fprintf(stderr, "blkiomon: could not read payload\n");
/* number the trace in arrival order; shown as "order" in the debug dumps */
501 t->sequence = sequence++;
503 /* forward low-level device driver trace to other tool */
504 if (bit->action & BLK_TC_ACT(BLK_TC_DRV_DATA)) {
506 if (blkiomon_dump_drvdata(bit, pdu_buf)) {
507 fprintf(stderr, "blkiomon: could not send trace\n");
/* only ISSUE and COMPLETE traces take part in the matching below */
513 if (!(bit->action & BLK_TC_ACT(BLK_TC_ISSUE | BLK_TC_COMPLETE)))
516 /* try to find matching trace and update statistics */
517 t = blkiomon_do_trace(t);
519 fprintf(stderr, "blkiomon: could not alloc trace\n");
523 /* t and bit will be recycled for next incoming trace */
525 blkiomon_free_trace(t);
/*
 * Open one output channel. The file name "-" selects standard output through
 * fdopen(); any other name is opened with fopen() in write mode. A stream
 * buffer is then installed with setvbuf(). A message naming out->fn is
 * printed to stderr on the failure path.
 *
 * out: channel description; fn is read, fp and buf are filled in
 *
 * NOTE(review): the buffer is allocated with the literal 128 * 1024 while
 * setvbuf() receives vbuf_size; both agree only on the path that sets
 * vbuf_size to 128 * 1024. No check of the malloc() result is visible.
 * The handling of an unset file name, the choice of `mode` and the return
 * values are not visible here.
 */
530 static int blkiomon_open_output(struct output *out)
537 if (!strcmp(out->fn, "-")) {
538 out->fp = fdopen(STDOUT_FILENO, "w");
543 out->fp = fopen(out->fn, "w");
545 vbuf_size = 128 * 1024;
550 out->buf = malloc(128 * 1024);
551 if (setvbuf(out->fp, out->buf, mode, vbuf_size))
556 fprintf(stderr, "blkiomon: could not write to %s\n", out->fn);
/*
 * Attach to the message queue identified by ftok(msg_q_name, msg_q_id).
 * Returns 0 when msgget() yields a valid queue handle, -1 otherwise.
 *
 * NOTE(review): `!msg_q_id` is only true for 0, while the default value of
 * msg_q_id is -1, so an unset id is not caught by that test.
 * NOTE(review): msgget() is called without IPC_CREAT, so the queue has to
 * exist already.
 * The handling of an unset queue name and of the parameter check's outcome is
 * not visible here.
 */
562 static int blkiomon_open_msg_q(void)
568 if (!msg_q_id || msg_id <= 0)
570 key = ftok(msg_q_name, msg_q_id);
574 msg_q = msgget(key, S_IRWXU);
578 return (msg_q >= 0 ? 0 : -1);
/*
 * Write the final debugging report: every trace still stored in the hash
 * table is dumped as "leftover", followed by one summary line with the
 * leftover, match, mismatch, driverdata and overall (sequence) counters.
 *
 * NOTE(review): the guard for a disabled debug output and the counter update
 * inside the loop are not visible here.
 */
581 static void blkiomon_debug(void)
589 for (i = 0; i < TRACE_HASH_SIZE; i++)
590 for (t = thash[i]; t; t = t->next) {
591 dump_bit(t, "leftover");
595 fprintf(debug.fp, "%ld leftover, %ld match, %ld mismatch, "
596 "%ld driverdata, %ld overall\n",
597 leftover, match, mismatch, driverdata, sequence);
/*
 * Short option string for getopt_long(). Every option except -V takes an
 * argument; -h is the human-readable output file, not a help switch.
 */
600 #define S_OPTS "b:d:D:h:I:Q:q:m:V"
/*
 * Usage text printed by main().
 * NOTE(review): the synopsis has no line for -d, although S_OPTS accepts it
 * and the option list below describes it.
 */
602 static char usage_str[] = "\n\nblkiomon " \
603 "-I <interval> | --interval=<interval>\n" \
604 "[ -h <file> | --human-readable=<file> ]\n" \
605 "[ -b <file> | --binary=<file> ]\n" \
606 "[ -D <file> | --debug=<file> ]\n" \
607 "[ -Q <path name> | --msg-queue-name=<path name>]\n" \
608 "[ -q <msg queue id> | --msg-queue-id=<msg queue id>]\n" \
609 "[ -m <msg id> | --msg-id=<msg id>]\n" \
610 "[ -V | --version ]\n\n" \
611 "\t-I Sample interval.\n" \
612 "\t-h Human-readable output file.\n" \
613 "\t-b Binary output file.\n" \
614 "\t-d Output file for data emitted by low level device driver.\n" \
615 "\t-D Output file for debugging data.\n" \
616 "\t-Qqm Output to message queue using given ID for messages.\n" \
617 "\t-V Print program version.\n\n";
/*
 * Long option table passed to getopt_long() in main().
 *
 * NOTE(review): most .name, .flag and .val initialisers are not visible here;
 * the visible entries show eight options that require an argument and one
 * that takes none.
 */
619 static struct option l_opts[] = {
621 .name = "human-readable",
622 .has_arg = required_argument,
628 .has_arg = required_argument,
634 .has_arg = required_argument,
640 .has_arg = required_argument,
646 .has_arg = required_argument,
652 .has_arg = required_argument,
657 .name = "msg-queue-id",
658 .has_arg = required_argument,
664 .has_arg = required_argument,
670 .has_arg = no_argument,
/*
 * Handler installed by main() for SIGALRM, SIGINT, SIGTERM and SIGQUIT;
 * reports the termination on stderr.
 *
 * signal: number of the delivered signal (not used in the visible line)
 *
 * NOTE(review): fprintf() is not async-signal-safe. Whatever else the handler
 * does is not visible here.
 */
679 static void blkiomon_signal(int signal)
681 fprintf(stderr, "blkiomon: terminated by signal\n");
685 int main(int argc, char *argv[])
689 signal(SIGALRM, blkiomon_signal);
690 signal(SIGINT, blkiomon_signal);
691 signal(SIGTERM, blkiomon_signal);
692 signal(SIGQUIT, blkiomon_signal);
694 while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) {
709 interval = atoi(optarg);
715 msg_q_id = atoi(optarg);
718 msg_id = atoi(optarg);
721 printf("%s version %s\n", argv[0], blkiomon_version);
724 fprintf(stderr, "Usage: %s", usage_str);
730 fprintf(stderr, "Usage: %s", usage_str);
734 ifp = fdopen(STDIN_FILENO, "r");
736 perror("blkiomon: could not open stdin for reading");
740 if (blkiomon_open_output(&human))
742 if (blkiomon_open_output(&binary))
744 if (blkiomon_open_output(&drvdata))
746 if (blkiomon_open_output(&debug))
748 if (blkiomon_open_msg_q())
751 if (pthread_create(&interval_thread, NULL, blkiomon_interval, NULL)) {
752 fprintf(stderr, "blkiomon: could not create thread");