blkiomon: I/O monitor
[blktrace.git] / blkiomon.c
diff --git a/blkiomon.c b/blkiomon.c
new file mode 100644 (file)
index 0000000..3ac4360
--- /dev/null
@@ -0,0 +1,708 @@
+/*
+ * I/O monitor based on block queue trace data
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Author(s): Martin Peschke <mp3@de.ibm.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+#include <getopt.h>
+#include <errno.h>
+#include <locale.h>
+#include <libgen.h>
+#include <sys/msg.h>
+#include <pthread.h>
+#include <time.h>
+
+#include "blktrace.h"
+#include "rbtree.h"
+#include "jhash.h"
+#include "blkiomon.h"
+
+/*
+ * One block trace record plus the bookkeeping used to pair issue and
+ * completion events.
+ */
+struct trace {
+       /* trace record read from the input stream */
+       struct blk_io_trace bit;
+       /* red-black tree linkage; only blkiomon_debug() walks trace_tree */
+       struct rb_node node;
+       /* next entry in a thash[] chain or in vacant_traces_list */
+       struct trace *next;
+       /* arrival order, assigned from the global counter in blkiomon_do_fifo() */
+       long sequence;
+};
+
+/*
+ * Insertion point reported by blkiomon_find_dstat() when a device is not
+ * yet in the tree; both members are handed on to rb_link_node().
+ */
+struct rb_search {
+       /* address of the empty child link where the new node belongs */
+       struct rb_node **node_ptr;
+       /* node owning that link, or NULL when the tree is empty */
+       struct rb_node *parent;
+};
+
+/*
+ * Message layout passed to msgsnd(): the message type followed by the
+ * statistics payload.
+ */
+struct dstat_msg {
+       /* message type; set to msg_id right before sending */
+       long mtype;
+       /* per-device statistics of one interval */
+       struct blkiomon_stat stat;
+};
+
+/*
+ * Per-device statistics entry. An entry is reachable through the
+ * red-black tree (lookup by device) and through a singly linked list
+ * (output and recycling).
+ */
+struct dstat {
+       /* statistics, wrapped so they can be sent as a queue message */
+       struct dstat_msg msg;
+       /* linkage in dstat_tree[] */
+       struct rb_node node;
+       /* next entry in dstat_list[] or in vacant_dstats_list */
+       struct dstat *next;
+};
+
+/* State of one output channel (human-readable, binary or debug). */
+struct output {
+       /* file name from the command line, "-" for stdout; NULL if unused */
+       char *fn;
+       /* stream opened by blkiomon_open_output() */
+       FILE *fp;
+       /* buffer handed to setvbuf() for this stream */
+       char *buf;
+       /* set to 1 when writing to stdout, 0 for a regular file */
+       int pipe;
+};
+
+static char blkiomon_version[] = "0.2";
+
+/* input stream carrying the trace records (stdin) */
+static FILE *ifp;
+/* reporting interval in seconds; has to be set with -I */
+static int interval = -1;
+
+/* recycled trace buffers and how many of them are currently parked */
+static struct trace *vacant_traces_list = NULL;
+static int vacant_traces = 0;
+/* only walked by blkiomon_debug(); nothing in this file inserts into it */
+static struct rb_root trace_tree = RB_ROOT;
+
+/* traces waiting for their counterpart, hashed by sector number */
+#define TRACE_HASH_SIZE 128
+struct trace *thash[TRACE_HASH_SIZE] = {};
+
+/*
+ * Statistics are double buffered: the trace reader fills the tree and
+ * list selected by dstat_curr while the interval thread writes out the
+ * other pair. Used-up entries are parked on vacant_dstats_list.
+ */
+static struct dstat *vacant_dstats_list = NULL;
+static struct rb_root dstat_tree[2] = { RB_ROOT, RB_ROOT };
+static struct dstat *dstat_list[2] = {};
+static int dstat_curr = 0;
+
+static struct output human, binary, debug;
+
+/* message queue output: path and project id for ftok(), queue handle */
+static char *msg_q_name = NULL;
+static int msg_q_id = -1, msg_q = -1;
+/* message type used for every message that is sent */
+static long msg_id = -1;
+
+static pthread_t interval_thread;
+/* guards dstat_curr, the dstat trees/lists and vacant_dstats_list */
+static pthread_mutex_t dstat_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/*
+ * Stays -1 until the byte order of the input is known.
+ * NOTE(review): not static, presumably because the shared blktrace
+ * helpers reference it - confirm against blktrace.h.
+ */
+int data_is_native = -1;
+
+/*
+ * Cleared by the signal handler to stop the processing loops. The flag
+ * is written asynchronously, so it has to be a volatile sig_atomic_t
+ * for the loops to be guaranteed to observe the change.
+ */
+static volatile sig_atomic_t up = 1;
+
+/* debugging */
+static long leftover = 0, driverdata = 0, match = 0, mismatch = 0, sequence = 0;
+
+/*
+ * Write every field of one trace to the debug output, labelled with
+ * descr. Does nothing unless a debug file was requested.
+ */
+static void dump_bit(struct trace *t, const char *descr)
+{
+       struct blk_io_trace *bit = &t->bit;
+
+       if (!debug.fn)
+               return;
+
+       fprintf(debug.fp, "--- %s ---\n", descr);
+       fprintf(debug.fp, "magic    %16d\n", bit->magic);
+       fprintf(debug.fp, "sequence %16d\n", bit->sequence);
+       /*
+        * time and sector go through unsigned long long so that the
+        * conversion specifier matches the argument and wide values are
+        * not truncated on platforms where long has 32 bits
+        */
+       fprintf(debug.fp, "time     %16llu\n", (unsigned long long)bit->time);
+       fprintf(debug.fp, "sector   %16llu\n",
+               (unsigned long long)bit->sector);
+       fprintf(debug.fp, "bytes    %16d\n", bit->bytes);
+       fprintf(debug.fp, "action   %16x\n", bit->action);
+       fprintf(debug.fp, "pid      %16d\n", bit->pid);
+       fprintf(debug.fp, "device   %16d\n", bit->device);
+       fprintf(debug.fp, "cpu      %16d\n", bit->cpu);
+       fprintf(debug.fp, "error    %16d\n", bit->error);
+       fprintf(debug.fp, "pdu_len  %16d\n", bit->pdu_len);
+
+       fprintf(debug.fp, "order    %16ld\n", t->sequence);
+}
+
+/*
+ * Write the fields of two traces side by side to the debug output,
+ * labelled with descr. Does nothing unless a debug file was requested.
+ */
+static void dump_bits(struct trace *t1, struct trace *t2, const char *descr)
+{
+       struct blk_io_trace *bit1 = &t1->bit;
+       struct blk_io_trace *bit2 = &t2->bit;
+
+       if (!debug.fn)
+               return;
+
+       fprintf(debug.fp, "--- %s ---\n", descr);
+       fprintf(debug.fp, "magic    %16d %16d\n", bit1->magic, bit2->magic);
+       fprintf(debug.fp, "sequence %16d %16d\n",
+               bit1->sequence, bit2->sequence);
+       /*
+        * time and sector go through unsigned long long so that the
+        * conversion specifier matches the argument and wide values are
+        * not truncated on platforms where long has 32 bits
+        */
+       fprintf(debug.fp, "time     %16llu %16llu\n",
+               (unsigned long long)bit1->time,
+               (unsigned long long)bit2->time);
+       fprintf(debug.fp, "sector   %16llu %16llu\n",
+               (unsigned long long)bit1->sector,
+               (unsigned long long)bit2->sector);
+       fprintf(debug.fp, "bytes    %16d %16d\n", bit1->bytes, bit2->bytes);
+       fprintf(debug.fp, "action   %16x %16x\n", bit1->action, bit2->action);
+       fprintf(debug.fp, "pid      %16d %16d\n", bit1->pid, bit2->pid);
+       fprintf(debug.fp, "device   %16d %16d\n", bit1->device, bit2->device);
+       fprintf(debug.fp, "cpu      %16d %16d\n", bit1->cpu, bit2->cpu);
+       fprintf(debug.fp, "error    %16d %16d\n", bit1->error, bit2->error);
+       fprintf(debug.fp, "pdu_len  %16d %16d\n", bit1->pdu_len, bit2->pdu_len);
+
+       fprintf(debug.fp, "order    %16ld %16ld\n", t1->sequence, t2->sequence);
+}
+
+/*
+ * Hand out a zeroed statistics entry, reusing a parked one when there
+ * is any and falling back to malloc() otherwise. Returns NULL (after
+ * reporting the error) if no memory is available.
+ */
+static struct dstat *blkiomon_alloc_dstat(void)
+{
+       struct dstat *entry = vacant_dstats_list;
+
+       if (entry)
+               vacant_dstats_list = entry->next;
+       else
+               entry = malloc(sizeof(struct dstat));
+
+       if (entry) {
+               memset(entry, 0, sizeof(struct dstat));
+               return entry;
+       }
+
+       perror("blkiomon: could not allocate device statistic");
+       return NULL;
+}
+
+/*
+ * Look up the statistics entry of a device in the tree that is
+ * currently being filled. Returns the entry if there is one; otherwise
+ * returns NULL and stores in *search where a new node has to be linked.
+ */
+static struct dstat *blkiomon_find_dstat(struct rb_search *search, __u32 device)
+{
+       struct rb_node **link = &dstat_tree[dstat_curr].rb_node;
+       struct rb_node *above = NULL;
+
+       while (*link != NULL) {
+               struct dstat *cur;
+
+               above = *link;
+               cur = rb_entry(above, struct dstat, node);
+
+               if (cur->msg.stat.device == device)
+                       return cur;
+
+               /* smaller keys descend to the left, larger ones to the right */
+               link = (cur->msg.stat.device < device) ? &above->rb_left
+                                                      : &above->rb_right;
+       }
+
+       search->node_ptr = link;
+       search->parent = above;
+       return NULL;
+}
+
+/*
+ * Return the statistics entry of a device for the current interval,
+ * creating and registering it on first use. The lookup and the
+ * insertion run under dstat_mutex. Returns NULL if a new entry could
+ * not be allocated.
+ */
+static struct dstat *blkiomon_get_dstat(__u32 device)
+{
+       struct rb_search where;
+       struct dstat *entry;
+
+       pthread_mutex_lock(&dstat_mutex);
+
+       entry = blkiomon_find_dstat(&where, device);
+       if (!entry) {
+               entry = blkiomon_alloc_dstat();
+               if (entry) {
+                       entry->msg.stat.device = device;
+                       /* minima start at the largest value possible */
+                       entry->msg.stat.size_mm.min = -1ULL;
+                       entry->msg.stat.d2c_mm.min = -1ULL;
+
+                       rb_link_node(&entry->node, where.parent,
+                                    where.node_ptr);
+                       rb_insert_color(&entry->node, &dstat_tree[dstat_curr]);
+
+                       /* push onto the list used for output */
+                       entry->next = dstat_list[dstat_curr];
+                       dstat_list[dstat_curr] = entry;
+               }
+       }
+
+       pthread_mutex_unlock(&dstat_mutex);
+       return entry;
+}
+
+/*
+ * Send one statistics entry to the message queue, using msg_id as the
+ * message type. Returns 0 when no queue was requested, otherwise the
+ * result of msgsnd().
+ */
+static int blkiomon_output_msg_q(struct dstat *dstat)
+{
+       if (msg_q_name) {
+               dstat->msg.mtype = msg_id;
+               return msgsnd(msg_q, &dstat->msg, sizeof(dstat->msg.stat), 0);
+       }
+       return 0;
+}
+
+/*
+ * Append one statistics record to the binary output and flush right
+ * away when that output is stdout. On a write error the stream is
+ * closed and binary output is switched off. Returns 0 on success or
+ * when binary output is not in use, 1 on failure.
+ */
+static int blkiomon_output_binary(struct dstat *dstat)
+{
+       struct blkiomon_stat *rec = &dstat->msg.stat;
+       int ok;
+
+       if (!binary.fn)
+               return 0;
+
+       ok = (fwrite(rec, sizeof(*rec), 1, binary.fp) == 1);
+       if (ok && binary.pipe)
+               ok = (fflush(binary.fp) == 0);
+       if (ok)
+               return 0;
+
+       fprintf(stderr, "blkiomon: could not write to %s\n", binary.fn);
+       fclose(binary.fp);
+       binary.fn = NULL;
+       return 1;
+}
+
+/*
+ * Emit every entry of a finished statistics list on all outputs, using
+ * ts->tv_sec as the time stamp. The record is passed to
+ * blkiomon_stat_to_be() after the human-readable print and before the
+ * binary and message queue output. Returns the last entry of the list,
+ * or NULL for an empty list.
+ */
+static struct dstat *blkiomon_output(struct dstat *head, struct timespec *ts)
+{
+       struct dstat *cur = head;
+       struct dstat *last = NULL;
+
+       while (cur) {
+               struct blkiomon_stat *st = &cur->msg.stat;
+
+               st->time = ts->tv_sec;
+               blkiomon_stat_print(human.fp, st);
+               blkiomon_stat_to_be(st);
+               blkiomon_output_binary(cur);
+               blkiomon_output_msg_q(cur);
+
+               last = cur;
+               cur = cur->next;
+       }
+       return last;
+}
+
+/*
+ * Thread body: once per interval take over the statistics gathered so
+ * far, write them to all outputs and recycle the entries. Runs until
+ * the process exits; data is passed through unused.
+ */
+static void *blkiomon_interval(void *data)
+{
+       struct timespec wake;
+       struct dstat *head, *tail;
+       int finished, rc;
+
+       clock_gettime(CLOCK_REALTIME, &wake);
+
+       while (1) {
+               wake.tv_sec += interval;
+
+               /*
+                * clock_nanosleep() returns the error number and leaves
+                * errno alone. An interrupted sleep is resumed with the
+                * same absolute deadline so that no interval is skipped.
+                */
+               do {
+                       rc = clock_nanosleep(CLOCK_REALTIME, TIMER_ABSTIME,
+                                            &wake, NULL);
+               } while (rc == EINTR);
+               if (rc) {
+                       fprintf(stderr, "blkiomon: interrupted sleep: %s\n",
+                               strerror(rc));
+                       continue;
+               }
+
+               /*
+                * grab tree and make data gatherer build up another tree;
+                * the finished list and tree are detached under the lock
+                * NOTE(review): blkiomon_account() updates an entry after
+                * blkiomon_get_dstat() dropped the lock, so an update may
+                * still hit an entry that is being written out here
+                */
+               pthread_mutex_lock(&dstat_mutex);
+               finished = dstat_curr;
+               dstat_curr = dstat_curr ? 0 : 1;
+               head = dstat_list[finished];
+               dstat_list[finished] = NULL;
+               dstat_tree[finished] = RB_ROOT;
+               pthread_mutex_unlock(&dstat_mutex);
+
+               if (!head)
+                       continue;
+               tail = blkiomon_output(head, &wake);
+
+               /* hand the whole list back for reuse */
+               pthread_mutex_lock(&dstat_mutex);
+               tail->next = vacant_dstats_list;
+               vacant_dstats_list = head;
+               pthread_mutex_unlock(&dstat_mutex);
+       }
+       return data;
+}
+
+/* data direction bits (read/write) of a trace action */
+#define BLK_DATADIR(a) (((a) >> BLK_TC_SHIFT) & (BLK_TC_READ | BLK_TC_WRITE))
+
+/*
+ * Account one request, described by its issue trace bit_d and its
+ * completion trace bit_c, in the statistics of the device: request
+ * counters by direction, plus histogram and min/max of the request size
+ * and of the issue-to-completion time in microseconds. Returns 1 if no
+ * statistics entry could be obtained, 0 otherwise.
+ */
+static int blkiomon_account(struct blk_io_trace *bit_d,
+                           struct blk_io_trace *bit_c)
+{
+       struct dstat *dstat = blkiomon_get_dstat(bit_d->device);
+       struct blkiomon_stat *p;
+       __u64 d2c;
+       __u32 size;
+
+       if (!dstat)
+               return 1;
+
+       p = &dstat->msg.stat;
+       size = bit_d->bytes;
+       d2c = (bit_c->time - bit_d->time) / 1000; /* ns -> us */
+
+       if (BLK_DATADIR(bit_c->action) & BLK_TC_READ) {
+               p->read++;
+       } else if (BLK_DATADIR(bit_c->action) & BLK_TC_WRITE) {
+               p->write++;
+       } else {
+               p->bidir++;
+       }
+
+       histlog2_account(p->size_hist, size, &size_hist);
+       minmax_account(&p->size_mm, size);
+
+       histlog2_account(p->d2c_hist, d2c, &d2c_hist);
+       minmax_account(&p->d2c_mm, d2c);
+
+       return 0;
+}
+
+/*
+ * Hand out a zeroed trace buffer, reusing a parked one when there is
+ * any and falling back to malloc() otherwise. Returns NULL (after
+ * reporting the error) if no memory is available; the result of a
+ * failed malloc() is no longer passed to memset().
+ */
+static struct trace *blkiomon_alloc_trace(void)
+{
+       struct trace *t = vacant_traces_list;
+
+       if (t) {
+               vacant_traces_list = t->next;
+               vacant_traces--;
+       } else {
+               t = malloc(sizeof(*t));
+               if (!t) {
+                       perror("blkiomon: could not allocate trace");
+                       return NULL;
+               }
+       }
+       memset(t, 0, sizeof(*t));
+       return t;
+}
+
+/*
+ * Give a trace buffer back. Up to 256 buffers are parked for reuse,
+ * any further ones are released with free(). A NULL pointer is
+ * accepted and ignored, like free() does, because blkiomon_do_fifo()
+ * may end up without a buffer after a failed allocation.
+ */
+static void blkiomon_free_trace(struct trace *t)
+{
+       if (!t)
+               return;
+
+       if (vacant_traces < 256) {
+               t->next = vacant_traces_list;
+               vacant_traces_list = t;
+               vacant_traces++;
+       } else
+               free(t);
+}
+
+/*
+ * Reduce an action word to the category bits that have to agree for
+ * two traces to be paired: read, write, fs and pc.
+ */
+static int action(int a)
+{
+       const int relevant = BLK_TC_READ | BLK_TC_WRITE | BLK_TC_PC | BLK_TC_FS;
+
+       return BLK_TC_ACT(relevant) & a;
+}
+
+/*
+ * Park a trace at the head of the hash chain selected by its sector
+ * number until the counterpart shows up.
+ */
+static void blkiomon_store_trace(struct trace *t)
+{
+       struct trace **bucket = &thash[t->bit.sector % TRACE_HASH_SIZE];
+
+       t->next = *bucket;
+       *bucket = t;
+}
+
+/*
+ * Find a parked trace with the same device, sector and action category
+ * as bit, unlink it from its hash chain and return it. Returns NULL if
+ * there is no such trace.
+ */
+static struct trace *blkiomon_fetch_trace(struct blk_io_trace *bit)
+{
+       struct trace **link = &thash[bit->sector % TRACE_HASH_SIZE];
+       struct trace *cand;
+
+       for (; (cand = *link) != NULL; link = &cand->next) {
+               if (cand->bit.device != bit->device)
+                       continue;
+               if (cand->bit.sector != bit->sector)
+                       continue;
+               if (action(cand->bit.action) != action(bit->action))
+                       continue;
+
+               /* link is either the bucket head or the predecessor's next */
+               *link = cand->next;
+               return cand;
+       }
+       return NULL;
+}
+
+/*
+ * Process one issue or completion trace. Without a parked counterpart
+ * the trace is parked itself. With a counterpart, an older issue and a
+ * younger completion are accounted as one request; any other pairing
+ * is logged as a mismatch and only the younger trace stays parked.
+ * Returns the buffer the caller has to use for the next record, which
+ * is NULL if a fresh buffer was needed and could not be allocated.
+ */
+static struct trace *blkiomon_do_trace(struct trace *t)
+{
+       struct trace *partner = blkiomon_fetch_trace(&t->bit);
+       struct trace *earlier, *later;
+       int issue_then_complete;
+
+       /* nothing to pair with yet: keep t and continue with a new buffer */
+       if (partner == NULL) {
+               blkiomon_store_trace(t);
+               return blkiomon_alloc_trace();
+       }
+
+       /* order the two traces by time stamp */
+       if (partner->bit.time < t->bit.time) {
+               earlier = partner;
+               later = t;
+       } else {
+               earlier = t;
+               later = partner;
+       }
+
+       issue_then_complete =
+               (earlier->bit.action & BLK_TC_ACT(BLK_TC_ISSUE)) &&
+               (later->bit.action & BLK_TC_ACT(BLK_TC_COMPLETE));
+
+       if (!issue_then_complete) {
+               /* not a D/C pair: the more recent trace stays parked */
+               dump_bits(earlier, later, "mismatch");
+               mismatch++;
+               blkiomon_store_trace(later);
+               return earlier;
+       }
+
+       /* older D and younger C: one completed request */
+       match++;
+       blkiomon_account(&earlier->bit, &later->bit);
+       blkiomon_free_trace(partner);
+       return t;
+}
+
+/*
+ * Main loop of the trace reader: read trace records from ifp until end
+ * of input, an error or a termination signal, and feed issue and
+ * completion records to blkiomon_do_trace(). Any payload following a
+ * record is read and discarded. Returns 1 if the first trace buffer
+ * could not be allocated, 0 otherwise.
+ */
+static int blkiomon_do_fifo(void)
+{
+       struct trace *t;
+       struct blk_io_trace *bit;
+       void *pdu_buf = NULL;
+
+       t = blkiomon_alloc_trace();
+       if (!t)
+               return 1;
+       bit = &t->bit;
+
+       while (up) {
+               /* a short read is either end of input or an I/O error */
+               if (fread(bit, sizeof(*bit), 1, ifp) != 1) {
+                       if (!feof(ifp))
+                               fprintf(stderr,
+                                       "blkiomon: could not read trace\n");
+                       break;
+               }
+
+               if (data_is_native == -1 && check_data_endianness(bit->magic))
+                       break;
+
+               /* endianness */
+               trace_to_cpu(bit);
+               if (verify_trace(bit)) {
+                       /* errno carries no information here, so no perror() */
+                       fprintf(stderr, "blkiomon: bad trace\n");
+                       break;
+               }
+
+               /* read additional trace payload */
+               if (bit->pdu_len) {
+                       /* keep the old buffer reachable if realloc() fails */
+                       void *tmp = realloc(pdu_buf, bit->pdu_len);
+
+                       if (!tmp) {
+                               perror("blkiomon: could not allocate payload");
+                               break;
+                       }
+                       pdu_buf = tmp;
+                       if (fread(pdu_buf, bit->pdu_len, 1, ifp) != 1) {
+                               fprintf(stderr,
+                                       "blkiomon: could not read payload\n");
+                               break;
+                       }
+               }
+
+               t->sequence = sequence++;
+
+               /* only issue and completion events are of interest */
+               if (!(bit->action & BLK_TC_ACT(BLK_TC_ISSUE | BLK_TC_COMPLETE)))
+                       continue;
+
+               /* try to find matching trace and update statistics */
+               t = blkiomon_do_trace(t);
+               if (!t)
+                       break;
+               bit = &t->bit;
+               /* t and bit will be recycled for next incoming trace */
+       }
+
+       /* t is NULL when blkiomon_do_trace() ran out of memory */
+       if (t)
+               blkiomon_free_trace(t);
+       free(pdu_buf);
+       return 0;
+}
+
+/*
+ * Open one output channel. The name "-" selects stdout with line
+ * buffering, anything else is created as a fully buffered file.
+ * Returns 0 on success or when the channel is not in use. On failure
+ * returns 1 with the stream closed, the buffer released and the
+ * channel disabled (fn set to NULL).
+ */
+static int blkiomon_open_output(struct output *out)
+{
+       int mode, vbuf_size;
+
+       if (!out->fn)
+               return 0;
+
+       out->buf = NULL;
+       if (!strcmp(out->fn, "-")) {
+               out->fp = fdopen(STDOUT_FILENO, "w");
+               mode = _IOLBF;
+               vbuf_size = 4096;
+               out->pipe = 1;
+       } else {
+               out->fp = fopen(out->fn, "w");
+               mode = _IOFBF;
+               vbuf_size = 128 * 1024;
+               out->pipe = 0;
+       }
+       if (!out->fp)
+               goto failed;
+
+       /*
+        * allocate just what setvbuf() is told to use; if malloc() fails
+        * setvbuf() gets NULL and allocates a buffer on its own
+        */
+       out->buf = malloc(vbuf_size);
+       if (setvbuf(out->fp, out->buf, mode, vbuf_size))
+               goto failed;
+       return 0;
+
+failed:
+       fprintf(stderr, "blkiomon: could not write to %s\n", out->fn);
+       /* close before freeing: the stream may still refer to the buffer */
+       if (out->fp) {
+               fclose(out->fp);
+               out->fp = NULL;
+       }
+       free(out->buf);
+       out->buf = NULL;
+       out->fn = NULL;
+       return 1;
+}
+
+/*
+ * Attach to the message queue named on the command line, retrying
+ * until the queue can be opened or a termination signal arrives.
+ * Returns 0 on success or when no queue was requested, non-zero
+ * otherwise (1 for unusable parameters, -1 if interrupted).
+ */
+static int blkiomon_open_msg_q(void)
+{
+       /* pause between attempts so that waiting does not burn a CPU */
+       const struct timespec retry_delay = {
+               .tv_sec = 0,
+               .tv_nsec = 100 * 1000 * 1000
+       };
+       key_t key;
+
+       if (!msg_q_name)
+               return 0;
+       /*
+        * NOTE(review): msg_q_id defaults to -1, so an omitted -q is not
+        * caught by this test - confirm whether that is intended
+        */
+       if (!msg_q_id || msg_id <= 0) {
+               fprintf(stderr, "blkiomon: invalid message queue id "
+                       "or message id\n");
+               return 1;
+       }
+       key = ftok(msg_q_name, msg_q_id);
+       if (key == -1) {
+               perror("blkiomon: could not create message queue key");
+               return 1;
+       }
+       while (up) {
+               msg_q = msgget(key, S_IRWXU);
+               if (msg_q >= 0)
+                       break;
+               nanosleep(&retry_delay, NULL);
+       }
+       return (msg_q >= 0 ? 0 : -1);
+}
+
+/*
+ * Write the traces that never found their counterpart, followed by
+ * the debug counters, to the debug output. Does nothing unless a debug
+ * file was requested.
+ *
+ * Unmatched traces are parked in thash[] by blkiomon_store_trace(), so
+ * the hash chains are walked here; nothing ever inserts into
+ * trace_tree.
+ */
+static void blkiomon_debug(void)
+{
+       struct trace *t;
+       int i;
+
+       if (!debug.fn)
+               return;
+
+       for (i = 0; i < TRACE_HASH_SIZE; i++) {
+               for (t = thash[i]; t; t = t->next) {
+                       dump_bit(t, "leftover");
+                       leftover++;
+               }
+       }
+       fprintf(debug.fp, "%ld leftover, %ld match, %ld mismatch, "
+               "%ld driverdata, %ld overall\n",
+               leftover, match, mismatch, driverdata, sequence);
+}
+
+/* short options for getopt_long(); every option but -V takes an argument */
+#define S_OPTS "b:D:h:I:Q:q:m:V"
+
+/*
+ * Usage text. The long option names have to match l_opts[]: the queue
+ * path is registered there as "msg-queue".
+ */
+static char usage_str[] = "\n\nblkiomon " \
+       "-I <interval>       | --interval=<interval>\n" \
+       "[ -h <file>         | --human-readable=<file> ]\n" \
+       "[ -b <file>         | --binary=<file> ]\n" \
+       "[ -D <file>         | --debug=<file> ]\n" \
+       "[ -Q <path name>    | --msg-queue=<path name>]\n" \
+       "[ -q <msg queue id> | --msg-queue-id=<msg queue id>]\n" \
+       "[ -m <msg id>       | --msg-id=<msg id>]\n" \
+       "[ -V                | --version ]\n\n" \
+       "\t-I   Sample interval.\n" \
+       "\t-h   Human-readable output file.\n" \
+       "\t-b   Binary output file.\n" \
+       "\t-D   Output file for debugging data.\n" \
+       "\t-Qqm Output to message queue using given ID for messages.\n" \
+       "\t-V   Print program version.\n\n";
+
+/* long options for getopt_long(); each one maps to its short option */
+static struct option l_opts[] = {
+       { .name = "human-readable", .has_arg = required_argument,
+         .flag = NULL, .val = 'h' },
+       { .name = "binary", .has_arg = required_argument,
+         .flag = NULL, .val = 'b' },
+       { .name = "debug", .has_arg = required_argument,
+         .flag = NULL, .val = 'D' },
+       { .name = "interval", .has_arg = required_argument,
+         .flag = NULL, .val = 'I' },
+       { .name = "msg-queue", .has_arg = required_argument,
+         .flag = NULL, .val = 'Q' },
+       { .name = "msg-queue-id", .has_arg = required_argument,
+         .flag = NULL, .val = 'q' },
+       { .name = "msg-id", .has_arg = required_argument,
+         .flag = NULL, .val = 'm' },
+       { .name = "version", .has_arg = no_argument,
+         .flag = NULL, .val = 'V' },
+       /* terminating entry */
+       { .name = NULL, }
+};
+
+/*
+ * Signal handler: announce the termination and clear the up flag so
+ * that the processing loops stop. Only async-signal-safe calls are
+ * made, hence write() instead of fprintf(), and errno is preserved
+ * for the interrupted code.
+ */
+static void blkiomon_signal(int signal)
+{
+       static const char msg[] = "blkiomon: terminated by signal\n";
+       int saved_errno = errno;
+       ssize_t written;
+
+       (void)signal;
+
+       /* nothing sensible can be done if the message cannot be written */
+       written = write(STDERR_FILENO, msg, sizeof(msg) - 1);
+       (void)written;
+
+       up = 0;
+       errno = saved_errno;
+}
+
+/*
+ * Parse a complete decimal number. Returns 0 and stores the result in
+ * *value on success; returns 1 and leaves *value alone if s is empty,
+ * has trailing characters or is out of range for long.
+ */
+static int blkiomon_parse_long(const char *s, long *value)
+{
+       char *end;
+       long v;
+
+       errno = 0;
+       v = strtol(s, &end, 10);
+       if (end == s || *end != '\0' || errno == ERANGE)
+               return 1;
+       *value = v;
+       return 0;
+}
+
+/*
+ * Entry point: install the signal handlers, parse the command line,
+ * open the requested outputs, start the interval thread and process
+ * trace records from stdin until the input ends. Returns 0 on success
+ * and 1 on usage or setup errors.
+ */
+int main(int argc, char *argv[])
+{
+       int c, rc;
+       long num;
+
+       signal(SIGALRM, blkiomon_signal);
+       signal(SIGINT, blkiomon_signal);
+       signal(SIGTERM, blkiomon_signal);
+       signal(SIGQUIT, blkiomon_signal);
+
+       while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) {
+               switch (c) {
+               case 'h':
+                       human.fn = optarg;
+                       break;
+               case 'b':
+                       binary.fn = optarg;
+                       break;
+               case 'D':
+                       debug.fn = optarg;
+                       break;
+               case 'I':
+                       /* reject garbage and values that do not fit an int */
+                       if (blkiomon_parse_long(optarg, &num) ||
+                           num != (int)num)
+                               goto usage;
+                       interval = (int)num;
+                       break;
+               case 'Q':
+                       msg_q_name = optarg;
+                       break;
+               case 'q':
+                       if (blkiomon_parse_long(optarg, &num) ||
+                           num != (int)num)
+                               goto usage;
+                       msg_q_id = (int)num;
+                       break;
+               case 'm':
+                       /* msg_id is a long, so no narrowing is involved */
+                       if (blkiomon_parse_long(optarg, &msg_id))
+                               goto usage;
+                       break;
+               case 'V':
+                       printf("%s version %s\n", argv[0], blkiomon_version);
+                       return 0;
+               default:
+                       goto usage;
+               }
+       }
+
+       if (interval <= 0)
+               goto usage;
+
+       ifp = fdopen(STDIN_FILENO, "r");
+       if (!ifp) {
+               perror("blkiomon: could not open stdin for reading");
+               return 1;
+       }
+
+       if (blkiomon_open_output(&human))
+               return 1;
+       if (blkiomon_open_output(&binary))
+               return 1;
+       if (blkiomon_open_output(&debug))
+               return 1;
+       if (blkiomon_open_msg_q())
+               return 1;
+
+       /* pthread_create() returns the error number and leaves errno alone */
+       rc = pthread_create(&interval_thread, NULL, blkiomon_interval, NULL);
+       if (rc) {
+               fprintf(stderr, "blkiomon: could not create thread: %s\n",
+                       strerror(rc));
+               return 1;
+       }
+
+       rc = blkiomon_do_fifo();
+
+       blkiomon_debug();
+       return rc;
+
+usage:
+       fprintf(stderr, "Usage: %s", usage_str);
+       return 1;
+}