summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile5
-rw-r--r--blkiomon.c708
-rw-r--r--blkiomon.h105
-rw-r--r--doc/blkiomon.8116
-rw-r--r--stats.h155
5 files changed, 1088 insertions, 1 deletions
diff --git a/Makefile b/Makefile
index c78cf6b..623b84b 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
CC = gcc
CFLAGS = -Wall -O2 -g -W
ALL_CFLAGS = $(CFLAGS) -D_GNU_SOURCE -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64
-PROGS = blkparse blktrace verify_blkparse blkrawverify
+PROGS = blkparse blktrace verify_blkparse blkrawverify blkiomon
LIBS = -lpthread
SCRIPTS = btrace
@@ -34,6 +34,9 @@ verify_blkparse: verify_blkparse.o
blkrawverify: blkrawverify.o
$(CC) $(ALL_CFLAGS) -o $@ $(filter %.o,$^)
+blkiomon: blkiomon.o rbtree.o
+ $(CC) $(ALL_CFLAGS) -o $@ $(filter %.o,$^) $(LIBS) -lrt
+
$(PROGS): | depend
docs:
diff --git a/blkiomon.c b/blkiomon.c
new file mode 100644
index 0000000..3ac4360
--- /dev/null
+++ b/blkiomon.c
@@ -0,0 +1,708 @@
+/*
+ * I/O monitor based on block queue trace data
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Author(s): Martin Peschke <mp3@de.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+#include <getopt.h>
+#include <errno.h>
+#include <locale.h>
+#include <libgen.h>
+#include <sys/msg.h>
+#include <pthread.h>
+#include <time.h>
+
+#include "blktrace.h"
+#include "rbtree.h"
+#include "jhash.h"
+#include "blkiomon.h"
+
+/*
+ * One buffered block I/O trace record plus the bookkeeping needed to keep it
+ * around until its counterpart (issue or completion trace) shows up.
+ */
+struct trace {
+ /* trace record as read from the input stream */
+ struct blk_io_trace bit;
+ /* rb-tree linkage; only referenced via trace_tree in blkiomon_debug() */
+ struct rb_node node;
+ /* links entries within a thash[] bucket or on the vacant traces list */
+ struct trace *next;
+ /* arrival order, taken from the global sequence counter */
+ long sequence;
+};
+
+/*
+ * Insertion point reported by blkiomon_find_dstat() when a lookup misses;
+ * both members are handed to rb_link_node() by blkiomon_get_dstat().
+ */
+struct rb_search {
+ /* address of the child link where the new node has to be hooked in */
+ struct rb_node **node_ptr;
+ /* node owning that link, or NULL if the tree is empty */
+ struct rb_node *parent;
+};
+
+/*
+ * Message layout passed to msgsnd(): a message type followed by the
+ * statistics payload.
+ */
+struct dstat_msg {
+ /* message type; set to msg_id right before sending */
+ long mtype;
+ /* payload; its size is what blkiomon_output_msg_q() passes as msgsz */
+ struct blkiomon_stat stat;
+};
+
+/*
+ * Per-device statistics for one sample interval, reachable both through an
+ * rb-tree (lookup by device) and a singly linked list (output, recycling).
+ */
+struct dstat {
+ /* statistics wrapped in a message so they can be sent without copying */
+ struct dstat_msg msg;
+ /* linkage in dstat_tree[] */
+ struct rb_node node;
+ /* linkage in dstat_list[] or in vacant_dstats_list */
+ struct dstat *next;
+};
+
+/* State of one output channel (human-readable, binary or debug). */
+struct output {
+ /* file name from the command line; NULL means the channel is disabled */
+ char *fn;
+ /* stream opened by blkiomon_open_output() */
+ FILE *fp;
+ /* buffer handed to setvbuf() for this stream */
+ char *buf;
+ /* 1 if the channel writes to stdout ("-"), 0 for a regular file */
+ int pipe;
+};
+
+/* program version reported by -V */
+static char blkiomon_version[] = "0.2";
+
+/* input stream carrying binary trace data (stdin) */
+static FILE *ifp;
+/* sample interval in seconds; has to be set to a positive value via -I */
+static int interval = -1;
+
+/* free list of recycled trace buffers and its current length */
+static struct trace *vacant_traces_list = NULL;
+static int vacant_traces = 0;
+static struct rb_root trace_tree = RB_ROOT;
+
+/* traces still waiting for their counterpart, hashed by sector */
+#define TRACE_HASH_SIZE 128
+struct trace *thash[TRACE_HASH_SIZE] = {};
+
+/*
+ * Device statistics are double-buffered: the data gatherer fills the
+ * tree/list selected by dstat_curr while the interval thread outputs the
+ * other one. Released records are recycled through vacant_dstats_list.
+ */
+static struct dstat *vacant_dstats_list = NULL;
+static struct rb_root dstat_tree[2] = { RB_ROOT, RB_ROOT };
+static struct dstat *dstat_list[2] = {};
+static int dstat_curr = 0;
+
+/* output channels selected by -h, -b and -D */
+static struct output human, binary, debug;
+
+/* message queue parameters (-Q, -q, -m) and the queue handle from msgget() */
+static char *msg_q_name = NULL;
+static int msg_q_id = -1, msg_q = -1;
+static long msg_id = -1;
+
+static pthread_t interval_thread;
+/* protects dstat_curr, the dstat trees/lists and vacant_dstats_list */
+static pthread_mutex_t dstat_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+int data_is_native = -1;
+
+/*
+ * Cleared by the signal handler to stop the main loops. It is written
+ * asynchronously, hence volatile sig_atomic_t rather than plain int.
+ */
+static volatile sig_atomic_t up = 1;
+
+/* debugging */
+static long leftover = 0, driverdata = 0, match = 0, mismatch = 0, sequence = 0;
+
+/*
+ * Write all fields of a single trace to the debug output.
+ *
+ * @t:     trace to dump
+ * @descr: heading printed in front of the record
+ *
+ * Does nothing unless debug output has been requested.
+ */
+static void dump_bit(struct trace *t, const char *descr)
+{
+	struct blk_io_trace *bit = &t->bit;
+
+	if (!debug.fn)
+		return;
+
+	fprintf(debug.fp, "--- %s ---\n", descr);
+	fprintf(debug.fp, "magic %16d\n", bit->magic);
+	fprintf(debug.fp, "sequence %16d\n", bit->sequence);
+	/* time and sector are 64 bit wide; print them without truncation */
+	fprintf(debug.fp, "time %16llu\n", (unsigned long long)bit->time);
+	fprintf(debug.fp, "sector %16llu\n", (unsigned long long)bit->sector);
+	fprintf(debug.fp, "bytes %16d\n", bit->bytes);
+	fprintf(debug.fp, "action %16x\n", bit->action);
+	fprintf(debug.fp, "pid %16d\n", bit->pid);
+	fprintf(debug.fp, "device %16d\n", bit->device);
+	fprintf(debug.fp, "cpu %16d\n", bit->cpu);
+	fprintf(debug.fp, "error %16d\n", bit->error);
+	fprintf(debug.fp, "pdu_len %16d\n", bit->pdu_len);
+
+	fprintf(debug.fp, "order %16ld\n", t->sequence);
+}
+
+/*
+ * Write the fields of two traces side by side to the debug output.
+ *
+ * @t1:    trace shown in the first column
+ * @t2:    trace shown in the second column
+ * @descr: heading printed in front of the records
+ *
+ * Does nothing unless debug output has been requested.
+ */
+static void dump_bits(struct trace *t1, struct trace *t2, const char *descr)
+{
+	struct blk_io_trace *bit1 = &t1->bit;
+	struct blk_io_trace *bit2 = &t2->bit;
+
+	if (!debug.fn)
+		return;
+
+	fprintf(debug.fp, "--- %s ---\n", descr);
+	fprintf(debug.fp, "magic %16d %16d\n", bit1->magic, bit2->magic);
+	fprintf(debug.fp, "sequence %16d %16d\n",
+		bit1->sequence, bit2->sequence);
+	/* time and sector are 64 bit wide; print them without truncation */
+	fprintf(debug.fp, "time %16llu %16llu\n",
+		(unsigned long long)bit1->time,
+		(unsigned long long)bit2->time);
+	fprintf(debug.fp, "sector %16llu %16llu\n",
+		(unsigned long long)bit1->sector,
+		(unsigned long long)bit2->sector);
+	fprintf(debug.fp, "bytes %16d %16d\n", bit1->bytes, bit2->bytes);
+	fprintf(debug.fp, "action %16x %16x\n", bit1->action, bit2->action);
+	fprintf(debug.fp, "pid %16d %16d\n", bit1->pid, bit2->pid);
+	fprintf(debug.fp, "device %16d %16d\n", bit1->device, bit2->device);
+	fprintf(debug.fp, "cpu %16d %16d\n", bit1->cpu, bit2->cpu);
+	fprintf(debug.fp, "error %16d %16d\n", bit1->error, bit2->error);
+	fprintf(debug.fp, "pdu_len %16d %16d\n", bit1->pdu_len, bit2->pdu_len);
+
+	fprintf(debug.fp, "order %16ld %16ld\n", t1->sequence, t2->sequence);
+}
+
+/*
+ * Hand out a zeroed device statistic, preferring a recycled one from
+ * vacant_dstats_list over a fresh heap allocation.
+ *
+ * Returns NULL (after reporting the error) if memory is exhausted.
+ */
+static struct dstat *blkiomon_alloc_dstat(void)
+{
+	struct dstat *fresh = vacant_dstats_list;
+
+	if (fresh) {
+		vacant_dstats_list = fresh->next;
+	} else {
+		fresh = malloc(sizeof(*fresh));
+		if (!fresh) {
+			perror("blkiomon: could not allocate device statistic");
+			return NULL;
+		}
+	}
+
+	memset(fresh, 0, sizeof(*fresh));
+	return fresh;
+}
+
+/*
+ * Look up the statistic for @device in the tree currently being filled.
+ *
+ * Returns the matching entry, or NULL if there is none. Only in the latter
+ * case @search is filled in with the position at which a new node for
+ * @device has to be linked.
+ */
+static struct dstat *blkiomon_find_dstat(struct rb_search *search, __u32 device)
+{
+	struct rb_node **link = &(dstat_tree[dstat_curr].rb_node);
+	struct rb_node *above = NULL;
+
+	while (*link) {
+		struct dstat *cur;
+
+		above = *link;
+		cur = rb_entry(above, struct dstat, node);
+
+		if (cur->msg.stat.device == device)
+			return cur;
+
+		/* entries smaller than the key send the walk to the left */
+		if (cur->msg.stat.device < device)
+			link = &above->rb_left;
+		else
+			link = &above->rb_right;
+	}
+
+	search->parent = above;
+	search->node_ptr = link;
+	return NULL;
+}
+
+/*
+ * Return the statistic for @device in the current interval, creating and
+ * registering it (tree and list) on first use.
+ *
+ * Lookup and insertion run under dstat_mutex; the lock is released before
+ * returning. Returns NULL if a new statistic could not be allocated.
+ */
+static struct dstat *blkiomon_get_dstat(__u32 device)
+{
+	struct rb_search where;
+	struct dstat *entry;
+
+	pthread_mutex_lock(&dstat_mutex);
+
+	entry = blkiomon_find_dstat(&where, device);
+	if (!entry) {
+		entry = blkiomon_alloc_dstat();
+		if (entry) {
+			entry->msg.stat.device = device;
+			/* start minima at the largest value so any sample wins */
+			entry->msg.stat.size_mm.min = -1ULL;
+			entry->msg.stat.d2c_mm.min = -1ULL;
+
+			rb_link_node(&entry->node, where.parent,
+				     where.node_ptr);
+			rb_insert_color(&entry->node, &dstat_tree[dstat_curr]);
+
+			entry->next = dstat_list[dstat_curr];
+			dstat_list[dstat_curr] = entry;
+		}
+	}
+
+	pthread_mutex_unlock(&dstat_mutex);
+	return entry;
+}
+
+/*
+ * Send the statistics of @dstat to the message queue, tagged with msg_id.
+ *
+ * Returns 0 if no message queue was requested, otherwise the result of
+ * msgsnd().
+ */
+static int blkiomon_output_msg_q(struct dstat *dstat)
+{
+	if (msg_q_name) {
+		dstat->msg.mtype = msg_id;
+		/* the size passed to msgsnd() excludes the leading mtype */
+		return msgsnd(msg_q, &dstat->msg,
+			      sizeof(struct blkiomon_stat), 0);
+	}
+	return 0;
+}
+
+/*
+ * Append the statistics of @dstat to the binary output.
+ *
+ * The stream is flushed after each record when writing to stdout. On a
+ * write error the channel is closed and disabled for the rest of the run.
+ *
+ * Returns 0 on success or if binary output is disabled, 1 on failure.
+ */
+static int blkiomon_output_binary(struct dstat *dstat)
+{
+	struct blkiomon_stat *record = &dstat->msg.stat;
+	int ok;
+
+	if (!binary.fn)
+		return 0;
+
+	ok = (fwrite(record, sizeof(*record), 1, binary.fp) == 1);
+	if (ok && binary.pipe)
+		ok = !fflush(binary.fp);
+	if (ok)
+		return 0;
+
+	fprintf(stderr, "blkiomon: could not write to %s\n", binary.fn);
+	fclose(binary.fp);
+	binary.fn = NULL;
+	return 1;
+}
+
+/*
+ * Emit every statistic on the list starting at @head through all output
+ * channels, stamping each with the seconds part of @ts.
+ *
+ * The human-readable form is printed first; the record is then converted
+ * to big endian in place for the binary and message queue outputs.
+ *
+ * Returns the last element of the list, or NULL for an empty list.
+ */
+static struct dstat *blkiomon_output(struct dstat *head, struct timespec *ts)
+{
+	struct dstat *cur = head;
+	struct dstat *last = NULL;
+
+	while (cur) {
+		struct blkiomon_stat *stat = &cur->msg.stat;
+
+		stat->time = ts->tv_sec;
+		blkiomon_stat_print(human.fp, stat);
+		blkiomon_stat_to_be(stat);
+		blkiomon_output_binary(cur);
+		blkiomon_output_msg_q(cur);
+
+		last = cur;
+		cur = cur->next;
+	}
+	return last;
+}
+
+/*
+ * Thread function: once per sample interval, take over the statistics
+ * gathered so far, write them out and recycle the records.
+ *
+ * @data: unused; returned as is (the loop never terminates on its own)
+ */
+static void *blkiomon_interval(void *data)
+{
+	struct timespec wake;
+	struct dstat *head, *tail;
+	int finished, rc;
+
+	clock_gettime(CLOCK_REALTIME, &wake);
+
+	while (1) {
+		wake.tv_sec += interval;
+
+		/*
+		 * clock_nanosleep() returns the error number instead of
+		 * setting errno. The deadline is absolute, so after a signal
+		 * we just go back to sleep for the same wake-up time rather
+		 * than skipping a whole interval.
+		 */
+		do {
+			rc = clock_nanosleep(CLOCK_REALTIME, TIMER_ABSTIME,
+					     &wake, NULL);
+		} while (rc == EINTR);
+		if (rc) {
+			fprintf(stderr, "blkiomon: interrupted sleep: %s\n",
+				strerror(rc));
+			continue;
+		}
+
+		/* grab tree and make data gatherer build up another tree */
+		pthread_mutex_lock(&dstat_mutex);
+		finished = dstat_curr;
+		dstat_curr = dstat_curr ? 0 : 1;
+		pthread_mutex_unlock(&dstat_mutex);
+
+		head = dstat_list[finished];
+		if (!head)
+			continue;
+		dstat_list[finished] = NULL;
+		dstat_tree[finished] = RB_ROOT;
+		tail = blkiomon_output(head, &wake);
+
+		/* give the whole chain of records back for reuse */
+		pthread_mutex_lock(&dstat_mutex);
+		tail->next = vacant_dstats_list;
+		vacant_dstats_list = head;
+		pthread_mutex_unlock(&dstat_mutex);
+	}
+	return data;
+}
+
+/* extract the read/write category bits from a trace action word */
+#define BLK_DATADIR(a) (((a) >> BLK_TC_SHIFT) & (BLK_TC_READ | BLK_TC_WRITE))
+
+/*
+ * Account one completed request in the statistics of its device.
+ *
+ * @bit_d: issue (D) trace of the request
+ * @bit_c: completion (C) trace of the request
+ *
+ * Returns 0 on success, 1 if no statistic could be obtained.
+ *
+ * NOTE(review): the record is updated after blkiomon_get_dstat() has
+ * dropped dstat_mutex - looks like this can overlap with the interval
+ * thread outputting the same record; confirm.
+ */
+static int blkiomon_account(struct blk_io_trace *bit_d,
+			    struct blk_io_trace *bit_c)
+{
+	__u32 bytes = bit_d->bytes;
+	__u64 latency = (bit_c->time - bit_d->time) / 1000; /* ns -> us */
+	struct dstat *entry = blkiomon_get_dstat(bit_d->device);
+	struct blkiomon_stat *stat;
+
+	if (!entry)
+		return 1;
+	stat = &entry->msg.stat;
+
+	/* requests flagged neither read nor write are counted as bidir */
+	if (BLK_DATADIR(bit_c->action) & BLK_TC_READ)
+		stat->read++;
+	else if (BLK_DATADIR(bit_c->action) & BLK_TC_WRITE)
+		stat->write++;
+	else
+		stat->bidir++;
+
+	histlog2_account(stat->size_hist, bytes, &size_hist);
+	histlog2_account(stat->d2c_hist, latency, &d2c_hist);
+	minmax_account(&stat->size_mm, bytes);
+	minmax_account(&stat->d2c_mm, latency);
+	return 0;
+}
+
+/*
+ * Hand out a zeroed trace buffer, reusing one from the vacant list if
+ * possible.
+ *
+ * Returns NULL if no buffer is vacant and the allocation fails.
+ */
+static struct trace *blkiomon_alloc_trace(void)
+{
+	struct trace *t = vacant_traces_list;
+
+	if (t) {
+		vacant_traces_list = t->next;
+		vacant_traces--;
+	} else {
+		t = malloc(sizeof(*t));
+		/* callers check for NULL; do not memset() a failed allocation */
+		if (!t)
+			return NULL;
+	}
+	memset(t, 0, sizeof(*t));
+	return t;
+}
+
+/*
+ * Release a trace buffer. Up to 256 buffers are parked on the vacant list
+ * for reuse; anything beyond that goes back to the heap.
+ *
+ * Passing NULL is a no-op, like free().
+ */
+static void blkiomon_free_trace(struct trace *t)
+{
+	if (!t)
+		return;
+
+	if (vacant_traces < 256) {
+		t->next = vacant_traces_list;
+		vacant_traces_list = t;
+		vacant_traces++;
+	} else
+		free(t);
+}
+
+/*
+ * Reduce an action word to the category bits (read, write, fs, pc) that
+ * two traces of the same request are required to share.
+ */
+static int action(int a)
+{
+	return a & BLK_TC_ACT(BLK_TC_WRITE | BLK_TC_READ |
+			      BLK_TC_FS | BLK_TC_PC);
+}
+
+/* Put @t at the head of the hash bucket selected by its sector. */
+static void blkiomon_store_trace(struct trace *t)
+{
+	int bucket = t->bit.sector % TRACE_HASH_SIZE;
+	struct trace **slot = &thash[bucket];
+
+	t->next = *slot;
+	*slot = t;
+}
+
+/*
+ * Find and unlink the stored trace that belongs to the same request as
+ * @bit, i.e. has the same device, sector and action category.
+ *
+ * Returns the trace removed from the hash, or NULL if nothing matches.
+ */
+static struct trace *blkiomon_fetch_trace(struct blk_io_trace *bit)
+{
+	int bucket = bit->sector % TRACE_HASH_SIZE;
+	struct trace **link;
+
+	for (link = &thash[bucket]; *link; link = &(*link)->next) {
+		struct trace *cand = *link;
+
+		if (cand->bit.device != bit->device)
+			continue;
+		if (cand->bit.sector != bit->sector)
+			continue;
+		if (action(cand->bit.action) != action(bit->action))
+			continue;
+
+		/* splice the match out of the chain */
+		*link = cand->next;
+		return cand;
+	}
+	return NULL;
+}
+
+/*
+ * Process one incoming D or C trace.
+ *
+ * Without a stored counterpart the trace is kept and a fresh buffer is
+ * returned. With a counterpart, an older D plus a younger C trace are
+ * accounted as one completed request; any other combination keeps only
+ * the more recent trace.
+ *
+ * Returns the buffer the caller should read the next trace into, which
+ * may be NULL if a new buffer could not be allocated.
+ */
+static struct trace *blkiomon_do_trace(struct trace *t)
+{
+	struct trace *stored, *older, *younger;
+
+	stored = blkiomon_fetch_trace(&t->bit);
+	if (!stored) {
+		/* nothing to pair with yet - remember this trace */
+		blkiomon_store_trace(t);
+		return blkiomon_alloc_trace();
+	}
+
+	/* sort the pair by timestamp */
+	if (stored->bit.time < t->bit.time) {
+		older = stored;
+		younger = t;
+	} else {
+		older = t;
+		younger = stored;
+	}
+
+	if (!(older->bit.action & BLK_TC_ACT(BLK_TC_ISSUE)) ||
+	    !(younger->bit.action & BLK_TC_ACT(BLK_TC_COMPLETE))) {
+		/* not an issue/completion pair - keep more recent trace */
+		dump_bits(older, younger, "mismatch");
+		mismatch++;
+		blkiomon_store_trace(younger);
+		return older;
+	}
+
+	/* older D trace and younger C trace - update statistics */
+	match++;
+	blkiomon_account(&older->bit, &younger->bit);
+	blkiomon_free_trace(stored);
+	return t;
+}
+
+/*
+ * Main loop of the data gatherer: read traces from the input stream until
+ * end of input, an error or a termination signal, and feed issue and
+ * completion traces into the matching logic.
+ *
+ * Returns 1 if not even the first trace buffer could be allocated,
+ * 0 otherwise.
+ */
+static int blkiomon_do_fifo(void)
+{
+	struct trace *t;
+	struct blk_io_trace *bit;
+	void *pdu_buf = NULL;
+
+	t = blkiomon_alloc_trace();
+	if (!t)
+		return 1;
+	bit = &t->bit;
+
+	while (up) {
+		if (fread(bit, sizeof(*bit), 1, ifp) != 1) {
+			if (!feof(ifp))
+				fprintf(stderr,
+					"blkiomon: could not read trace\n");
+			break;
+		}
+		if (ferror(ifp)) {
+			clearerr(ifp);
+			perror("blkiomon: error while reading trace");
+			break;
+		}
+
+		if (data_is_native == -1 && check_data_endianness(bit->magic))
+			break;
+
+		/* endianess */
+		trace_to_cpu(bit);
+		if (verify_trace(bit)) {
+			perror("blkiomon: bad trace");
+			break;
+		}
+
+		/* read additional trace payload */
+		if (bit->pdu_len) {
+			/* keep the old buffer valid if realloc() fails */
+			void *grown = realloc(pdu_buf, bit->pdu_len);
+
+			if (!grown) {
+				perror("blkiomon: could not allocate payload");
+				break;
+			}
+			pdu_buf = grown;
+			if (fread(pdu_buf, bit->pdu_len, 1, ifp) != 1) {
+				clearerr(ifp);
+				perror("blkiomon: could not read payload");
+				break;
+			}
+		}
+
+		t->sequence = sequence++;
+
+		if (!(bit->action & BLK_TC_ACT(BLK_TC_ISSUE | BLK_TC_COMPLETE)))
+			continue;
+
+		/* try to find matching trace and update statistics */
+		t = blkiomon_do_trace(t);
+		if (!t)
+			break;
+		bit = &t->bit;
+		/* t and bit will be recycled for next incoming trace */
+	}
+
+	/* t is NULL here if blkiomon_do_trace() ran out of memory */
+	if (t)
+		blkiomon_free_trace(t);
+	free(pdu_buf);
+	return 0;
+}
+
+/*
+ * Open the stream of an output channel and install its buffer.
+ *
+ * "-" selects stdout (line buffered), anything else is opened as a file
+ * (fully buffered). A channel without file name is left alone.
+ *
+ * Returns 0 on success or if the channel is unused, 1 on failure; in the
+ * failure case the channel is disabled and its resources are released.
+ */
+static int blkiomon_open_output(struct output *out)
+{
+	int mode, vbuf_size;
+
+	if (!out->fn)
+		return 0;
+
+	if (!strcmp(out->fn, "-")) {
+		out->fp = fdopen(STDOUT_FILENO, "w");
+		mode = _IOLBF;
+		vbuf_size = 4096;
+		out->pipe = 1;
+	} else {
+		out->fp = fopen(out->fn, "w");
+		mode = _IOFBF;
+		vbuf_size = 128 * 1024;
+		out->pipe = 0;
+	}
+	if (!out->fp)
+		goto failed;
+	/* a NULL buffer makes setvbuf() allocate one on its own */
+	out->buf = malloc(128 * 1024);
+	if (setvbuf(out->fp, out->buf, mode, vbuf_size))
+		goto failed;
+	return 0;
+
+failed:
+	fprintf(stderr, "blkiomon: could not write to %s\n", out->fn);
+	/* close the stream before releasing the buffer it may refer to */
+	if (out->fp) {
+		fclose(out->fp);
+		out->fp = NULL;
+	}
+	free(out->buf);
+	out->buf = NULL;
+	out->fn = NULL;
+	return 1;
+}
+
+/*
+ * Attach to the message queue described by -Q, -q and -m.
+ *
+ * The queue key is derived with ftok() from the path name and the queue
+ * id. Attaching is retried until it succeeds or the program is asked to
+ * terminate.
+ *
+ * Returns 0 on success or if no message queue was requested, non-zero
+ * otherwise.
+ */
+static int blkiomon_open_msg_q(void)
+{
+	key_t key;
+
+	if (!msg_q_name)
+		return 0;
+	if (!msg_q_id || msg_id <= 0)
+		return 1;
+	key = ftok(msg_q_name, msg_q_id);
+	if (key == -1)
+		return 1;
+	while (up) {
+		msg_q = msgget(key, S_IRWXU);
+		if (msg_q >= 0)
+			break;
+		/* back off between attempts instead of spinning on the CPU */
+		sleep(1);
+	}
+	return (msg_q >= 0 ? 0 : -1);
+}
+
+/*
+ * Write the final debug report: every trace that never found its
+ * counterpart, followed by the overall counters.
+ *
+ * Does nothing unless debug output has been requested.
+ */
+static void blkiomon_debug(void)
+{
+	struct trace *t;
+	int i;
+
+	if (!debug.fn)
+		return;
+
+	/*
+	 * Unmatched traces are kept in the thash[] buckets (see
+	 * blkiomon_store_trace()); nothing is ever inserted into trace_tree.
+	 */
+	for (i = 0; i < TRACE_HASH_SIZE; i++) {
+		for (t = thash[i]; t; t = t->next) {
+			dump_bit(t, "leftover");
+			leftover++;
+		}
+	}
+
+	fprintf(debug.fp, "%ld leftover, %ld match, %ld mismatch, "
+		"%ld driverdata, %ld overall\n",
+		leftover, match, mismatch, driverdata, sequence);
+}
+
+/*
+ * Short options accepted by getopt_long(); every option except -V takes
+ * an argument.
+ */
+#define S_OPTS "b:D:h:I:Q:q:m:V"
+
+/* usage text printed to stderr for invalid or incomplete command lines */
+static char usage_str[] = "\n\nblkiomon " \
+ "-I <interval> | --interval=<interval>\n" \
+ "[ -h <file> | --human-readable=<file> ]\n" \
+ "[ -b <file> | --binary=<file> ]\n" \
+ "[ -D <file> | --debug=<file> ]\n" \
+ "[ -Q <path name> | --msg-queue-name=<path name>]\n" \
+ "[ -q <msg queue id> | --msg-queue-id=<msg queue id>]\n" \
+ "[ -m <msg id> | --msg-id=<msg id>]\n" \
+ "[ -V | --version ]\n\n" \
+ "\t-I Sample interval.\n" \
+ "\t-h Human-readable output file.\n" \
+ "\t-b Binary output file.\n" \
+ "\t-D Output file for debugging data.\n" \
+ "\t-Qqm Output to message queue using given ID for messages.\n" \
+ "\t-V Print program version.\n\n";
+
+/*
+ * Long options accepted by getopt_long(); each one maps to the short
+ * option of the same meaning.
+ */
+static struct option l_opts[] = {
+	{
+		.name = "human-readable",
+		.has_arg = required_argument,
+		.flag = NULL,
+		.val = 'h'
+	},
+	{
+		.name = "binary",
+		.has_arg = required_argument,
+		.flag = NULL,
+		.val = 'b'
+	},
+	{
+		.name = "debug",
+		.has_arg = required_argument,
+		.flag = NULL,
+		.val = 'D'
+	},
+	{
+		.name = "interval",
+		.has_arg = required_argument,
+		.flag = NULL,
+		.val = 'I'
+	},
+	{
+		/* spelling announced by the usage text and the man page */
+		.name = "msg-queue-name",
+		.has_arg = required_argument,
+		.flag = NULL,
+		.val = 'Q'
+	},
+	{
+		/* original spelling, kept so existing invocations still work */
+		.name = "msg-queue",
+		.has_arg = required_argument,
+		.flag = NULL,
+		.val = 'Q'
+	},
+	{
+		.name = "msg-queue-id",
+		.has_arg = required_argument,
+		.flag = NULL,
+		.val = 'q'
+	},
+	{
+		.name = "msg-id",
+		.has_arg = required_argument,
+		.flag = NULL,
+		.val = 'm'
+	},
+	{
+		.name = "version",
+		.has_arg = no_argument,
+		.flag = NULL,
+		.val = 'V'
+	},
+	{
+		/* terminating entry */
+		.name = NULL,
+	}
+};
+
+/*
+ * Signal handler: announce the termination and ask the main loops to stop
+ * by clearing the global "up" flag.
+ *
+ * Only async-signal-safe calls are allowed in here, so the message is
+ * emitted with write() rather than through stdio.
+ */
+static void blkiomon_signal(int signal)
+{
+	static const char msg[] = "blkiomon: terminated by signal\n";
+	int saved_errno = errno;
+	ssize_t rc;
+
+	(void)signal;
+
+	/* nothing sensible can be done if this write fails */
+	rc = write(STDERR_FILENO, msg, sizeof(msg) - 1);
+	(void)rc;
+
+	up = 0;
+	errno = saved_errno;
+}
+
+/*
+ * Program entry point: parse the command line, open the requested
+ * outputs, start the interval thread and process traces from stdin.
+ *
+ * Returns 0 on success (also for -V), 1 on usage or setup errors.
+ */
+int main(int argc, char *argv[])
+{
+	int c, rc;
+
+	signal(SIGALRM, blkiomon_signal);
+	signal(SIGINT, blkiomon_signal);
+	signal(SIGTERM, blkiomon_signal);
+	signal(SIGQUIT, blkiomon_signal);
+
+	while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) {
+		switch (c) {
+		case 'h':
+			human.fn = optarg;
+			break;
+		case 'b':
+			binary.fn = optarg;
+			break;
+		case 'D':
+			debug.fn = optarg;
+			break;
+		case 'I':
+			interval = atoi(optarg);
+			break;
+		case 'Q':
+			msg_q_name = optarg;
+			break;
+		case 'q':
+			msg_q_id = atoi(optarg);
+			break;
+		case 'm':
+			msg_id = atoi(optarg);
+			break;
+		case 'V':
+			printf("%s version %s\n", argv[0], blkiomon_version);
+			return 0;
+		default:
+			fprintf(stderr, "Usage: %s", usage_str);
+			return 1;
+		}
+	}
+
+	/* the sample interval is mandatory and has to be positive */
+	if (interval <= 0) {
+		fprintf(stderr, "Usage: %s", usage_str);
+		return 1;
+	}
+
+	ifp = fdopen(STDIN_FILENO, "r");
+	if (!ifp) {
+		perror("blkiomon: could not open stdin for reading");
+		return 1;
+	}
+
+	if (blkiomon_open_output(&human))
+		return 1;
+	if (blkiomon_open_output(&binary))
+		return 1;
+	if (blkiomon_open_output(&debug))
+		return 1;
+	if (blkiomon_open_msg_q())
+		return 1;
+
+	/* pthread_create() returns the error number and leaves errno alone */
+	rc = pthread_create(&interval_thread, NULL, blkiomon_interval, NULL);
+	if (rc) {
+		fprintf(stderr, "blkiomon: could not create thread: %s\n",
+			strerror(rc));
+		return 1;
+	}
+
+	blkiomon_do_fifo();
+
+	blkiomon_debug();
+	return 0;
+}
diff --git a/blkiomon.h b/blkiomon.h
new file mode 100644
index 0000000..2b1f7b2
--- /dev/null
+++ b/blkiomon.h
@@ -0,0 +1,105 @@
+/*
+ * I/O monitor based on block queue trace data
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Author(s): Martin Peschke <mp3@de.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef BLKIOMON_H
+#define BLKIOMON_H
+
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+#include "stats.h"
+#include "blktrace.h"
+
+/* number of buckets in the request size and latency histograms */
+#define BLKIOMON_SIZE_BUCKETS 16
+#define BLKIOMON_D2C_BUCKETS 25
+/*
+ * Statistics gathered for one device during one sample interval. This is
+ * also the record written to the binary output and sent through the
+ * message queue, after conversion by blkiomon_stat_to_be().
+ * NOTE(review): the struct is written out as is, so any compiler padding
+ * becomes part of the record - confirm readers use the same layout.
+ */
+struct blkiomon_stat {
+ /* end of the sample interval, in seconds */
+ __u64 time;
+ /* histogram of request sizes */
+ __u32 size_hist[BLKIOMON_SIZE_BUCKETS];
+ /* histogram of issue-to-completion (D2C) latencies */
+ __u32 d2c_hist[BLKIOMON_D2C_BUCKETS];
+ /* min/max/sum data of request sizes */
+ struct minmax size_mm;
+ /* min/max/sum data of D2C latencies */
+ struct minmax d2c_mm;
+ /* number of read requests */
+ __u64 read;
+ /* number of write requests */
+ __u64 write;
+ /* number of requests flagged neither read nor write */
+ __u64 bidir;
+ /* device number, printed as MAJOR,MINOR */
+ __u32 device;
+};
+
+/*
+ * Bucket layout of the request size histogram; the print routine labels
+ * it "in kB" while values are accounted in bytes, hence a delta of 1024.
+ */
+static struct histlog2 size_hist = {
+ .first = 0,
+ .delta = 1024,
+ .num = BLKIOMON_SIZE_BUCKETS
+};
+
+/* Bucket layout of the D2C latency histogram (values in microseconds). */
+static struct histlog2 d2c_hist = {
+ .first = 0,
+ .delta = 8,
+ .num = BLKIOMON_D2C_BUCKETS
+};
+
+/*
+ * Bring @bstat into the empty state: all bytes zero, then both min/max
+ * records initialised so that the first sample sets their minimum.
+ */
+static inline void blkiomon_stat_init(struct blkiomon_stat *bstat)
+{
+	struct minmax *sizes = &bstat->size_mm;
+	struct minmax *latencies = &bstat->d2c_mm;
+
+	memset(bstat, 0, sizeof(struct blkiomon_stat));
+	minmax_init(latencies);
+	minmax_init(sizes);
+}
+
+/*
+ * Convert all members of @bstat from CPU byte order to big endian, in
+ * place.
+ */
+static inline void blkiomon_stat_to_be(struct blkiomon_stat *bstat)
+{
+	/* scalar members */
+	bstat->time = cpu_to_be64(bstat->time);
+	bstat->device = cpu_to_be32(bstat->device);
+	bstat->read = cpu_to_be64(bstat->read);
+	bstat->write = cpu_to_be64(bstat->write);
+	bstat->bidir = cpu_to_be64(bstat->bidir);
+
+	/* aggregates */
+	minmax_to_be(&bstat->size_mm);
+	minmax_to_be(&bstat->d2c_mm);
+	histlog2_to_be(bstat->size_hist, &size_hist);
+	histlog2_to_be(bstat->d2c_hist, &d2c_hist);
+}
+
+/*
+ * Add the statistics in @src to @dst. The time and device members of
+ * @dst are left untouched.
+ */
+static inline void blkiomon_stat_merge(struct blkiomon_stat *dst,
+				       struct blkiomon_stat *src)
+{
+	/* request counters */
+	dst->bidir += src->bidir;
+	dst->write += src->write;
+	dst->read += src->read;
+
+	/* min/max records */
+	minmax_merge(&dst->d2c_mm, &src->d2c_mm);
+	minmax_merge(&dst->size_mm, &src->size_mm);
+
+	/* histograms */
+	histlog2_merge(&d2c_hist, dst->d2c_hist, src->d2c_hist);
+	histlog2_merge(&size_hist, dst->size_hist, src->size_hist);
+}
+
+/*
+ * Print the statistics in @p to @fp in human-readable form.
+ *
+ * @fp: output stream; nothing is printed if it is NULL
+ * @p:  statistics in CPU byte order
+ */
+static inline void blkiomon_stat_print(FILE *fp, struct blkiomon_stat *p)
+{
+	/*
+	 * ctime() wants a time_t; copy the 64 bit value instead of casting
+	 * the pointer, which breaks wherever time_t is not 64 bit wide.
+	 */
+	time_t when = (time_t)p->time;
+
+	if (!fp)
+		return;
+
+	fprintf(fp, "\ntime: %s", ctime(&when));
+	fprintf(fp, "device: %u,%u\n",
+		(unsigned int)MAJOR(p->device), (unsigned int)MINOR(p->device));
+	/* counters are 64 bit wide; print them without truncation */
+	fprintf(fp, "requests: read %llu, write %llu, bidir: %llu\n",
+		(unsigned long long)p->read, (unsigned long long)p->write,
+		(unsigned long long)p->bidir);
+	minmax_print(fp, "sizes", &p->size_mm);
+	minmax_print(fp, "d2c", &p->d2c_mm);
+	histlog2_print(fp, "sizes histogram (in kB)", p->size_hist, &size_hist);
+	histlog2_print(fp, "d2c histogram (in usec)", p->d2c_hist, &d2c_hist);
+}
+
+#endif
diff --git a/doc/blkiomon.8 b/doc/blkiomon.8
new file mode 100644
index 0000000..54ff099
--- /dev/null
+++ b/doc/blkiomon.8
@@ -0,0 +1,116 @@
+.TH BLKIOMON 8 "July 17, 2008" "" ""
+
+
+.SH NAME
+blkiomon \- monitor block device I/O based on blktrace data
+
+
+.SH SYNOPSIS
+.B blkiomon \-I \fIinterval\fR [ \-h \fIfile\fR ] [ \-b \fIfile\fR ]
+[ \-D \fIfile\fR ] [ \-Q \fIpath_name\fR
+\-q \fImsg_queue_id\fR \-m \fImsg_id\fR ] [ \-V ]
+.br
+
+
+.SH DESCRIPTION
+blkiomon is a block device I/O monitor. It periodically generates per device
+request size and request latency statistics from blktrace data. It provides
+histograms as well as data that can be used to calculate min, max, average
+and variance. For this purpose, it consumes D and C traces read from stdin.
+
+There are options for binary output and human-readable output to files and
+stdout. Output to a message queue is supported as well.
+
+There is no need to use blkparse with blkiomon. blkiomon is capable of
+consuming binary output written to stdout by blktrace.
+
+
+.SH OPTIONS
+
+\-I \fIinterval\fR
+.br
+\-\-interval=\fIinterval\fR
+.RS
+Set sample interval (in seconds).
+.RE
+
+\-h \fIfile\fR
+.br
+\-\-human\-readable=\fIfile\fR
+.RS
+Human-readable output file. Use '\-' for stdout.
+.RE
+
+\-b \fIfile\fR
+.br
+\-\-binary=\fIfile\fR
+.RS
+Binary output file. Use '\-' for stdout.
+.RE
+
+\-D \fIfile\fR
+.br
+\-\-debug=\fIfile\fR
+.RS
+Output file for debugging data. Use '\-' for stdout.
+.RE
+
+\-Q \fIpath_name\fR
+.br
+\-\-msg\-queue\-name=\fIpath_name\fR
+.RS
+Sets \fIpath_name\fR as path name for existing message queue to be used
+for binary output.
+.RE
+
+\-q \fImsg_queue_id\fR
+.br
+\-\-msg\-queue\-id=\fImsg_queue_id\fR
+.RS
+Sets \fImsg_queue_id\fR as the project ID that is combined with
+\fIpath_name\fR (see ftok(3)) to identify an existing message queue to be
+used for binary output.
+.RE
+
+\-m \fImsg_id\fR
+.br
+\-\-msg\-id=\fImsg_id\fR
+.RS
+Sets \fImsg_id\fR as message identifier to be used for binary output
+messages written to an existing message queue.
+.RE
+
+\-V
+.br
+\-\-version
+.RS
+Print program version.
+.RE
+
+
+.SH EXAMPLES
+To get I/O statistics for /dev/sdw every 10 seconds for a period of one hour,
+use the following command:
+
+ % blktrace /dev/sdw -a issue -a complete -w 3600 -o - | blkiomon -I 10 -h -
+
+
+.SH AUTHORS
+blkiomon and this man page were written by Martin Peschke.
+
+
+.SH "REPORTING BUGS"
+Report bugs to <linux\-btrace@vger.kernel.org>
+
+
+.SH COPYRIGHT
+Copyright \(co 2008 IBM Corp.
+.br
+This is free software. You may redistribute copies of it under the terms of
+the GNU General Public License <http://www.gnu.org/licenses/gpl.html>.
+There is NO WARRANTY, to the extent permitted by law.
+
+
+.SH "SEE ALSO"
+btrace (8), blktrace (8), blkparse (1), verify_blkparse (1), blkrawverify (1),
+btt (1)
+
diff --git a/stats.h b/stats.h
new file mode 100644
index 0000000..34f492c
--- /dev/null
+++ b/stats.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright IBM Corp. 2008
+ *
+ * Author(s): Martin Peschke <mp3@de.ibm.com>
+ * Stefan Raspl <stefan.raspl@de.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef STATS_H
+#define STATS_H
+
+#include <linux/types.h>
+#include "endian.h"
+
+/*
+ * Running summary of a series of values, sufficient to derive minimum,
+ * maximum, average and variance.
+ */
+struct minmax {
+ /* smallest value seen; all bits set while no value has been seen */
+ __u64 min;
+ /* largest value seen */
+ __u64 max;
+ /* sum of all values */
+ __u64 sum;
+ /* sum of the squares of all values */
+ __u64 sos;
+ /* number of values */
+ __u64 num;
+};
+
+/*
+ * Reset @mm to the empty state: counters cleared, minimum at the largest
+ * representable value so that the first sample replaces it.
+ */
+static inline void minmax_init(struct minmax *mm)
+{
+	mm->num = 0;
+	mm->sum = 0;
+	mm->sos = 0;
+	mm->max = 0;
+	mm->min = -1ULL;
+}
+
+/* Add one sample @value to the summary @mm. */
+static inline void minmax_account(struct minmax *mm, __u64 value)
+{
+	mm->num++;
+	mm->sum += value;
+	mm->sos += value * value;
+
+	if (mm->max < value)
+		mm->max = value;
+	if (mm->min > value)
+		mm->min = value;
+}
+
+/* Fold the summary @src into @dst; @src is left unchanged. */
+static inline void minmax_merge(struct minmax *dst, struct minmax *src)
+{
+	dst->num += src->num;
+	dst->sum += src->sum;
+	dst->sos += src->sos;
+
+	if (dst->max < src->max)
+		dst->max = src->max;
+	if (dst->min > src->min)
+		dst->min = src->min;
+}
+
+/* Convert every member of @mm from CPU byte order to big endian in place. */
+static inline void minmax_to_be(struct minmax *mm)
+{
+	mm->min = cpu_to_be64(mm->min);
+	mm->max = cpu_to_be64(mm->max);
+	mm->sum = cpu_to_be64(mm->sum);
+	mm->sos = cpu_to_be64(mm->sos);
+	mm->num = cpu_to_be64(mm->num);
+}
+
+/*
+ * Average of the values accounted in @mm. The result is only meaningful
+ * if at least one value has been accounted.
+ */
+static inline double minmax_avg(struct minmax *mm)
+{
+	double count = (double)mm->num;
+
+	return mm->sum / count;
+}
+
+/*
+ * Variance of the values accounted in @mm, computed as
+ * (sos - sum^2 / num) / num. The result is only meaningful if at least
+ * one value has been accounted.
+ */
+static inline double minmax_var(struct minmax *mm)
+{
+	double num = (double)mm->num;
+	/*
+	 * Square the sum in floating point: as a 64 bit integer product it
+	 * wraps around once the sum exceeds 2^32.
+	 */
+	double sum = (double)mm->sum;
+
+	return (((double)mm->sos - ((sum * sum) / num)) / num);
+}
+
+/*
+ * Print the summary @mm as one line to @fp, prefixed with the label @s.
+ *
+ * Returns the result of fprintf().
+ */
+static inline int minmax_print(FILE *fp, const char *s, struct minmax *mm)
+{
+	/* %llu is the standard conversion for unsigned long long arguments */
+	return fprintf(fp, "%s: num %llu, min %llu, max %llu, sum %llu, "
+		       "squ %llu, avg %.1f, var %.1f\n", s,
+		       (unsigned long long)mm->num,
+		       (unsigned long long)mm->min,
+		       (unsigned long long)mm->max,
+		       (unsigned long long)mm->sum,
+		       (unsigned long long)mm->sos,
+		       minmax_avg(mm), minmax_var(mm));
+}
+
+/*
+ * Layout of a histogram whose bucket limits double from one bucket to
+ * the next (see histlog2_upper_limit()).
+ */
+struct histlog2 {
+ /* upper limit of the first bucket */
+ int first;
+ /* width added for the second bucket; doubled for each further one */
+ int delta;
+ /* number of buckets; the last one collects everything beyond the limits */
+ int num;
+};
+
+/*
+ * Upper limit of bucket @index: h->first for bucket 0, and
+ * h->first + h->delta * 2^(index - 1) for all following buckets.
+ */
+static inline __u64 histlog2_upper_limit(int index, struct histlog2 *h)
+{
+	int limit = h->first;
+
+	if (index)
+		limit += h->delta << (index - 1);
+	return limit;
+}
+
+/*
+ * Index of the bucket that @val falls into: the first bucket whose upper
+ * limit is not exceeded, or the last bucket if @val is beyond all limits.
+ */
+static inline int histlog2_index(__u64 val, struct histlog2 *h)
+{
+	int idx = 0;
+
+	while (idx < (h->num - 1) && val > histlog2_upper_limit(idx, h))
+		idx++;
+	return idx;
+}
+
+/*
+ * Count one sample @val in the histogram @bucket laid out as described
+ * by @h.
+ *
+ * @val is 64 bit wide so that large values reach histlog2_index()
+ * untruncated and end up in the overflow bucket.
+ */
+static inline void histlog2_account(__u32 *bucket, __u64 val,
+				    struct histlog2 *h)
+{
+	int index = histlog2_index(val, h);
+
+	bucket[index]++;
+}
+
+/*
+ * Add the bucket counts of histogram @src to @dst; both use layout @h.
+ */
+static inline void histlog2_merge(struct histlog2 *h, __u32 *dst, __u32 *src)
+{
+	int i;
+
+	/* cover all h->num buckets, including the final overflow bucket */
+	for (i = 0; i < h->num; i++)
+		dst[i] += src[i];
+}
+
+/*
+ * Convert the bucket counts of histogram @a (layout @h) from CPU byte
+ * order to big endian, in place.
+ */
+static inline void histlog2_to_be(__u32 a[], struct histlog2 *h)
+{
+	int i;
+
+	/* cover all h->num buckets, including the final overflow bucket */
+	for (i = 0; i < h->num; i++)
+		a[i] = cpu_to_be32(a[i]);
+}
+
+/*
+ * Print histogram @a (layout @h) to @fp under the heading @s, four
+ * buckets per line. Each bucket is shown as "upper limit:count"; the
+ * last bucket is shown as ">limit:count" with the limit of its
+ * predecessor.
+ */
+static inline void histlog2_print(FILE *fp, const char *s, __u32 a[],
+				  struct histlog2 *h)
+{
+	int i;
+
+	fprintf(fp, "%s:\n", s);
+	for (i = 0; i < h->num - 1; i++) {
+		/* limits and counts are unsigned; print them as such */
+		fprintf(fp, " %10lu:%6u",
+			(unsigned long)(histlog2_upper_limit(i, h)), a[i]);
+		if (!((i + 1) % 4))
+			fprintf(fp, "\n");
+	}
+	/* i now refers to the last bucket, which has no upper limit */
+	fprintf(fp, " >%8lu:%6u\n",
+		(unsigned long)(histlog2_upper_limit(i - 1, h)), a[i]);
+}
+
+#endif