btreplay/btrecord man pages
[blktrace.git] / blktrace.c
index dd0d5bfb5dbe617a8ba26de7db7adbc812bbd7fb..4d458acc8e26065e31eaabaf9a47051a968e0760 100644 (file)
@@ -2,6 +2,7 @@
  * block queue tracing application
  *
  * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
+ * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -46,7 +47,7 @@
 #include "blktrace.h"
 #include "barrier.h"
 
-static char blktrace_version[] = "0.99";
+static char blktrace_version[] = "0.99.3";
 
 /*
  * You may want to increase this even more, if you are logging at a high
@@ -57,9 +58,9 @@ static char blktrace_version[] = "0.99";
 
 #define OFILE_BUF      (128 * 1024)
 
-#define RELAYFS_TYPE   0xF0B4A981
+#define DEBUGFS_TYPE   0x64626720
 
-#define S_OPTS "d:a:A:r:o:kw:Vb:n:D:lh:p:s"
+#define S_OPTS "d:a:A:r:o:kw:Vb:n:D:lh:p:sI:"
 static struct option l_opts[] = {
        {
                .name = "dev",
@@ -67,6 +68,12 @@ static struct option l_opts[] = {
                .flag = NULL,
                .val = 'd'
        },
+       {
+               .name = "input-devs",
+               .has_arg = required_argument,
+               .flag = NULL,
+               .val = 'I'
+       },
        {
                .name = "act-mask",
                .has_arg = required_argument,
@@ -146,7 +153,7 @@ static struct option l_opts[] = {
                .val = 'p'
        },
        {
-               .name = "sendfile",
+               .name = "no-sendfile",
                .has_arg = no_argument,
                .flag = NULL,
                .val = 's'
@@ -160,7 +167,6 @@ struct tip_subbuf {
        void *buf;
        unsigned int len;
        unsigned int max_len;
-       off_t offset;
 };
 
 #define FIFO_SIZE      (1024)  /* should be plenty big! */
@@ -192,6 +198,7 @@ struct thread_information {
 
        unsigned long events_processed;
        unsigned long long data_read;
+       unsigned long long data_queued;
        struct device_information *device;
 
        int exited;
@@ -210,6 +217,8 @@ struct thread_information {
        unsigned long fs_off;
        void *fs_buf;
        unsigned long fs_buf_len;
+
+       struct net_connection *nc;
 };
 
 struct device_information {
@@ -219,6 +228,13 @@ struct device_information {
        volatile int trace_started;
        unsigned long drop_count;
        struct thread_information *threads;
+       unsigned long buf_size;
+       unsigned long buf_nr;
+       unsigned int page_size;
+
+       struct cl_host *ch;
+       u32 cl_id;
+       time_t cl_connect_time;
 };
 
 static int ncpus;
@@ -227,7 +243,7 @@ static int ndevs;
 static struct device_information *device_information;
 
 /* command line option globals */
-static char *relay_path;
+static char *debugfs_path;
 static char *output_name;
 static char *output_dir;
 static int act_mask = ~0U;
@@ -252,12 +268,20 @@ static void exit_trace(int status);
 #define dip_tracing(dip)       (*(volatile int *)(&(dip)->trace_started))
 #define dip_set_tracing(dip, v)        ((dip)->trace_started = (v))
 
-#define __for_each_dip(__d, __i, __e)  \
-       for (__i = 0, __d = device_information; __i < __e; __i++, __d++)
+#define __for_each_dip(__d, __di, __e, __i)    \
+       for (__i = 0, __d = __di; __i < __e; __i++, __d++)
 
-#define for_each_dip(__d, __i) __for_each_dip(__d, __i, ndevs)
+#define for_each_dip(__d, __i)         \
+       __for_each_dip(__d, device_information, ndevs, __i)
+#define for_each_nc_dip(__nc, __d, __i)                \
+       __for_each_dip(__d, (__nc)->ch->device_information, (__nc)->ch->ndevs, __i)
+
+#define __for_each_tip(__d, __t, __ncpus, __j) \
+       for (__j = 0, __t = (__d)->threads; __j < __ncpus; __j++, __t++)
 #define for_each_tip(__d, __t, __j)    \
-       for (__j = 0, __t = (__d)->threads; __j < ncpus; __j++, __t++)
+       __for_each_tip(__d, __t, ncpus, __j)
+#define for_each_cl_host(__c)  \
+       for (__c = cl_host_list; __c; __c = __c->list_next)
 
 /*
  * networking stuff follows. we include a magic number so we know whether
@@ -269,6 +293,10 @@ struct blktrace_net_hdr {
        u32 cpu;                /* for which cpu */
        u32 max_cpus;
        u32 len;                /* length of following trace data */
+       u32 cl_id;              /* id for set of client per-cpu connections */
+       u32 buf_size;           /* client buf_size for this trace  */
+       u32 buf_nr;             /* client buf_nr for this trace  */
+       u32 page_size;          /* client page_size for this trace  */
 };
 
 #define TRACE_NET_PORT         (8462)
@@ -285,13 +313,49 @@ enum {
 static char hostname[MAXHOSTNAMELEN];
 static int net_port = TRACE_NET_PORT;
 static int net_mode = 0;
-static int net_sendfile;
+static int net_use_sendfile = 1;
+
+struct cl_host {
+       struct cl_host *list_next;
+       struct in_addr cl_in_addr;
+       struct net_connection *net_connections;
+       int nconn;
+       struct device_information *device_information;
+       int ndevs;
+       int ncpus;
+       int ndevs_done;
+};
+
+struct net_connection {
+       int in_fd;
+       struct pollfd pfd;
+       time_t connect_time;
+       struct cl_host *ch;
+       int ncpus;
+};
+
+#define NET_MAX_CL_HOSTS       (1024)
+static struct cl_host *cl_host_list;
+static int cl_hosts;
+static int net_connects;
 
-static int net_in_fd = -1;
-static int net_out_fd = -1;
+static int *net_out_fd;
 
 static void handle_sigint(__attribute__((__unused__)) int sig)
 {
+       struct device_information *dip;
+       int i;
+
+       /*
+        * stop trace so we can reap currently produced data
+        */
+       for_each_dip(dip, i) {
+               if (dip->fd == -1)
+                       continue;
+               if (ioctl(dip->fd, BLKTRACESTOP) < 0)
+                       perror("BLKTRACESTOP");
+       }
+
        done = 1;
 }
 
@@ -301,7 +365,7 @@ static int get_dropped_count(const char *buts_name)
        char tmp[MAXPATHLEN + 64];
 
        snprintf(tmp, sizeof(tmp), "%s/block/%s/dropped",
-                relay_path, buts_name);
+                debugfs_path, buts_name);
 
        fd = open(tmp, O_RDONLY);
        if (fd < 0) {
@@ -331,8 +395,8 @@ static int start_trace(struct device_information *dip)
        struct blk_user_trace_setup buts;
 
        memset(&buts, 0, sizeof(buts));
-       buts.buf_size = buf_size;
-       buts.buf_nr = buf_nr;
+       buts.buf_size = dip->buf_size;
+       buts.buf_nr = dip->buf_nr;
        buts.act_mask = act_mask;
 
        if (ioctl(dip->fd, BLKTRACESETUP, &buts) < 0) {
@@ -355,8 +419,11 @@ static void stop_trace(struct device_information *dip)
        if (dip_tracing(dip) || kill_running_trace) {
                dip_set_tracing(dip, 0);
 
-               if (ioctl(dip->fd, BLKTRACESTOP) < 0)
-                       perror("BLKTRACESTOP");
+               /*
+                * should be stopped, just don't complain if it isn't
+                */
+               ioctl(dip->fd, BLKTRACESTOP);
+
                if (ioctl(dip->fd, BLKTRACETEARDOWN) < 0)
                        perror("BLKTRACETEARDOWN");
 
@@ -376,17 +443,20 @@ static void stop_all_traces(void)
        }
 }
 
-static void wait_for_data(struct thread_information *tip)
+static void wait_for_data(struct thread_information *tip, int timeout)
 {
        struct pollfd pfd = { .fd = tip->fd, .events = POLLIN };
 
-       do {
-               poll(&pfd, 1, 100);
+       while (!is_done()) {
+               if (poll(&pfd, 1, timeout) < 0) {
+                       perror("poll");
+                       break;
+               }
                if (pfd.revents & POLLIN)
                        break;
                if (tip->ofile_stdout)
                        break;
-       } while (!is_done());
+       }
 }
 
 static int read_data_file(struct thread_information *tip, void *buf,
@@ -395,7 +465,7 @@ static int read_data_file(struct thread_information *tip, void *buf,
        int ret = 0;
 
        do {
-               wait_for_data(tip);
+               wait_for_data(tip, 100);
 
                ret = read(tip->fd, buf, len);
                if (!ret)
@@ -420,11 +490,12 @@ static int read_data_file(struct thread_information *tip, void *buf,
 static int read_data_net(struct thread_information *tip, void *buf,
                         unsigned int len)
 {
+       struct net_connection *nc = tip->nc;
        unsigned int bytes_left = len;
        int ret = 0;
 
        do {
-               ret = recv(net_in_fd, buf, bytes_left, MSG_WAITALL);
+               ret = recv(nc->in_fd, buf, bytes_left, MSG_WAITALL);
 
                if (!ret)
                        continue;
@@ -444,17 +515,6 @@ static int read_data_net(struct thread_information *tip, void *buf,
        return len - bytes_left;
 }
 
-static int read_data(struct thread_information *tip, void *buf,
-                    unsigned int len)
-{
-       int ret = tip->read_data(tip, buf, len);
-
-       if (ret > 0)
-               tip->data_read += ret;
-
-       return ret;
-}
-
 static inline struct tip_subbuf *
 subbuf_fifo_dequeue(struct thread_information *tip)
 {
@@ -496,19 +556,21 @@ static int mmap_subbuf(struct thread_information *tip, unsigned int maxlen)
 {
        int ofd = fileno(tip->ofile);
        int ret;
+       unsigned long nr;
 
        /*
         * extend file, if we have to. use chunks of 16 subbuffers.
         */
-       if (tip->fs_off + buf_size > tip->fs_buf_len) {
+       if (tip->fs_off + maxlen > tip->fs_buf_len) {
                if (tip->fs_buf) {
                        munlock(tip->fs_buf, tip->fs_buf_len);
                        munmap(tip->fs_buf, tip->fs_buf_len);
                        tip->fs_buf = NULL;
                }
 
-               tip->fs_off = tip->fs_size & (page_size - 1);
-               tip->fs_buf_len = (16 * buf_size) - tip->fs_off;
+               tip->fs_off = tip->fs_size & (tip->device->page_size - 1);
+               nr = max(16, tip->device->buf_nr);
+               tip->fs_buf_len = (nr * tip->device->buf_size) - tip->fs_off;
                tip->fs_max_size += tip->fs_buf_len;
 
                if (ftruncate(ofd, tip->fs_max_size) < 0) {
@@ -525,8 +587,9 @@ static int mmap_subbuf(struct thread_information *tip, unsigned int maxlen)
                mlock(tip->fs_buf, tip->fs_buf_len);
        }
 
-       ret = read_data(tip, tip->fs_buf + tip->fs_off, maxlen);
+       ret = tip->read_data(tip, tip->fs_buf + tip->fs_off, maxlen);
        if (ret >= 0) {
+               tip->data_read += ret;
                tip->fs_size += ret;
                tip->fs_off += ret;
                return 0;
@@ -535,28 +598,6 @@ static int mmap_subbuf(struct thread_information *tip, unsigned int maxlen)
        return -1;
 }
 
-static int get_subbuf_sendfile(struct thread_information *tip,
-                              unsigned int maxlen)
-{
-       struct tip_subbuf *ts = malloc(sizeof(*ts));
-       struct stat sb;
-
-       ts->buf = malloc(buf_size);
-       ts->max_len = maxlen;
-       ts->buf = NULL;
-
-       if (fstat(tip->fd, &sb) < 0) {
-               perror("trace stat");
-               return 1;
-       }
-
-       ts->len = sb.st_size - tip->ofile_offset;
-       ts->max_len = ts->len;
-       ts->offset = tip->ofile_offset;
-       tip->ofile_offset += ts->len;
-       return subbuf_fifo_queue(tip, ts);
-}
-
 /*
  * Use the copy approach for pipes and network
  */
@@ -565,13 +606,20 @@ static int get_subbuf(struct thread_information *tip, unsigned int maxlen)
        struct tip_subbuf *ts = malloc(sizeof(*ts));
        int ret;
 
-       ts->buf = malloc(buf_size);
+       ts->buf = malloc(tip->device->buf_size);
        ts->max_len = maxlen;
 
-       ret = read_data(tip, ts->buf, ts->max_len);
+       ret = tip->read_data(tip, ts->buf, ts->max_len);
        if (ret > 0) {
                ts->len = ret;
-               return subbuf_fifo_queue(tip, ts);
+               tip->data_read += ret;
+               if (subbuf_fifo_queue(tip, ts))
+                       ret = -1;
+       }
+
+       if (ret <= 0) {
+               free(ts->buf);
+               free(ts);
        }
 
        return ret;
@@ -599,7 +647,7 @@ static void tip_ftrunc_final(struct thread_information *tip)
        /*
         * truncate to right size and cleanup mmap
         */
-       if (tip->ofile_mmap) {
+       if (tip->ofile_mmap && tip->ofile) {
                int ofd = fileno(tip->ofile);
 
                if (tip->fs_buf)
@@ -624,7 +672,7 @@ static void *thread_main(void *arg)
        }
 
        snprintf(tip->fn, sizeof(tip->fn), "%s/block/%s/trace%d",
-                       relay_path, tip->device->buts_name, tip->cpu);
+                       debugfs_path, tip->device->buts_name, tip->cpu);
        tip->fd = open(tip->fn, O_RDONLY);
        if (tip->fd < 0) {
                perror(tip->fn);
@@ -634,10 +682,16 @@ static void *thread_main(void *arg)
        }
 
        while (!is_done()) {
-               if (tip->get_subbuf(tip, buf_size))
+               if (tip->get_subbuf(tip, tip->device->buf_size) < 0)
                        break;
        }
 
+       /*
+        * trace is stopped, pull data until we get a short read
+        */
+       while (tip->get_subbuf(tip, tip->device->buf_size) > 0)
+               ;
+
        tip_ftrunc_final(tip);
        tip->exited = 1;
        return NULL;
@@ -671,8 +725,12 @@ static int net_send_header(struct thread_information *tip, unsigned int len)
        hdr.cpu = tip->cpu;
        hdr.max_cpus = ncpus;
        hdr.len = len;
-
-       return write_data_net(net_out_fd, &hdr, sizeof(hdr));
+       hdr.cl_id = getpid();
+       hdr.buf_size = tip->device->buf_size;
+       hdr.buf_nr = tip->device->buf_nr;
+       hdr.page_size = tip->device->page_size;
+       
+       return write_data_net(net_out_fd[tip->cpu], &hdr, sizeof(hdr));
 }
 
 /*
@@ -680,41 +738,101 @@ static int net_send_header(struct thread_information *tip, unsigned int len)
  */
 static void net_client_send_close(void)
 {
+       struct device_information *dip;
        struct blktrace_net_hdr hdr;
+       int i;
 
-       hdr.magic = BLK_IO_TRACE_MAGIC;
-       hdr.cpu = 0;
-       hdr.max_cpus = ncpus;
-       hdr.len = 0;
+       for_each_dip(dip, i) {
+               hdr.magic = BLK_IO_TRACE_MAGIC;
+               hdr.max_cpus = ncpus;
+               hdr.len = 0;
+               strcpy(hdr.buts_name, dip->buts_name);
+               hdr.cpu = get_dropped_count(dip->buts_name);
+               hdr.cl_id = getpid();
+               hdr.buf_size = dip->buf_size;
+               hdr.buf_nr = dip->buf_nr;
+               hdr.page_size = dip->page_size;
+
+               write_data_net(net_out_fd[0], &hdr, sizeof(hdr));
+       }
 
-       write_data_net(net_out_fd, &hdr, sizeof(hdr));
 }
 
 static int flush_subbuf_net(struct thread_information *tip,
                            struct tip_subbuf *ts)
 {
        if (net_send_header(tip, ts->len))
-               return 1;
-       if (write_data_net(net_out_fd, ts->buf, ts->len))
-               return 1;
+               return -1;
+       if (write_data_net(net_out_fd[tip->cpu], ts->buf, ts->len))
+               return -1;
 
        free(ts->buf);
        free(ts);
+       return 1;
+}
+
+static int net_sendfile(struct thread_information *tip, struct tip_subbuf *ts)
+{
+       int ret = sendfile(net_out_fd[tip->cpu], tip->fd, NULL, ts->len);
+
+       if (ret < 0) {
+               perror("sendfile");
+               return 1;
+       } else if (ret < (int) ts->len) {
+               fprintf(stderr, "short sendfile send (%d of %d)\n", ret, ts->len);
+               return 1;
+       }
+
        return 0;
 }
 
 static int flush_subbuf_sendfile(struct thread_information *tip,
                                 struct tip_subbuf *ts)
 {
+       int ret = -1;
+
        if (net_send_header(tip, ts->len))
-               return 1;
-       if (sendfile(net_out_fd, tip->fd, &ts->offset, ts->len) < 0) {
-               perror("sendfile");
-               return 1;
-       }
+               goto err;
+       if (net_sendfile(tip, ts))
+               goto err;
 
+       tip->data_read += ts->len;
+       ret = 1;
+err:
        free(ts);
-       return 0;
+       return ret;
+}
+
+static int get_subbuf_sendfile(struct thread_information *tip,
+                              __attribute__((__unused__)) unsigned int maxlen)
+{
+       struct tip_subbuf *ts;
+       struct stat sb;
+       unsigned int ready;
+
+       wait_for_data(tip, -1);
+
+       if (fstat(tip->fd, &sb) < 0) {
+               perror("trace stat");
+               return -1;
+       }
+
+       ready = sb.st_size - tip->data_queued;
+       if (!ready) {
+               usleep(1000);
+               return 0;
+       }
+
+       ts = malloc(sizeof(*ts));
+       ts->buf = NULL;
+       ts->max_len = 0;
+       ts->len = ready;
+       tip->data_queued += ready;
+
+       if (flush_subbuf_sendfile(tip, ts) < 0)
+               return -1;
+
+       return ready;
 }
 
 static int write_data(struct thread_information *tip, void *buf,
@@ -725,15 +843,11 @@ static int write_data(struct thread_information *tip, void *buf,
        if (!buf_len)
                return 0;
 
-       while (1) {
-               ret = fwrite(buf, buf_len, 1, tip->ofile);
-               if (ret == 1)
-                       break;
-
-               if (ret < 0) {
-                       perror("write");
-                       return 1;
-               }
+       ret = fwrite(buf, buf_len, 1, tip->ofile);
+       if (ferror(tip->ofile) || ret != 1) {
+               perror("fwrite");
+               clearerr(tip->ofile);
+               return 1;
        }
 
        if (tip->ofile_stdout)
@@ -838,7 +952,7 @@ static void get_and_write_events(void)
                }
 
                if (!events)
-                       usleep(10);
+                       usleep(100000);
        }
 
        /*
@@ -866,7 +980,7 @@ static void wait_for_threads(void)
         * for files, we just wait around for trace threads to exit
         */
        if ((output_name && !strcmp(output_name, "-")) ||
-           net_mode == Net_client)
+           ((net_mode == Net_client) && !net_use_sendfile))
                get_and_write_events();
        else {
                struct device_information *dip;
@@ -875,7 +989,7 @@ static void wait_for_threads(void)
 
                do {
                        tips_running = 0;
-                       usleep(1000);
+                       usleep(100000);
 
                        for_each_dip(dip, i)
                                for_each_tip(dip, tip, j)
@@ -887,17 +1001,43 @@ static void wait_for_threads(void)
                net_client_send_close();
 }
 
-static void fill_ofname(char *dst, char *buts_name, int cpu)
+static int fill_ofname(struct device_information *dip,
+                      struct thread_information *tip, char *dst,
+                      char *buts_name)
 {
+       struct stat sb;
        int len = 0;
 
        if (output_dir)
                len = sprintf(dst, "%s/", output_dir);
+       else
+               len = sprintf(dst, "./");
+
+       if (net_mode == Net_server) {
+               struct net_connection *nc = tip->nc;
+
+               len += sprintf(dst + len, "%s-", inet_ntoa(nc->ch->cl_in_addr));
+               len += strftime(dst + len, 64, "%F-%T/", gmtime(&dip->cl_connect_time));
+       }
+
+       if (stat(dst, &sb) < 0) {
+               if (errno != ENOENT) {
+                       perror("stat");
+                       return 1;
+               }
+               if (mkdir(dst, 0755) < 0) {
+                       perror(dst);
+                       fprintf(stderr, "Can't make output dir\n");
+                       return 1;
+               }
+       }
 
        if (output_name)
-               sprintf(dst + len, "%s.blktrace.%d", output_name, cpu);
+               sprintf(dst + len, "%s.blktrace.%d", output_name, tip->cpu);
        else
-               sprintf(dst + len, "%s.blktrace.%d", buts_name, cpu);
+               sprintf(dst + len, "%s.blktrace.%d", buts_name, tip->cpu);
+
+       return 0;
 }
 
 static void fill_ops(struct thread_information *tip)
@@ -906,9 +1046,9 @@ static void fill_ops(struct thread_information *tip)
         * setup ops
         */
        if (net_mode == Net_client) {
-               if (net_sendfile) {
+               if (net_use_sendfile) {
                        tip->get_subbuf = get_subbuf_sendfile;
-                       tip->flush_subbuf = flush_subbuf_sendfile;
+                       tip->flush_subbuf = NULL;
                } else {
                        tip->get_subbuf = get_subbuf;
                        tip->flush_subbuf = flush_subbuf_net;
@@ -928,48 +1068,66 @@ static void fill_ops(struct thread_information *tip)
                tip->read_data = read_data_file;
 }
 
+static int tip_open_output(struct device_information *dip,
+                          struct thread_information *tip)
+{
+       int pipeline = output_name && !strcmp(output_name, "-");
+       int mode, vbuf_size;
+       char op[128];
+
+       if (net_mode == Net_client) {
+               tip->ofile = NULL;
+               tip->ofile_stdout = 0;
+               tip->ofile_mmap = 0;
+               goto done;
+       } else if (pipeline) {
+               tip->ofile = fdopen(STDOUT_FILENO, "w");
+               tip->ofile_stdout = 1;
+               tip->ofile_mmap = 0;
+               mode = _IOLBF;
+               vbuf_size = 512;
+       } else {
+               if (fill_ofname(dip, tip, op, dip->buts_name))
+                       return 1;
+               tip->ofile = fopen(op, "w+");
+               tip->ofile_stdout = 0;
+               tip->ofile_mmap = 1;
+               mode = _IOFBF;
+               vbuf_size = OFILE_BUF;
+       }
+
+       if (tip->ofile == NULL) {
+               perror(op);
+               return 1;
+       }
+
+       tip->ofile_buffer = malloc(vbuf_size);
+       if (setvbuf(tip->ofile, tip->ofile_buffer, mode, vbuf_size)) {
+               perror("setvbuf");
+               close_thread(tip);
+               return 1;
+       }
+
+done:
+       fill_ops(tip);
+       return 0;
+}
+
 static int start_threads(struct device_information *dip)
 {
        struct thread_information *tip;
-       int j, pipeline = output_name && !strcmp(output_name, "-");
-       int mode, vbuf_size;
-       char op[64];
+       int j;
 
        for_each_tip(dip, tip, j) {
                tip->cpu = j;
                tip->device = dip;
                tip->events_processed = 0;
+               tip->fd = -1;
                memset(&tip->fifo, 0, sizeof(tip->fifo));
                tip->leftover_ts = NULL;
 
-               if (pipeline) {
-                       tip->ofile = fdopen(STDOUT_FILENO, "w");
-                       tip->ofile_stdout = 1;
-                       tip->ofile_mmap = 0;
-                       mode = _IOLBF;
-                       vbuf_size = 512;
-               } else {
-                       fill_ofname(op, dip->buts_name, tip->cpu);
-                       tip->ofile = fopen(op, "w+");
-                       tip->ofile_stdout = 0;
-                       tip->ofile_mmap = 1;
-                       mode = _IOFBF;
-                       vbuf_size = OFILE_BUF;
-               }
-
-               if (tip->ofile == NULL) {
-                       perror(op);
+               if (tip_open_output(dip, tip))
                        return 1;
-               }
-
-               tip->ofile_buffer = malloc(vbuf_size);
-               if (setvbuf(tip->ofile, tip->ofile_buffer, mode, vbuf_size)) {
-                       perror("setvbuf");
-                       close_thread(tip);
-                       return 1;
-               }
-
-               fill_ops(tip);
 
                if (pthread_create(&tip->thread, NULL, thread_main, tip)) {
                        perror("pthread_create");
@@ -1047,6 +1205,9 @@ static int open_devices(void)
                        perror(dip->path);
                        return 1;
                }
+               dip->buf_size = buf_size;
+               dip->buf_nr = buf_nr;
+               dip->page_size = page_size;
        }
 
        return 0;
@@ -1063,6 +1224,7 @@ static int start_devices(void)
                fprintf(stderr, "Out of memory, threads (%d)\n", size * ndevs);
                return 1;
        }
+       memset(thread_information, 0, size * ndevs);
 
        for_each_dip(dip, i) {
                if (start_trace(dip)) {
@@ -1074,7 +1236,7 @@ static int start_devices(void)
        }
 
        if (i != ndevs) {
-               __for_each_dip(dip, j, i)
+               __for_each_dip(dip, device_information, i, j)
                        stop_trace(dip);
 
                return 1;
@@ -1089,7 +1251,7 @@ static int start_devices(void)
        }
 
        if (i != ndevs) {
-               __for_each_dip(dip, j, i)
+               __for_each_dip(dip, device_information, i, j)
                        stop_threads(dip);
                for_each_dip(dip, i)
                        stop_trace(dip);
@@ -1100,7 +1262,7 @@ static int start_devices(void)
        return 0;
 }
 
-static void show_stats(void)
+static void show_stats(struct device_information *dips, int ndips, int cpus)
 {
        struct device_information *dip;
        struct thread_information *tip;
@@ -1117,12 +1279,12 @@ static void show_stats(void)
        stat_shown = 1;
 
        total_drops = 0;
-       for_each_dip(dip, i) {
+       __for_each_dip(dip, dips, ndips, i) {
                if (!no_stdout)
                        printf("Device: %s\n", dip->path);
                events_processed = 0;
                data_read = 0;
-               for_each_tip(dip, tip, j) {
+               __for_each_tip(dip, tip, cpus, j) {
                        if (!no_stdout)
                                printf("  CPU%3d: %20lu events, %8llu KiB data\n",
                                        tip->cpu, tip->events_processed,
@@ -1141,96 +1303,203 @@ static void show_stats(void)
                fprintf(stderr, "You have dropped events, consider using a larger buffer size (-b)\n");
 }
 
-static struct device_information *net_get_dip(char *buts_name)
+static struct device_information *net_get_dip(struct net_connection *nc,
+                                             struct blktrace_net_hdr *bnh)
 {
-       struct device_information *dip;
+       struct device_information *dip, *cl_dip = NULL;
+       struct cl_host *ch = nc->ch;
        int i;
 
-       for (i = 0; i < ndevs; i++) {
-               dip = &device_information[i];
+       for (i = 0; i < ch->ndevs; i++) {
+               dip = &ch->device_information[i];
 
-               if (!strcmp(dip->buts_name, buts_name))
+               if (!strcmp(dip->buts_name, bnh->buts_name))
                        return dip;
+
+               if (dip->cl_id == bnh->cl_id)
+                       cl_dip = dip;
        }
 
-       device_information = realloc(device_information, (ndevs + 1) * sizeof(*dip));
-       dip = &device_information[ndevs];
-       strcpy(dip->buts_name, buts_name);
-       strcpy(dip->path, buts_name);
-       ndevs++;
-       dip->threads = malloc(ncpus * sizeof(struct thread_information));
-       memset(dip->threads, 0, ncpus * sizeof(struct thread_information));
+       ch->device_information = realloc(ch->device_information, (ch->ndevs + 1) * sizeof(*dip));
+       dip = &ch->device_information[ch->ndevs];
+       memset(dip, 0, sizeof(*dip));
+       dip->fd = -1;
+       dip->ch = ch;
+       dip->cl_id = bnh->cl_id;
+       dip->buf_size = bnh->buf_size;
+       dip->buf_nr = bnh->buf_nr;
+       dip->page_size = bnh->page_size;
+
+       if (cl_dip)
+               dip->cl_connect_time = cl_dip->cl_connect_time;
+       else
+               dip->cl_connect_time = nc->connect_time;
+       strcpy(dip->buts_name, bnh->buts_name);
+       dip->path = strdup(bnh->buts_name);
+       dip->trace_started = 1;
+       ch->ndevs++;
+       dip->threads = malloc(nc->ncpus * sizeof(struct thread_information));
+       memset(dip->threads, 0, nc->ncpus * sizeof(struct thread_information));
 
        /*
         * open all files
         */
-       for (i = 0; i < ncpus; i++) {
+       for (i = 0; i < nc->ncpus; i++) {
                struct thread_information *tip = &dip->threads[i];
-               char op[64];
 
                tip->cpu = i;
-               tip->ofile_stdout = 0;
-               tip->ofile_mmap = 1;
                tip->device = dip;
-
-               fill_ops(tip);
-
-               fill_ofname(op, dip->buts_name, tip->cpu);
-
-               tip->ofile = fopen(op, "w+");
-               if (!tip->ofile) {
-                       perror("fopen");
+               tip->fd = -1;
+               tip->nc = nc;
+               
+               if (tip_open_output(dip, tip))
                        return NULL;
-               }
+
+               tip->nc = NULL;
        }
 
        return dip;
 }
 
-static struct thread_information *net_get_tip(struct blktrace_net_hdr *bnh)
+static struct thread_information *net_get_tip(struct net_connection *nc,
+                                             struct blktrace_net_hdr *bnh)
 {
        struct device_information *dip;
+       struct thread_information *tip;
+
+       dip = net_get_dip(nc, bnh);
+       if (!dip->trace_started) {
+               fprintf(stderr, "Events for closed devices %s\n", dip->buts_name);
+               return NULL;
+       }
 
-       ncpus = bnh->max_cpus;
-       dip = net_get_dip(bnh->buts_name);
-       return &dip->threads[bnh->cpu];
+       tip = &dip->threads[bnh->cpu];
+       if (!tip->nc)
+               tip->nc = nc;
+       
+       return tip;
 }
 
-static int net_get_header(struct blktrace_net_hdr *bnh)
+static int net_get_header(struct net_connection *nc,
+                         struct blktrace_net_hdr *bnh)
 {
-       int fl = fcntl(net_in_fd, F_GETFL);
+       int fl = fcntl(nc->in_fd, F_GETFL);
        int bytes_left, ret;
        void *p = bnh;
 
-       fcntl(net_in_fd, F_SETFL, fl | O_NONBLOCK);
+       fcntl(nc->in_fd, F_SETFL, fl | O_NONBLOCK);
        bytes_left = sizeof(*bnh);
        while (bytes_left && !is_done()) {
-               ret = recv(net_in_fd, p, bytes_left, MSG_WAITALL);
+               ret = recv(nc->in_fd, p, bytes_left, MSG_WAITALL);
                if (ret < 0) {
                        if (errno != EAGAIN) {
                                perror("recv header");
                                return 1;
                        }
-                       usleep(100);
+                       usleep(1000);
                        continue;
                } else if (!ret) {
-                       usleep(100);
+                       usleep(1000);
                        continue;
                } else {
                        p += ret;
                        bytes_left -= ret;
                }
        }
-       fcntl(net_in_fd, F_SETFL, fl & ~O_NONBLOCK);
-       return 0;
+       fcntl(nc->in_fd, F_SETFL, fl & ~O_NONBLOCK);
+       return bytes_left;
 }
 
-static int net_server_loop(void)
+/*
+ * finalize a net client: truncate files, show stats, cleanup, etc
+ */
+static void device_done(struct net_connection *nc, struct device_information *dip)
+{
+       struct thread_information *tip;
+       int i;
+
+       __for_each_tip(dip, tip, nc->ncpus, i)
+               tip_ftrunc_final(tip);
+
+       show_stats(dip, 1, nc->ncpus);
+
+       /*
+        * cleanup for next run
+        */
+       __for_each_tip(dip, tip, nc->ncpus, i) {
+               if (tip->ofile)
+                       fclose(tip->ofile);
+       }
+
+       free(dip->threads);
+       free(dip->path);
+
+       close(nc->in_fd);
+       nc->in_fd = -1;
+
+       stat_shown = 0;
+}
+
+static inline int in_addr_eq(struct in_addr a, struct in_addr b)
+{
+       return a.s_addr == b.s_addr;
+}
+
+static void net_add_client_host(struct cl_host *ch)
+{
+       ch->list_next = cl_host_list;
+       cl_host_list = ch;
+       cl_hosts++;
+}
+
+static void net_remove_client_host(struct cl_host *ch)
+{
+       struct cl_host *p, *c;
+       
+       for (p = c = cl_host_list; c; c = c->list_next) {
+               if (c == ch) {
+                       if (p == c)
+                               cl_host_list = c->list_next;
+                       else
+                               p->list_next = c->list_next;
+                       cl_hosts--;
+                       return;
+               }
+               p = c;
+       }
+}
+
+static struct cl_host *net_find_client_host(struct in_addr cl_in_addr)
+{
+       struct cl_host *ch = cl_host_list;
+
+       while (ch) {
+               if (in_addr_eq(ch->cl_in_addr, cl_in_addr))
+                       return ch;
+               ch = ch->list_next;
+       }
+
+       return NULL;
+}
+
+static void net_client_host_done(struct cl_host *ch)
+{
+       free(ch->device_information);
+       free(ch->net_connections);
+       net_connects -= ch->nconn;
+       net_remove_client_host(ch);
+       free(ch);
+}
+
+/*
+ * handle incoming events from a net client
+ */
+static int net_client_data(struct net_connection *nc)
 {
        struct thread_information *tip;
        struct blktrace_net_hdr bnh;
 
-       if (net_get_header(&bnh))
+       if (net_get_header(nc, &bnh))
                return 1;
 
        if (data_is_native == -1 && check_data_endianness(bnh.magic)) {
@@ -1239,19 +1508,48 @@ static int net_server_loop(void)
        }
 
        if (!data_is_native) {
+               bnh.magic = be32_to_cpu(bnh.magic);
                bnh.cpu = be32_to_cpu(bnh.cpu);
+               bnh.max_cpus = be32_to_cpu(bnh.max_cpus);
                bnh.len = be32_to_cpu(bnh.len);
+               bnh.cl_id = be32_to_cpu(bnh.cl_id);
+               bnh.buf_size = be32_to_cpu(bnh.buf_size);
+               bnh.buf_nr = be32_to_cpu(bnh.buf_nr);
+               bnh.page_size = be32_to_cpu(bnh.page_size);
        }
 
+       if ((bnh.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
+               fprintf(stderr, "server: bad data magic\n");
+               return 1;
+       }
+
+       if (nc->ncpus == -1)
+               nc->ncpus = bnh.max_cpus;
+
        /*
         * len == 0 means that the other end signalled end-of-run
         */
        if (!bnh.len) {
-               fprintf(stderr, "server: end of run\n");
-               return 1;
+               /*
+                * overload cpu count with dropped events
+                */
+               struct device_information *dip;
+
+               dip = net_get_dip(nc, &bnh);
+               dip->drop_count = bnh.cpu;
+               dip->trace_started = 0;
+
+               printf("server: end of run for %s\n", dip->buts_name);
+
+               device_done(nc, dip);
+
+               if (++nc->ch->ndevs_done == nc->ch->ndevs)
+                       net_client_host_done(nc->ch);
+
+               return 0;
        }
 
-       tip = net_get_tip(&bnh);
+       tip = net_get_tip(nc, &bnh);
        if (!tip)
                return 1;
 
@@ -1261,17 +1559,114 @@ static int net_server_loop(void)
        return 0;
 }
 
+static void net_add_connection(int listen_fd, struct sockaddr_in *addr)
+{
+       socklen_t socklen = sizeof(*addr);
+       struct net_connection *nc;
+       struct cl_host *ch;
+       int in_fd;
+
+       in_fd = accept(listen_fd, (struct sockaddr *) addr, &socklen);
+       if (in_fd < 0) {
+               perror("accept");
+               return;
+       }
+
+       ch = net_find_client_host(addr->sin_addr);
+       if (!ch) {
+               if (cl_hosts == NET_MAX_CL_HOSTS) {
+                       fprintf(stderr, "server: no more clients allowed\n");
+                       return;
+               }
+               ch = malloc(sizeof(struct cl_host));
+               memset(ch, 0, sizeof(*ch));
+               ch->cl_in_addr = addr->sin_addr;
+               net_add_client_host(ch);
+
+               printf("server: connection from %s\n", inet_ntoa(addr->sin_addr));
+       }
+
+       ch->net_connections = realloc(ch->net_connections, (ch->nconn + 1) * sizeof(*nc));
+       nc = &ch->net_connections[ch->nconn++];
+       memset(nc, 0, sizeof(*nc));
+
+       time(&nc->connect_time);
+       nc->ch = ch;
+       nc->in_fd = in_fd;
+       nc->ncpus = -1;
+       net_connects++;
+}
+
+/*
+ * event driven loop, handle new incoming connections and data from
+ * existing connections
+ */
+static void net_server_handle_connections(int listen_fd,
+                                         struct sockaddr_in *addr)
+{
+       struct pollfd *pfds = NULL;
+       struct net_connection **ncs = NULL;
+       int max_connects = 0;
+       int i, nconns, events;
+       struct cl_host *ch;
+       struct net_connection *nc;
+       
+       printf("server: waiting for connections...\n");
+
+       while (!is_done()) {
+               if (net_connects >= max_connects) {
+                       pfds = realloc(pfds, (net_connects + 1) * sizeof(*pfds));
+                       ncs = realloc(ncs, (net_connects + 1) * sizeof(*ncs));
+                       max_connects = net_connects + 1;
+               }
+               /*
+                * the zero entry is for incoming connections, remaining
+                * entries for clients
+                */
+               pfds[0].fd = listen_fd;
+               pfds[0].events = POLLIN;
+               nconns = 0;
+               for_each_cl_host(ch) {
+                       for (i = 0; i < ch->nconn; i++) {
+                               nc = &ch->net_connections[i];
+                               pfds[nconns + 1].fd = nc->in_fd;
+                               pfds[nconns + 1].events = POLLIN;
+                               ncs[nconns++] = nc;
+                       }
+               }
+
+               events = poll(pfds, 1 + nconns, -1);
+               if (events < 0) {
+                       if (errno == EINTR)
+                               continue;
+
+                       perror("poll");
+                       break;
+               } else if (!events)
+                       continue;
+
+               if (pfds[0].revents & POLLIN) {
+                       net_add_connection(listen_fd, addr);
+                       events--;
+               }
+
+               for (i = 0; events && i < nconns; i++) {
+                       if (pfds[i + 1].revents & POLLIN) {
+                               net_client_data(ncs[i]);
+                               events--;
+                       }
+               }
+       }
+}
+
 /*
  * Start here when we are in server mode - just fetch data from the network
  * and dump to files
  */
 static int net_server(void)
 {
-       struct device_information *dip;
-       struct thread_information *tip;
        struct sockaddr_in addr;
-       socklen_t socklen;
-       int fd, opt, i, j;
+       int fd, opt;
 
        fd = socket(AF_INET, SOCK_STREAM, 0);
        if (fd < 0) {
@@ -1300,64 +1695,15 @@ static int net_server(void)
                return 1;
        }
 
-repeat:
-       signal(SIGINT, NULL);
-       signal(SIGHUP, NULL);
-       signal(SIGTERM, NULL);
-       signal(SIGALRM, NULL);
-
-       printf("blktrace: waiting for incoming connection...\n");
-
-       socklen = sizeof(addr);
-       net_in_fd = accept(fd, (struct sockaddr *) &addr, &socklen);
-       if (net_in_fd < 0) {
-               perror("accept");
-               return 1;
-       }
-
-       signal(SIGINT, handle_sigint);
-       signal(SIGHUP, handle_sigint);
-       signal(SIGTERM, handle_sigint);
-       signal(SIGALRM, handle_sigint);
-
-       printf("blktrace: connection from %s\n", inet_ntoa(addr.sin_addr));
-
-       while (!is_done()) {
-               if (net_server_loop())
-                       break;
-       }
-
-       for_each_dip(dip, i)
-               for_each_tip(dip, tip, j)
-                       tip_ftrunc_final(tip);
-
-       show_stats();
-
-       if (is_done())
-               return 0;
-
-       /*
-        * cleanup for next run
-        */
-       for_each_dip(dip, i) {
-               for_each_tip(dip, tip, j)
-                       fclose(tip->ofile);
-
-               free(dip->threads);
-       }
-
-       free(device_information);
-       device_information = NULL;
-       ncpus = ndevs = 0;
-       goto repeat;
+       net_server_handle_connections(fd, &addr);
+       return 0;
 }
 
 /*
  * Setup outgoing network connection where we will transmit data
  */
-static int net_setup_client(void)
+static int net_setup_client_cpu(int i, struct sockaddr_in *addr)
 {
-       struct sockaddr_in addr;
        int fd;
 
        fd = socket(AF_INET, SOCK_STREAM, 0);
@@ -1366,6 +1712,20 @@ static int net_setup_client(void)
                return 1;
        }
 
+       if (connect(fd, (struct sockaddr *) addr, sizeof(*addr)) < 0) {
+               perror("client: connect");
+               return 1;
+       }
+
+       net_out_fd[i] = fd;
+       return 0;
+}
+
+static int net_setup_client(void)
+{
+       struct sockaddr_in addr;
+       int i;
+
        memset(&addr, 0, sizeof(addr));
        addr.sin_family = AF_INET;
        addr.sin_port = htons(net_port);
@@ -1383,21 +1743,22 @@ static int net_setup_client(void)
 
        printf("blktrace: connecting to %s\n", hostname);
 
-       if (connect(fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
-               perror("client: connect");
-               return 1;
+       net_out_fd = malloc(ncpus * sizeof(*net_out_fd));
+       for (i = 0; i < ncpus; i++) {
+               if (net_setup_client_cpu(i, &addr))
+                       return 1;
        }
 
        printf("blktrace: connected!\n");
-       net_out_fd = fd;
+       
        return 0;
 }
 
 static char usage_str[] = \
-       "-d <dev> [ -r relay path ] [ -o <output> ] [-k ] [ -w time ]\n" \
-       "[ -a action ] [ -A action mask ] [ -v ]\n\n" \
+       "-d <dev> [ -r debugfs path ] [ -o <output> ] [-k ] [ -w time ]\n" \
+       "[ -a action ] [ -A action mask ] [ -I  <devs file> ] [ -v ]\n\n" \
        "\t-d Use specified device. May also be given last after options\n" \
-       "\t-r Path to mounted relayfs, defaults to /relay\n" \
+       "\t-r Path to mounted debugfs, defaults to /sys/kernel/debug\n" \
        "\t-o File(s) to send output to\n" \
        "\t-D Directory to prepend to output file names\n" \
        "\t-k Kill a running trace\n" \
@@ -1409,7 +1770,8 @@ static char usage_str[] = \
        "\t-l Run in network listen mode (blktrace server)\n" \
        "\t-h Run in network client mode, connecting to the given host\n" \
        "\t-p Network port to use (default 8462)\n" \
-       "\t-s Make the network client use sendfile() to transfer data\n" \
+       "\t-s Make the network client NOT use sendfile() to transfer data\n" \
+       "\t-I Add devices found in <devs file>\n" \
        "\t-V Print program version info\n\n";
 
 static void show_usage(char *program)
@@ -1419,7 +1781,7 @@ static void show_usage(char *program)
 
 int main(int argc, char *argv[])
 {
-       static char default_relay_path[] = "/relay";
+       static char default_debugfs_path[] = "/sys/kernel/debug";
        struct statfs st;
        int i, c;
        int stop_watch = 0;
@@ -1453,8 +1815,26 @@ int main(int argc, char *argv[])
                                return 1;
                        break;
 
+               case 'I': {
+                       char dev_line[256];
+                       FILE *ifp = fopen(optarg, "r");
+
+                       if (!ifp) {
+                               fprintf(stderr, 
+                                       "Invalid file for devices %s\n", 
+                                       optarg);
+                               return 1;
+                       }
+
+                       while (fscanf(ifp, "%s\n", dev_line) == 1)
+                               if (resize_devices(strdup(dev_line)) != 0)
+                                       return 1;
+                       break;
+               }
+                       
+
                case 'r':
-                       relay_path = optarg;
+                       debugfs_path = optarg;
                        break;
 
                case 'o':
@@ -1506,7 +1886,7 @@ int main(int argc, char *argv[])
                        net_port = atoi(optarg);
                        break;
                case 's':
-                       net_sendfile = 1;
+                       net_use_sendfile = 0;
                        break;
                default:
                        show_usage(argv[0]);
@@ -1518,8 +1898,14 @@ int main(int argc, char *argv[])
 
        page_size = getpagesize();
 
-       if (net_mode == Net_server)
+       if (net_mode == Net_server) {
+               if (output_name) {
+                       fprintf(stderr, "-o ignored in server mode\n");
+                       output_name = NULL;
+               }
+
                return net_server();
+       }
 
        while (optind < argc) {
                if (resize_devices(argv[optind++]) != 0)
@@ -1531,20 +1917,20 @@ int main(int argc, char *argv[])
                return 1;
        }
 
-       if (!relay_path)
-               relay_path = default_relay_path;
-
        if (act_mask_tmp != 0)
                act_mask = act_mask_tmp;
 
-       if (statfs(relay_path, &st) < 0) {
+       if (!debugfs_path)
+               debugfs_path = default_debugfs_path;
+
+       if (statfs(debugfs_path, &st) < 0) {
                perror("statfs");
                fprintf(stderr,"%s does not appear to be a valid path\n",
-                       relay_path);
+                       debugfs_path);
                return 1;
-       } else if (st.f_type != (long) RELAYFS_TYPE) {
-               fprintf(stderr,"%s does not appear to be a relay filesystem\n",
-                       relay_path);
+       } else if (st.f_type != (long) DEBUGFS_TYPE) {
+               fprintf(stderr,"%s does not appear to be a debug filesystem\n",
+                       debugfs_path);
                return 1;
        }
 
@@ -1566,6 +1952,7 @@ int main(int argc, char *argv[])
        signal(SIGHUP, handle_sigint);
        signal(SIGTERM, handle_sigint);
        signal(SIGALRM, handle_sigint);
+       signal(SIGPIPE, SIG_IGN);
 
        if (net_mode == Net_client && net_setup_client())
                return 1;
@@ -1586,7 +1973,7 @@ int main(int argc, char *argv[])
                stop_all_traces();
        }
 
-       show_stats();
+       show_stats(device_information, ndevs, ncpus);
 
        return 0;
 }