[PATCH] blktrace: no need to track ts->offset anymore
[blktrace.git] / blktrace.c
index b945e0e2839586d0963fbb807a6d869c44ead0c9..24b48d5b2a214ecc9d86222a6263366afbfbe2b7 100644 (file)
 #include <sys/ioctl.h>
 #include <sys/param.h>
 #include <sys/statfs.h>
+#include <sys/poll.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <sched.h>
 #include <ctype.h>
 #include <getopt.h>
+#include <errno.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <netdb.h>
+#include <sys/sendfile.h>
 
 #include "blktrace.h"
+#include "barrier.h"
 
-static char blktrace_version[] = "0.90";
+static char blktrace_version[] = "0.99";
 
-#define BUF_SIZE       (128 *1024)
+/*
+ * You may want to increase this even more, if you are logging at a high
+ * rate and see skipped/missed events
+ */
+#define BUF_SIZE       (512 * 1024)
 #define BUF_NR         (4)
 
+#define OFILE_BUF      (128 * 1024)
+
 #define RELAYFS_TYPE   0xF0B4A981
 
-#define S_OPTS "d:a:A:r:o:kw:vb:n:D:"
+#define S_OPTS "d:a:A:r:o:kw:Vb:n:D:lh:p:s"
 static struct option l_opts[] = {
        {
                .name = "dev",
@@ -92,26 +107,68 @@ static struct option l_opts[] = {
                .name = "version",
                .has_arg = no_argument,
                .flag = NULL,
-               .val = 'v'
+               .val = 'V'
        },
        {
-               .name = "buffer size (in KiB)",
+               .name = "buffer-size",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'b'
        },
        {
-               .name = "nr of sub buffers",
+               .name = "num-sub-buffers",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'n'
        },
        {
-               .name = "output directory",
+               .name = "output-dir",
                .has_arg = required_argument,
                .flag = NULL,
                .val = 'D'
        },
+       {
+               .name = "listen",
+               .has_arg = no_argument,
+               .flag = NULL,
+               .val = 'l'
+       },
+       {
+               .name = "host",
+               .has_arg = required_argument,
+               .flag = NULL,
+               .val = 'h'
+       },
+       {
+               .name = "port",
+               .has_arg = required_argument,
+               .flag = NULL,
+               .val = 'p'
+       },
+       {
+               .name = "sendfile",
+               .has_arg = no_argument,
+               .flag = NULL,
+               .val = 's'
+       },
+       {
+               .name = NULL,
+       }
+};
+
+/*
+ * One chunk of trace data, passed from a reader thread to the writer
+ * through the per-thread fifo.
+ */
+struct tip_subbuf {
+       void *buf;              /* data; NULL for entries queued by the sendfile path */
+       unsigned int len;       /* number of valid bytes in buf */
+       unsigned int max_len;   /* read limit given to read_data(); raised when buf is realloc'ed */
+};
+
+#define FIFO_SIZE      (1024)  /* should be plenty big! */
+#define CL_SIZE                (128)   /* cache line, any bigger? */
+
+struct tip_subbuf_fifo {
+       int tail __attribute__((aligned(CL_SIZE)));
+       int head __attribute__((aligned(CL_SIZE)));
+       struct tip_subbuf *q[FIFO_SIZE];
 };
 
 struct thread_information {
@@ -119,24 +176,53 @@ struct thread_information {
        pthread_t thread;
 
        int fd;
+       void *fd_buf;
        char fn[MAXPATHLEN + 64];
-       void *buf;
-       unsigned long buf_offset;
-       unsigned int buf_subbuf;
-       unsigned int sequence;
 
-       pthread_mutex_t *fd_lock;
-       int ofd;
+       int pfd;
+       size_t *pfd_buf;
+
+       struct in_addr cl_in_addr;
+
+       FILE *ofile;
+       char *ofile_buffer;
+       off_t ofile_offset;
+       int ofile_stdout;
+       int ofile_mmap;
+       volatile int sendfile_pending;
+
+       int (*get_subbuf)(struct thread_information *, unsigned int);
+       int (*flush_subbuf)(struct thread_information *, struct tip_subbuf *);
+       int (*read_data)(struct thread_information *, void *, unsigned int);
 
        unsigned long events_processed;
+       unsigned long long data_read;
        struct device_information *device;
+
+       int exited;
+
+       /*
+        * piped fifo buffers
+        */
+       struct tip_subbuf_fifo fifo;
+       struct tip_subbuf *leftover_ts;
+
+       /*
+        * mmap controlled output files
+        */
+       unsigned long long fs_size;
+       unsigned long long fs_max_size;
+       unsigned long fs_off;
+       void *fs_buf;
+       unsigned long fs_buf_len;
 };
 
 struct device_information {
        int fd;
        char *path;
        char buts_name[32];
-       int trace_started;
+       volatile int trace_started;
+       unsigned long drop_count;
        struct thread_information *threads;
 };
 
@@ -151,16 +237,113 @@ static char *output_name;
 static char *output_dir;
 static int act_mask = ~0U;
 static int kill_running_trace;
-static unsigned int buf_size = BUF_SIZE;
-static unsigned int buf_nr = BUF_NR;
+static unsigned long buf_size = BUF_SIZE;
+static unsigned long buf_nr = BUF_NR;
+static unsigned int page_size;
 
 #define is_done()      (*(volatile int *)(&done))
 static volatile int done;
 
-static pthread_mutex_t stdout_mutex = PTHREAD_MUTEX_INITIALIZER;
+#define is_trace_stopped()     (*(volatile int *)(&trace_stopped))
+static volatile int trace_stopped;
+
+#define is_stat_shown()        (*(volatile int *)(&stat_shown))
+static volatile int stat_shown;
+
+int data_is_native = -1;
 
 static void exit_trace(int status);
 
+#define dip_tracing(dip)       (*(volatile int *)(&(dip)->trace_started))
+#define dip_set_tracing(dip, v)        ((dip)->trace_started = (v))
+
+#define __for_each_dip(__d, __i, __e)  \
+       for (__i = 0, __d = device_information; __i < __e; __i++, __d++)
+
+#define for_each_dip(__d, __i) __for_each_dip(__d, __i, ndevs)
+#define for_each_tip(__d, __t, __j)    \
+       for (__j = 0, __t = (__d)->threads; __j < ncpus; __j++, __t++)
+
+/*
+ * Networking support follows. Each header carries a magic number so the
+ * receiver can tell whether the data needs endianness conversion.
+ */
+struct blktrace_net_hdr {
+       u32 magic;              /* same as trace magic */
+       char buts_name[32];     /* trace name */
+       u32 cpu;                /* for which cpu */
+       u32 max_cpus;
+       u32 len;                /* length of following trace data */
+};
+
+#define TRACE_NET_PORT         (8462)
+
+enum {
+       Net_none = 0,
+       Net_server,
+       Net_client,
+};
+
+/*
+ * network cmd line params
+ */
+static char hostname[MAXHOSTNAMELEN];
+static int net_port = TRACE_NET_PORT;
+static int net_mode = 0;
+static int net_use_sendfile;
+
+static int net_in_fd = -1;
+static int net_out_fd = -1;
+
+/*
+ * Handler for the termination signal (the registration is not visible
+ * here): issue BLKTRACESTOP on every device whose fd is still open, then
+ * set 'done' so that loops polling is_done() wind down.
+ *
+ * NOTE(review): perror() is not on the POSIX async-signal-safe list;
+ * confirm that calling it from a signal handler is acceptable here.
+ */
+static void handle_sigint(__attribute__((__unused__)) int sig)
+{
+       struct device_information *dip;
+       int i;
+
+       /*
+        * stop trace so we can reap currently produced data
+        */
+       for_each_dip(dip, i) {
+               if (dip->fd == -1)
+                       continue;
+               if (ioctl(dip->fd, BLKTRACESTOP) < 0)
+                       perror("BLKTRACESTOP");
+       }
+
+       done = 1;
+}
+
+/*
+ * Read the relay "dropped" counter for the trace named 'buts_name'.
+ *
+ * Returns the parsed count, 0 if the file does not exist (kernel without
+ * dropped-count support), or -1 if the file cannot be opened or read.
+ */
+static int get_dropped_count(const char *buts_name)
+{
+       int fd, ret;
+       char tmp[MAXPATHLEN + 64];
+
+       snprintf(tmp, sizeof(tmp), "%s/block/%s/dropped",
+                relay_path, buts_name);
+
+       fd = open(tmp, O_RDONLY);
+       if (fd < 0) {
+               /*
+                * this may be ok, if the kernel doesn't support dropped counts
+                */
+               if (errno == ENOENT)
+                       return 0;
+
+               fprintf(stderr, "Couldn't open dropped file %s\n", tmp);
+               return -1;
+       }
+
+       /*
+        * leave room for a terminator: read() does not add one, and tmp
+        * still holds the path text beyond whatever the read overwrites
+        */
+       ret = read(fd, tmp, sizeof(tmp) - 1);
+       if (ret < 0) {
+               perror(tmp);
+               close(fd);
+               return -1;
+       }
+       tmp[ret] = '\0';
+
+       close(fd);
+
+       return atoi(tmp);
+}
+
 static int start_trace(struct device_information *dip)
 {
        struct blk_user_trace_setup buts;
@@ -170,23 +353,36 @@ static int start_trace(struct device_information *dip)
        buts.buf_nr = buf_nr;
        buts.act_mask = act_mask;
 
-       if (ioctl(dip->fd, BLKSTARTTRACE, &buts) < 0) {
-               perror("BLKSTARTTRACE");
+       if (ioctl(dip->fd, BLKTRACESETUP, &buts) < 0) {
+               perror("BLKTRACESETUP");
+               return 1;
+       }
+
+       if (ioctl(dip->fd, BLKTRACESTART) < 0) {
+               perror("BLKTRACESTART");
                return 1;
        }
 
        memcpy(dip->buts_name, buts.name, sizeof(dip->buts_name));
-       dip->trace_started = 1;
+       dip_set_tracing(dip, 1);
        return 0;
 }
 
 static void stop_trace(struct device_information *dip)
 {
-       if (dip->trace_started || kill_running_trace) {
-               if (ioctl(dip->fd, BLKSTOPTRACE) < 0)
-                       perror("BLKSTOPTRACE");
+       if (dip_tracing(dip) || kill_running_trace) {
+               dip_set_tracing(dip, 0);
+
+               /*
+                * should be stopped, just don't complain if it isn't
+                */
+               ioctl(dip->fd, BLKTRACESTOP);
+
+               if (ioctl(dip->fd, BLKTRACETEARDOWN) < 0)
+                       perror("BLKTRACETEARDOWN");
+
                close(dip->fd);
-               dip->trace_started = 0;
+               dip->fd = -1;
        }
 }
 
@@ -195,280 +391,737 @@ static void stop_all_traces(void)
        struct device_information *dip;
        int i;
 
-       for (dip = device_information, i = 0; i < ndevs; i++, dip++)
+       for_each_dip(dip, i) {
+               dip->drop_count = get_dropped_count(dip->buts_name);
                stop_trace(dip);
+       }
 }
 
-static int read_data(struct thread_information *tip, void *buf, int len)
+static void wait_for_data(struct thread_information *tip)
 {
-       char *p = buf;
-       int ret, bytes_left = len;
+       struct pollfd pfd = { .fd = tip->fd, .events = POLLIN };
 
-       while (!is_done() && bytes_left > 0) {
-               ret = read(tip->fd, p, bytes_left);
-               if (ret == bytes_left)
-                       return 0;
-
-               if (ret < 0) {
-                       perror(tip->fn);
-                       fprintf(stderr,"Thread %d failed read of %s\n",
-                               tip->cpu, tip->fn);
+       do {
+               poll(&pfd, 1, 100);
+               if (pfd.revents & POLLIN)
                        break;
-               } else if (ret > 0) {
-                       p += ret;
+               if (tip->ofile_stdout)
+                       break;
+       } while (!is_done());
+}
+
+/*
+ * Read up to 'len' bytes from tip->fd into 'buf', calling wait_for_data()
+ * before each attempt.
+ *
+ * Returns the byte count of the first read that yields data. Zero-length
+ * reads and EAGAIN are retried until is_done(); any other error is
+ * reported and ends the loop. In those cases the last read() result
+ * (0 or negative) is returned.
+ */
+static int read_data_file(struct thread_information *tip, void *buf,
+                         unsigned int len)
+{
+       int ret = 0;
+
+       do {
+               wait_for_data(tip);
+
+               ret = read(tip->fd, buf, len);
+               if (!ret)
+                       continue;
+               else if (ret > 0)
+                       return ret;
+               else {
+                       if (errno != EAGAIN) {
+                               perror(tip->fn);
+                               fprintf(stderr,"Thread %d failed read of %s\n",
+                                       tip->cpu, tip->fn);
+                               break;
+                       }
+                       continue;
+               }
+       } while (!is_done());
+
+       return ret;
+
+}
+
+/*
+ * Receive up to 'len' bytes from net_in_fd into 'buf'.
+ *
+ * Returns the number of bytes received, which is short of 'len' if the
+ * run is stopped or the peer closes the connection first, and 0 on a
+ * receive error other than EAGAIN.
+ */
+static int read_data_net(struct thread_information *tip, void *buf,
+                        unsigned int len)
+{
+       unsigned int bytes_left = len;
+       int ret = 0;
+
+       do {
+               ret = recv(net_in_fd, buf, bytes_left, MSG_WAITALL);
+
+               /*
+                * recv() returning 0 means the peer shut the connection
+                * down; retrying would only spin, so hand back what we have
+                */
+               if (!ret)
+                       break;
+               else if (ret < 0) {
+                       if (errno != EAGAIN) {
+                               perror(tip->fn);
+                               fprintf(stderr, "server: failed read\n");
+                               return 0;
+                       }
+                       continue;
+               } else {
+                       buf += ret;
                        bytes_left -= ret;
-               } else
-                       usleep(1000);
+               }
+       } while (!is_done() && bytes_left);
+
+       return len - bytes_left;
+}
+
+static int read_data(struct thread_information *tip, void *buf,
+                    unsigned int len)
+{
+       return tip->read_data(tip, buf, len);
+}
+
+/*
+ * Take the entry at fifo.head, or return NULL when head equals tail
+ * (nothing queued). The slot is read before head is advanced, with
+ * store_barrier() between the two steps.
+ */
+static inline struct tip_subbuf *
+subbuf_fifo_dequeue(struct thread_information *tip)
+{
+       const int head = tip->fifo.head;
+       const int next = (head + 1) & (FIFO_SIZE - 1);
+
+       if (head != tip->fifo.tail) {
+               struct tip_subbuf *ts = tip->fifo.q[head];
+
+               store_barrier();
+               tip->fifo.head = next;
+               return ts;
+       }
+
+       return NULL;
+}
+
+/*
+ * Store 'ts' at fifo.tail and advance tail. Returns 0 on success. When
+ * advancing tail would make it equal to head the fifo is treated as full:
+ * a message is printed and 1 is returned, so one slot always stays unused.
+ * The index mask relies on FIFO_SIZE being a power of two.
+ */
+static inline int subbuf_fifo_queue(struct thread_information *tip,
+                                   struct tip_subbuf *ts)
+{
+       const int tail = tip->fifo.tail;
+       const int next = (tail + 1) & (FIFO_SIZE - 1);
+
+       if (next != tip->fifo.head) {
+               tip->fifo.q[tail] = ts;
+               store_barrier();
+               tip->fifo.tail = next;
+               return 0;
+       }
+
+       fprintf(stderr, "fifo too small!\n");
+       return 1;
+}
+
+/*
+ * For file output, truncate and mmap the file appropriately
+ */
+static int mmap_subbuf(struct thread_information *tip, unsigned int maxlen)
+{
+       int ofd = fileno(tip->ofile);
+       int ret;
+
+       /*
+        * extend file, if we have to. use chunks of 16 subbuffers.
+        */
+       if (tip->fs_off + buf_size > tip->fs_buf_len) {
+               if (tip->fs_buf) {
+                       munlock(tip->fs_buf, tip->fs_buf_len);
+                       munmap(tip->fs_buf, tip->fs_buf_len);
+                       tip->fs_buf = NULL;
+               }
+
+               tip->fs_off = tip->fs_size & (page_size - 1);
+               tip->fs_buf_len = (16 * buf_size) - tip->fs_off;
+               tip->fs_max_size += tip->fs_buf_len;
+
+               if (ftruncate(ofd, tip->fs_max_size) < 0) {
+                       perror("ftruncate");
+                       return -1;
+               }
+
+               tip->fs_buf = mmap(NULL, tip->fs_buf_len, PROT_WRITE,
+                                  MAP_SHARED, ofd, tip->fs_size - tip->fs_off);
+               if (tip->fs_buf == MAP_FAILED) {
+                       perror("mmap");
+                       return -1;
+               }
+               mlock(tip->fs_buf, tip->fs_buf_len);
+       }
+
+       ret = read_data(tip, tip->fs_buf + tip->fs_off, maxlen);
+       if (ret >= 0) {
+               tip->data_read += ret;
+               tip->fs_size += ret;
+               tip->fs_off += ret;
+               return 0;
        }
 
        return -1;
 }
 
-static int write_data(int fd, void *buf, unsigned int buf_len)
+/*
+ * Use the copy approach for pipes and network: allocate a subbuffer, fill
+ * it with up to 'maxlen' bytes through read_data() and queue it on the
+ * thread fifo.
+ *
+ * Returns the number of bytes read (> 0) once the subbuffer is queued, the
+ * read_data() result when nothing was read, or -1 on allocation failure
+ * or a full fifo. The subbuffer is released on every path that does not
+ * queue it.
+ */
+static int get_subbuf(struct thread_information *tip, unsigned int maxlen)
+{
+       struct tip_subbuf *ts = malloc(sizeof(*ts));
+       int ret;
+
+       if (!ts) {
+               perror("malloc");
+               return -1;
+       }
+
+       ts->buf = malloc(buf_size);
+       if (!ts->buf) {
+               perror("malloc");
+               free(ts);
+               return -1;
+       }
+       ts->max_len = maxlen;
+
+       ret = read_data(tip, ts->buf, ts->max_len);
+       if (ret > 0) {
+               ts->len = ret;
+               tip->data_read += ret;
+               if (!subbuf_fifo_queue(tip, ts))
+                       return ret;
+               ret = -1;
+       }
+
+       /*
+        * nothing was queued, so nobody else will free this subbuffer
+        */
+       free(ts->buf);
+       free(ts);
+       return ret;
+}
+
+static int get_subbuf_sendfile(struct thread_information *tip,
+                              unsigned int maxlen)
 {
-       int ret, bytes_left;
-       char *p = buf;
+       struct tip_subbuf *ts;
+
+       wait_for_data(tip);
+
+       /*
+        * hack to get last data out, we can't use sendfile for that
+        */
+       if (is_done())
+               return get_subbuf(tip, maxlen);
 
-       bytes_left = buf_len;
-       while (bytes_left > 0) {
-               ret = write(fd, p, bytes_left);
-               if (ret == bytes_left)
+       if (tip->sendfile_pending) {
+               usleep(100);
+               return 0;
+       }
+
+       ts = malloc(sizeof(*ts));
+       ts->buf = NULL;
+       ts->max_len = 0;
+
+       if (subbuf_fifo_queue(tip, ts))
+               return -1;
+
+       tip->sendfile_pending++;
+       return buf_size;
+}
+
+/*
+ * Release everything a thread owns: close the trace and padding fds,
+ * close the output stream and free the associated buffers. Every member
+ * is reset afterwards so that a second call is harmless.
+ */
+static void close_thread(struct thread_information *tip)
+{
+       if (tip->fd != -1)
+               close(tip->fd);
+       if (tip->pfd != -1)
+               close(tip->pfd);
+       if (tip->ofile)
+               fclose(tip->ofile);
+       if (tip->ofile_buffer)
+               free(tip->ofile_buffer);
+       if (tip->fd_buf)
+               free(tip->fd_buf);
+       if (tip->pfd_buf)
+               free(tip->pfd_buf);
+
+       tip->fd = -1;
+       tip->pfd = -1;
+       tip->ofile = NULL;
+       tip->ofile_buffer = NULL;
+       tip->fd_buf = NULL;
+       /* reset like the others, otherwise a repeat call frees it twice */
+       tip->pfd_buf = NULL;
+}
+
+/*
+ * For mmap'ed output files, drop the mapping and cut the file back to
+ * the number of bytes actually written (tip->fs_size). Does nothing for
+ * other output modes.
+ */
+static void tip_ftrunc_final(struct thread_information *tip)
+{
+       /*
+        * truncate to right size and cleanup mmap
+        */
+       if (tip->ofile_mmap) {
+               int ofd = fileno(tip->ofile);
+
+               if (tip->fs_buf)
+                       munmap(tip->fs_buf, tip->fs_buf_len);
+
+               /*
+                * if this fails the file keeps its pre-extended size, so
+                * at least tell the user about it
+                */
+               if (ftruncate(ofd, tip->fs_size) < 0)
+                       perror("ftruncate");
+       }
+}
+
+static void *thread_main(void *arg)
+{
+       struct thread_information *tip = arg;
+       pid_t pid = getpid();
+       cpu_set_t cpu_mask;
+
+       CPU_ZERO(&cpu_mask);
+       CPU_SET((tip->cpu), &cpu_mask);
+
+       if (sched_setaffinity(pid, sizeof(cpu_mask), &cpu_mask) == -1) {
+               perror("sched_setaffinity");
+               exit_trace(1);
+       }
+
+       snprintf(tip->fn, sizeof(tip->fn), "%s/block/%s/trace%d",
+                       relay_path, tip->device->buts_name, tip->cpu);
+       tip->fd = open(tip->fn, O_RDONLY);
+       if (tip->fd < 0) {
+               perror(tip->fn);
+               fprintf(stderr,"Thread %d failed open of %s\n", tip->cpu,
+                       tip->fn);
+               exit_trace(1);
+       }
+
+       if (net_mode == Net_client && net_use_sendfile) {
+               char tmp[MAXPATHLEN + 64];
+
+               snprintf(tmp, sizeof(tmp), "%s/block/%s/trace%d.padding",
+                        relay_path, tip->device->buts_name, tip->cpu);
+
+               tip->pfd = open(tmp, O_RDONLY);
+               if (tip->pfd < 0) {
+                       fprintf(stderr, "Couldn't open padding file %s\n", tmp);
+                       exit_trace(1);
+               }
+
+               tip->pfd_buf = malloc(buf_nr * sizeof(size_t));
+       }
+
+       while (!is_done()) {
+               if (tip->get_subbuf(tip, buf_size) < 0)
                        break;
+       }
 
+       /*
+        * trace is stopped, pull data until we get a short read
+        */
+       while (tip->get_subbuf(tip, buf_size) > 0)
+               ;
+
+       tip_ftrunc_final(tip);
+       tip->exited = 1;
+       return NULL;
+}
+
+static int write_data_net(int fd, void *buf, unsigned int buf_len)
+{
+       unsigned int bytes_left = buf_len;
+       int ret;
+
+       while (bytes_left) {
+               ret = send(fd, buf, bytes_left, 0);
                if (ret < 0) {
-                       perror("write");
-                       return 1;
-               } else if (ret > 0) {
-                       p += ret;
-                       bytes_left -= ret;
-               } else {
-                       fprintf(stderr, "Zero write?\n");
+                       perror("send");
                        return 1;
                }
+
+               buf += ret;
+               bytes_left -= ret;
        }
 
        return 0;
 }
 
-static void *extract_data(struct thread_information *tip, int nb)
+/*
+ * Send the network header announcing 'len' bytes of trace data for this
+ * thread's device and cpu. Returns the write_data_net() result (0 on
+ * success, 1 on failure).
+ */
+static int net_send_header(struct thread_information *tip, unsigned int len)
 {
-       unsigned char *buf;
+       struct blktrace_net_hdr hdr;
 
-       buf = malloc(nb);
-       if (!read_data(tip, buf, nb))
-               return buf;
+       /*
+        * zero the header so the bytes after the name terminator (and any
+        * padding) are not sent as uninitialized stack contents
+        */
+       memset(&hdr, 0, sizeof(hdr));
+       hdr.magic = BLK_IO_TRACE_MAGIC;
+       snprintf(hdr.buts_name, sizeof(hdr.buts_name), "%s",
+                tip->device->buts_name);
+       hdr.cpu = tip->cpu;
+       hdr.max_cpus = ncpus;
+       hdr.len = len;
 
-       free(buf);
-       return NULL;
+       return write_data_net(net_out_fd, &hdr, sizeof(hdr));
 }
 
 /*
- * trace may start inside 'bit' or may need to be gotten further on
+ * send header with 0 length to signal end-of-run
  */
-static int get_event_slow(struct thread_information *tip,
-                         struct blk_io_trace *bit)
+static void net_client_send_close(void)
+{
+       struct blktrace_net_hdr hdr;
+
+       /*
+        * zero the whole header first: buts_name is not filled in here and
+        * would otherwise go out on the wire as uninitialized stack bytes
+        */
+       memset(&hdr, 0, sizeof(hdr));
+       hdr.magic = BLK_IO_TRACE_MAGIC;
+       hdr.cpu = 0;
+       hdr.max_cpus = ncpus;
+       hdr.len = 0;
+
+       write_data_net(net_out_fd, &hdr, sizeof(hdr));
+}
+
+/*
+ * Ship one subbuffer over the network: header first, then the data.
+ * Takes ownership of 'ts' and frees it (and its buffer) whether or not
+ * the send succeeds. Returns 0 on success, 1 on failure.
+ */
+static int flush_subbuf_net(struct thread_information *tip,
+                           struct tip_subbuf *ts)
+{
+       int ret = 1;
+
+       if (net_send_header(tip, ts->len))
+               goto out;
+       if (write_data_net(net_out_fd, ts->buf, ts->len))
+               goto out;
+
+       ret = 0;
+out:
+       /*
+        * the entry has already left the fifo, so it must be freed here
+        * on the error paths too
+        */
+       free(ts->buf);
+       free(ts);
+       return ret;
+}
+
+/*
+ * Push ts->len bytes from the trace fd to net_out_fd with sendfile().
+ * Returns 0 when everything was sent, 1 on error or a short send.
+ */
+static int net_sendfile(struct thread_information *tip, struct tip_subbuf *ts)
+{
+       int ret = sendfile(net_out_fd, tip->fd, NULL, ts->len);
+
+       if (ret < 0) {
+               perror("sendfile");
+               return 1;
+       } else if (ret < (int) ts->len) {
+               /* ts->len is unsigned, so print it with %u */
+               fprintf(stderr, "short sendfile send (%d of %u)\n", ret, ts->len);
+               return 1;
+       }
+
+       return 0;
+}
+
+static int get_subbuf_padding(struct thread_information *tip, off_t off)
 {
-       const int inc = sizeof(__u32);
-       struct blk_io_trace foo;
-       int offset;
-       void *p;
+       int padding_size = buf_nr * sizeof(size_t);
+       int ret;
+
+       ret = read(tip->pfd, tip->pfd_buf, padding_size);
+       if (ret == padding_size) {
+               int subbuf = (off / buf_size) % buf_nr;
+
+               ret = tip->pfd_buf[subbuf];
+       } else if (ret < 0)
+               perror("tip pad read");
+       else {
+               fprintf(stderr, "bad pad size read\n");
+               ret = -1;
+       }
+
+       return ret;
+}
+
+static int flush_subbuf_sendfile(struct thread_information *tip,
+                                struct tip_subbuf *ts)
+{
+       int pad, ret = 1;
 
        /*
-        * check is trace is inside
+        * currently we cannot use sendfile() on the last bytes read, as they
+        * may not be a full subbuffer. get_subbuf_sendfile() falls back to
+        * the read approach for those, so use send() to ship them out
         */
-       offset = 0;
-       p = bit;
-       while (offset < sizeof(*bit)) {
-               p += inc;
-               offset += inc;
+       if (ts->buf)
+               return flush_subbuf_net(tip, ts);
+       
+       pad = get_subbuf_padding(tip, tip->ofile_offset);
+       if (pad == -1)
+               goto err;
+
+       ts->len = buf_size - pad;
+
+       if (net_send_header(tip, ts->len))
+               goto err;
+       if (net_sendfile(tip, ts))
+               goto err;
+
+       tip->data_read += ts->len;
+       tip->ofile_offset += buf_size;
+       ret = 0;
+err:
+       tip->sendfile_pending--;
+       free(ts);
+       return ret;
+}
 
-               memcpy(&foo, p, inc);
+/*
+ * Write 'buf_len' bytes from 'buf' to the thread's output stream, and
+ * flush right away when the stream is stdout. Returns 0 on success
+ * (including a zero-length request), 1 on a write error.
+ */
+static int write_data(struct thread_information *tip, void *buf,
+                     unsigned int buf_len)
+{
+       int ret;
 
-               if (CHECK_MAGIC(&foo))
+       if (!buf_len)
+               return 0;
+
+       while (1) {
+               ret = fwrite(buf, buf_len, 1, tip->ofile);
+               if (ret == 1)
                        break;
+
+               /*
+                * fwrite() returns an item count and never a negative
+                * value, so failure has to be detected with ferror()
+                */
+               if (ferror(tip->ofile)) {
+                       perror("write");
+                       return 1;
+               }
        }
 
+       if (tip->ofile_stdout)
+               fflush(tip->ofile);
+
+       return 0;
+}
+
+static int flush_subbuf_file(struct thread_information *tip,
+                            struct tip_subbuf *ts)
+{
+       unsigned int offset = 0;
+       struct blk_io_trace *t;
+       int pdu_len, events = 0;
+
        /*
-        * part trace found inside, read the rest
+        * surplus from last run
         */
-       if (offset < sizeof(*bit)) {
-               int good_bytes = sizeof(*bit) - offset;
+       if (tip->leftover_ts) {
+               struct tip_subbuf *prev_ts = tip->leftover_ts;
 
-               memmove(bit, p, good_bytes);
-               p = (void *) bit + good_bytes;
+               if (prev_ts->len + ts->len > prev_ts->max_len) {
+                       prev_ts->max_len += ts->len;
+                       prev_ts->buf = realloc(prev_ts->buf, prev_ts->max_len);
+               }
 
-               return read_data(tip, p, offset);
+               memcpy(prev_ts->buf + prev_ts->len, ts->buf, ts->len);
+               prev_ts->len += ts->len;
+
+               free(ts->buf);
+               free(ts);
+
+               ts = prev_ts;
+               tip->leftover_ts = NULL;
        }
 
-       /*
-        * nothing found, keep looking for start of trace
-        */
-       do {
-               if (read_data(tip, bit, sizeof(bit->magic)))
+       while (offset + sizeof(*t) <= ts->len) {
+               t = ts->buf + offset;
+
+               if (verify_trace(t)) {
+                       write_data(tip, ts->buf, offset);
                        return -1;
-       } while (!CHECK_MAGIC(bit));
+               }
+
+               pdu_len = t->pdu_len;
+
+               if (offset + sizeof(*t) + pdu_len > ts->len)
+                       break;
+
+               offset += sizeof(*t) + pdu_len;
+               tip->events_processed++;
+               tip->data_read += sizeof(*t) + pdu_len;
+               events++;
+       }
+
+       if (write_data(tip, ts->buf, offset))
+               return -1;
 
        /*
-        * now get the rest of it
+        * leftover bytes, save them for next time
         */
-       p = &bit->sequence;
-       if (!read_data(tip, p, sizeof(*bit) - inc))
-               return -1;
+       if (offset != ts->len) {
+               tip->leftover_ts = ts;
+               ts->len -= offset;
+               memmove(ts->buf, ts->buf + offset, ts->len);
+       } else {
+               free(ts->buf);
+               free(ts);
+       }
+
+       return events;
+}
+
+static int write_tip_events(struct thread_information *tip)
+{
+       struct tip_subbuf *ts = subbuf_fifo_dequeue(tip);
+
+       if (ts)
+               return tip->flush_subbuf(tip, ts);
 
        return 0;
 }
 
 /*
- * Sometimes relayfs screws us a little, if an event crosses a sub buffer
- * boundary. So keep looking forward in the trace data until an event
- * is found
+ * scans the tips we know and writes out the subbuffers we accumulate
  */
-static int get_event(struct thread_information *tip, struct blk_io_trace *bit)
+static void get_and_write_events(void)
 {
-       /*
-        * optimize for the common fast case, a full trace read that
-        * succeeds
-        */
-       if (read_data(tip, bit, sizeof(*bit)))
-               return -1;
+       struct device_information *dip;
+       struct thread_information *tip;
+       int i, j, events, ret, tips_running;
+
+       while (!is_done()) {
+               events = 0;
 
-       if (CHECK_MAGIC(bit))
-               return 0;
+               for_each_dip(dip, i) {
+                       for_each_tip(dip, tip, j) {
+                               ret = write_tip_events(tip);
+                               if (ret > 0)
+                                       events += ret;
+                       }
+               }
+
+               if (!events)
+                       usleep(10);
+       }
 
        /*
-        * ok that didn't work, the event may start somewhere inside the
-        * trace itself
+        * reap stored events
         */
-       return get_event_slow(tip, bit);
+       do {
+               events = 0;
+               tips_running = 0;
+               for_each_dip(dip, i) {
+                       for_each_tip(dip, tip, j) {
+                               ret = write_tip_events(tip);
+                               if (ret > 0)
+                                       events += ret;
+                               tips_running += !tip->exited;
+                       }
+               }
+               usleep(10);
+       } while (events || tips_running);
 }
 
-static inline void tip_fd_unlock(struct thread_information *tip)
+static void wait_for_threads(void)
 {
-       if (tip->fd_lock)
-               pthread_mutex_unlock(tip->fd_lock);
-}
+       /*
+        * for piped or network output, poll and fetch data for writeout.
+        * for files, we just wait around for trace threads to exit
+        */
+       if ((output_name && !strcmp(output_name, "-")) ||
+           net_mode == Net_client)
+               get_and_write_events();
+       else {
+               struct device_information *dip;
+               struct thread_information *tip;
+               int i, j, tips_running;
+
+               do {
+                       tips_running = 0;
+                       usleep(1000);
 
-static inline void tip_fd_lock(struct thread_information *tip)
-{
-       if (tip->fd_lock)
-               pthread_mutex_lock(tip->fd_lock);
+                       for_each_dip(dip, i)
+                               for_each_tip(dip, tip, j)
+                                       tips_running += !tip->exited;
+               } while (tips_running);
+       }
+
+       if (net_mode == Net_client)
+               net_client_send_close();
 }
 
-static void *extract(void *arg)
+/*
+ * Build the output file name for 'tip' in 'dst' and make sure the
+ * directory part exists, creating it if needed.
+ *
+ * The name is "<dir>/[<client ip>-<date>/]<name>.blktrace.<cpu>", where
+ * <dir> is output_dir or "." and <name> is output_name or 'buts_name'.
+ * Returns 0 on success, 1 if the directory cannot be checked or created.
+ *
+ * NOTE(review): 'dst' is written with unbounded sprintf(); the caller
+ * must supply a buffer large enough for the full path.
+ */
+static int fill_ofname(struct thread_information *tip, char *dst,
+                      char *buts_name)
 {
-       struct thread_information *tip = arg;
-       int pdu_len;
-       char *pdu_data;
-       struct blk_io_trace t;
-       pid_t pid = getpid();
-       cpu_set_t cpu_mask;
+       struct stat sb;
+       int len = 0;
+       time_t t;
 
-       CPU_ZERO(&cpu_mask);
-       CPU_SET((tip->cpu), &cpu_mask);
+       /*
+        * always start dst with a directory, so the stat() below never
+        * looks at a buffer nothing has written to yet
+        */
+       if (output_dir)
+               len = sprintf(dst, "%s/", output_dir);
+       else
+               len = sprintf(dst, "./");
 
-       if (sched_setaffinity(pid, sizeof(cpu_mask), &cpu_mask) == -1) {
-               perror("sched_setaffinity");
-               exit_trace(1);
+       if (net_mode == Net_server) {
+               len += sprintf(dst + len, "%s-", inet_ntoa(tip->cl_in_addr));
+               time(&t);
+               len += strftime(dst + len, 64, "%F-%T/", gmtime(&t));
        }
 
-       snprintf(tip->fn, sizeof(tip->fn), "%s/block/%s/trace%d",
-                       relay_path, tip->device->buts_name, tip->cpu);
-       tip->fd = open(tip->fn, O_RDONLY);
-       if (tip->fd < 0) {
-               perror(tip->fn);
-               fprintf(stderr,"Thread %d failed open of %s\n", tip->cpu,
-                       tip->fn);
-               exit_trace(1);
+       if (stat(dst, &sb) < 0) {
+               if (errno != ENOENT) {
+                       perror("stat");
+                       return 1;
+               }
+               if (mkdir(dst, 0755) < 0) {
+                       perror(dst);
+                       fprintf(stderr, "Can't make output dir\n");
+                       return 1;
+               }
        }
 
-       pdu_data = NULL;
-       while (!is_done()) {
-               if (get_event(tip, &t))
-                       break;
-
-               if (verify_trace(&t))
-                       break;
-
-               pdu_len = t.pdu_len;
-
-               trace_to_be(&t);
-
-               if (pdu_len) {
-                       pdu_data = extract_data(tip, pdu_len);
-                       if (!pdu_data)
-                               break;
-               }
+       if (output_name)
+               sprintf(dst + len, "%s.blktrace.%d", output_name, tip->cpu);
+       else
+               sprintf(dst + len, "%s.blktrace.%d", buts_name, tip->cpu);
 
-               /*
-                * now we have both trace and payload, get a lock on the
-                * output descriptor and send it off
-                */
-               tip_fd_lock(tip);
+       return 0;
+}
 
-               if (write_data(tip->ofd, &t, sizeof(t))) {
-                       tip_fd_unlock(tip);
-                       break;
+static void fill_ops(struct thread_information *tip)
+{
+       /*
+        * setup ops
+        */
+       if (net_mode == Net_client) {
+               if (net_use_sendfile) {
+                       tip->get_subbuf = get_subbuf_sendfile;
+                       tip->flush_subbuf = flush_subbuf_sendfile;
+               } else {
+                       tip->get_subbuf = get_subbuf;
+                       tip->flush_subbuf = flush_subbuf_net;
                }
+       } else {
+               if (tip->ofile_mmap)
+                       tip->get_subbuf = mmap_subbuf;
+               else
+                       tip->get_subbuf = get_subbuf;
 
-               if (pdu_data && write_data(tip->ofd, pdu_data, pdu_len)) {
-                       tip_fd_unlock(tip);
-                       break;
-               }
+               tip->flush_subbuf = flush_subbuf_file;
+       }
+                       
+       if (net_mode == Net_server)
+               tip->read_data = read_data_net;
+       else
+               tip->read_data = read_data_file;
+}
 
-               tip_fd_unlock(tip);
+static int tip_open_output(struct device_information *dip,
+                          struct thread_information *tip)
+{
+       int pipeline = output_name && !strcmp(output_name, "-");
+       int mode, vbuf_size;
+       char op[128];
+
+       if (net_mode == Net_client) {
+               tip->ofile = NULL;
+               tip->ofile_stdout = 0;
+               tip->ofile_mmap = 0;
+               goto done;
+       } else if (pipeline) {
+               tip->ofile = fdopen(STDOUT_FILENO, "w");
+               tip->ofile_stdout = 1;
+               tip->ofile_mmap = 0;
+               mode = _IOLBF;
+               vbuf_size = 512;
+       } else {
+               if (fill_ofname(tip, op, dip->buts_name))
+                       return 1;
+               tip->ofile = fopen(op, "w+");
+               tip->ofile_stdout = 0;
+               tip->ofile_mmap = 1;
+               mode = _IOFBF;
+               vbuf_size = OFILE_BUF;
+       }
 
-               if (pdu_data) {
-                       free(pdu_data);
-                       pdu_data = NULL;
-               }
+       if (tip->ofile == NULL) {
+               perror(op);
+               return 1;
+       }
 
-               tip->events_processed++;
+       tip->ofile_buffer = malloc(vbuf_size);
+       if (setvbuf(tip->ofile, tip->ofile_buffer, mode, vbuf_size)) {
+               perror("setvbuf");
+               close_thread(tip);
+               return 1;
        }
 
-       exit_trace(1);
-       return NULL;
+done:
+       fill_ops(tip);
+       return 0;
 }
 
 static int start_threads(struct device_information *dip)
 {
        struct thread_information *tip;
-       char op[64];
-       int j, pipeline = output_name && !strcmp(output_name, "-");
-       int len;
+       int j;
 
-       for (tip = dip->threads, j = 0; j < ncpus; j++, tip++) {
+       for_each_tip(dip, tip, j) {
                tip->cpu = j;
                tip->device = dip;
-               tip->fd_lock = NULL;
                tip->events_processed = 0;
+               tip->fd = -1;
+               tip->pfd = -1;
+               memset(&tip->fifo, 0, sizeof(tip->fifo));
+               tip->leftover_ts = NULL;
 
-               if (pipeline) {
-                       tip->ofd = dup(STDOUT_FILENO);
-                       tip->fd_lock = &stdout_mutex;
-               } else {
-                       len = 0;
-
-                       if (output_dir)
-                               len = sprintf(op, "%s/", output_dir);
-
-                       if (output_name) {
-                               sprintf(op + len, "%s.blktrace.%d", output_name,
-                                       tip->cpu);
-                       } else {
-                               sprintf(op + len, "%s.blktrace.%d",
-                                       dip->buts_name, tip->cpu);
-                       }
-                       tip->ofd = open(op, O_CREAT|O_TRUNC|O_WRONLY, 0644);
-               }
-
-               if (tip->ofd < 0) {
-                       perror(op);
+               if (tip_open_output(dip, tip))
                        return 1;
-               }
 
-               if (pthread_create(&tip->thread, NULL, extract, tip)) {
+               if (pthread_create(&tip->thread, NULL, thread_main, tip)) {
                        perror("pthread_create");
-                       close(tip->ofd);
+                       close_thread(tip);
                        return 1;
                }
        }
@@ -476,25 +1129,14 @@ static int start_threads(struct device_information *dip)
        return 0;
 }
 
-static void close_thread(struct thread_information *tip)
-{
-       if (tip->fd != -1)
-               close(tip->fd);
-       if (tip->ofd != -1)
-               close(tip->ofd);
-
-       tip->fd = tip->ofd = -1;
-}
-
 static void stop_threads(struct device_information *dip)
 {
        struct thread_information *tip;
-       long ret;
-       int j;
+       unsigned long ret;
+       int i;
 
-       for (tip = dip->threads, j = 0; j < ncpus; j++, tip++) {
-               if (pthread_join(tip->thread, (void *) &ret))
-                       perror("thread_join");
+       for_each_tip(dip, tip, i) {
+               (void) pthread_join(tip->thread, (void *) &ret);
                close_thread(tip);
        }
 }
@@ -504,26 +1146,27 @@ static void stop_all_threads(void)
        struct device_information *dip;
        int i;
 
-       for (dip = device_information, i = 0; i < ndevs; i++, dip++)
+       for_each_dip(dip, i)
                stop_threads(dip);
 }
 
 static void stop_all_tracing(void)
 {
        struct device_information *dip;
-       struct thread_information *tip;
-       int i, j;
+       int i;
 
-       for (dip = device_information, i = 0; i < ndevs; i++, dip++) {
-               for (tip = dip->threads, j = 0; j < ncpus; j++, tip++)
-                       close_thread(tip);
+       for_each_dip(dip, i)
                stop_trace(dip);
-       }
 }
 
 static void exit_trace(int status)
 {
-       stop_all_tracing();
+       if (!is_trace_stopped()) {
+               trace_stopped = 1;
+               stop_all_threads();
+               stop_all_tracing();
+       }
+
        exit(status);
 }
 
@@ -546,13 +1189,14 @@ static int open_devices(void)
        struct device_information *dip;
        int i;
 
-       for (dip = device_information, i = 0; i < ndevs; i++, dip++) {
-               dip->fd = open(dip->path, O_RDONLY);
+       for_each_dip(dip, i) {
+               dip->fd = open(dip->path, O_RDONLY | O_NONBLOCK);
                if (dip->fd < 0) {
                        perror(dip->path);
                        return 1;
                }
        }
+
        return 0;
 }
 
@@ -568,7 +1212,7 @@ static int start_devices(void)
                return 1;
        }
 
-       for (dip = device_information, i = 0; i < ndevs; i++, dip++) {
+       for_each_dip(dip, i) {
                if (start_trace(dip)) {
                        close(dip->fd);
                        fprintf(stderr, "Failed to start trace on %s\n",
@@ -576,24 +1220,28 @@ static int start_devices(void)
                        break;
                }
        }
+
        if (i != ndevs) {
-               for (dip = device_information, j = 0; j < i; j++, dip++)
+               __for_each_dip(dip, j, i)
                        stop_trace(dip);
+
                return 1;
        }
 
-       for (dip = device_information, i = 0; i < ndevs; i++, dip++) {
+       for_each_dip(dip, i) {
                dip->threads = thread_information + (i * ncpus);
                if (start_threads(dip)) {
                        fprintf(stderr, "Failed to start worker threads\n");
                        break;
                }
        }
+
        if (i != ndevs) {
-               for (dip = device_information, j = 0; j < i; j++, dip++)
+               __for_each_dip(dip, j, i)
                        stop_threads(dip);
-               for (dip = device_information, i = 0; i < ndevs; i++, dip++)
+               for_each_dip(dip, i)
                        stop_trace(dip);
+
                return 1;
        }
 
@@ -602,24 +1250,303 @@ static int start_devices(void)
 
 static void show_stats(void)
 {
-       int i, j;
        struct device_information *dip;
        struct thread_information *tip;
-       unsigned long long events_processed;
+       unsigned long long events_processed, data_read;
+       unsigned long total_drops;
+       int i, j, no_stdout = 0;
 
-       if (output_name && !strcmp(output_name, "-"))
+       if (is_stat_shown())
                return;
 
-       for (dip = device_information, i = 0; i < ndevs; i++, dip++) {
-               printf("Device: %s\n", dip->path);
+       if (output_name && !strcmp(output_name, "-"))
+               no_stdout = 1;
+
+       stat_shown = 1;
+
+       total_drops = 0;
+       for_each_dip(dip, i) {
+               if (!no_stdout)
+                       printf("Device: %s\n", dip->path);
                events_processed = 0;
-               for (tip = dip->threads, j = 0; j < ncpus; j++, tip++) {
-                       printf("  CPU%3d: %20ld events\n",
-                              tip->cpu, tip->events_processed);
+               data_read = 0;
+               for_each_tip(dip, tip, j) {
+                       if (!no_stdout)
+                               printf("  CPU%3d: %20lu events, %8llu KiB data\n",
+                                       tip->cpu, tip->events_processed,
+                                       (tip->data_read + 1023) >> 10);
                        events_processed += tip->events_processed;
+                       data_read += tip->data_read;
+               }
+               total_drops += dip->drop_count;
+               if (!no_stdout)
+                       printf("  Total:  %20llu events (dropped %lu), %8llu KiB data\n",
+                                       events_processed, dip->drop_count,
+                                       (data_read + 1023) >> 10);
+       }
+
+       if (total_drops)
+               fprintf(stderr, "You have dropped events, consider using a larger buffer size (-b)\n");
+}
+
+/*
+ * Find the device called buts_name, or create it on first sight. A new
+ * device gets ncpus thread entries, each tagged with cl_in_addr and with
+ * its output opened through tip_open_output().
+ *
+ * Returns the device, or NULL if the name does not fit, memory runs out,
+ * or an output could not be opened.
+ */
+static struct device_information *net_get_dip(char *buts_name,
+                                             struct in_addr *cl_in_addr)
+{
+       struct device_information *dip, *devs;
+       int i;
+
+       for (i = 0; i < ndevs; i++) {
+               dip = &device_information[i];
+
+               if (!strcmp(dip->buts_name, buts_name))
+                       return dip;
+       }
+
+       /*
+        * The name comes out of a received header, so refuse anything
+        * that would overrun our fixed size copy of it
+        */
+       if (strlen(buts_name) >= sizeof(dip->buts_name)) {
+               fprintf(stderr, "server: device name too long\n");
+               return NULL;
+       }
+
+       /*
+        * Grow via a temporary, so the existing table isn't lost should
+        * realloc() fail
+        */
+       devs = realloc(device_information, (ndevs + 1) * sizeof(*dip));
+       if (!devs) {
+               perror("realloc");
+               return NULL;
+       }
+       device_information = devs;
+
+       dip = &device_information[ndevs];
+       memset(dip, 0, sizeof(*dip));
+       dip->fd = -1;
+       strcpy(dip->buts_name, buts_name);
+       dip->path = strdup(buts_name);
+       dip->threads = calloc(ncpus, sizeof(struct thread_information));
+       if (!dip->path || !dip->threads) {
+               perror("malloc");
+               free(dip->path);
+               free(dip->threads);
+               return NULL;
+       }
+
+       /*
+        * Only a fully allocated entry becomes visible in the table
+        */
+       ndevs++;
+
+       /*
+        * open all files
+        */
+       for (i = 0; i < ncpus; i++) {
+               struct thread_information *tip = &dip->threads[i];
+
+               tip->cpu = i;
+               tip->device = dip;
+               tip->fd = -1;
+               tip->pfd = -1;
+               tip->cl_in_addr = *cl_in_addr;
+
+               if (tip_open_output(dip, tip))
+                       return NULL;
+       }
+
+       return dip;
+}
+
+/*
+ * Map a received header to the per-CPU thread entry it belongs to,
+ * creating the device on first use.
+ *
+ * Returns NULL if the header names a CPU outside the range it advertises
+ * itself, or if the device could not be set up.
+ */
+static struct thread_information *net_get_tip(struct blktrace_net_hdr *bnh,
+                                             struct in_addr *cl_in_addr)
+{
+       struct device_information *dip;
+
+       /*
+        * net_get_dip() sizes a new device's thread array from ncpus
+        */
+       ncpus = bnh->max_cpus;
+
+       /*
+        * cpu indexes that array and comes straight off the wire
+        */
+       if (ncpus <= 0 || bnh->cpu >= (unsigned int) ncpus) {
+               fprintf(stderr, "server: bad cpu %u (max %d)\n",
+                       (unsigned int) bnh->cpu, (int) ncpus);
+               return NULL;
+       }
+
+       dip = net_get_dip(bnh->buts_name, cl_in_addr);
+       if (!dip)
+               return NULL;
+
+       return &dip->threads[bnh->cpu];
+}
+
+/*
+ * Read one complete blktrace_net_hdr from net_in_fd into bnh. The socket
+ * is switched to non-blocking for the duration, so is_done() gets checked
+ * between receive attempts.
+ *
+ * Returns 0 once the whole header has arrived. Returns non-zero on a
+ * receive error, if the peer closed the connection, or if is_done() became
+ * true before the header was complete.
+ */
+static int net_get_header(struct blktrace_net_hdr *bnh)
+{
+       int fl = fcntl(net_in_fd, F_GETFL);
+       int bytes_left, ret;
+       char *p = (char *) bnh;
+
+       if (fl < 0) {
+               perror("fcntl");
+               return 1;
+       }
+
+       fcntl(net_in_fd, F_SETFL, fl | O_NONBLOCK);
+       bytes_left = sizeof(*bnh);
+       while (bytes_left && !is_done()) {
+               ret = recv(net_in_fd, p, bytes_left, MSG_WAITALL);
+               if (ret < 0) {
+                       if (errno != EAGAIN && errno != EINTR) {
+                               perror("recv header");
+                               break;
+                       }
+                       usleep(100);
+                       continue;
+               } else if (!ret) {
+                       /*
+                        * recv() returning 0 means the peer shut the
+                        * connection down, polling on would never end
+                        */
+                       fprintf(stderr, "server: connection closed\n");
+                       break;
+               }
+
+               p += ret;
+               bytes_left -= ret;
+       }
+
+       /*
+        * Every exit from the loop comes through here, so the socket is
+        * never left non-blocking. bytes_left is still non-zero after an
+        * error or a close, which the caller treats as failure.
+        */
+       fcntl(net_in_fd, F_SETFL, fl & ~O_NONBLOCK);
+       return bytes_left;
+}
+
+/*
+ * Receive one header from the client and pass the bnh.len bytes that
+ * follow it to mmap_subbuf() for the thread entry the header names.
+ *
+ * Returns 0 if a chunk was handled. Returns 1 on any error, and also when
+ * the client signalled end-of-run, so the caller leaves its receive loop.
+ */
+static int net_server_loop(struct in_addr *cl_in_addr)
+{
+       struct thread_information *tip;
+       struct blktrace_net_hdr bnh;
+
+       if (net_get_header(&bnh))
+               return 1;
+
+       /*
+        * Byte order is only probed while data_is_native is still -1
+        * (presumably "not determined yet" - confirm against its users)
+        */
+       if (data_is_native == -1 && check_data_endianness(bnh.magic)) {
+               fprintf(stderr, "server: received data is bad\n");
+               return 1;
+       }
+
+       /*
+        * max_cpus is used by net_get_tip() as the CPU count, so it needs
+        * the same conversion as cpu and len
+        */
+       if (!data_is_native) {
+               bnh.magic = be32_to_cpu(bnh.magic);
+               bnh.cpu = be32_to_cpu(bnh.cpu);
+               bnh.max_cpus = be32_to_cpu(bnh.max_cpus);
+               bnh.len = be32_to_cpu(bnh.len);
+       }
+
+       if ((bnh.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
+               fprintf(stderr, "server: bad data magic\n");
+               return 1;
+       }
+
+       /*
+        * len == 0 means that the other end signalled end-of-run
+        */
+       if (!bnh.len) {
+               fprintf(stderr, "server: end of run\n");
+               return 1;
+       }
+
+       tip = net_get_tip(&bnh, cl_in_addr);
+       if (!tip)
+               return 1;
+
+       if (mmap_subbuf(tip, bnh.len))
+               return 1;
+
+       return 0;
+}
+
+/*
+ * Start here when we are in server mode - just fetch data from the network
+ * and dump to files
+ */
+static int net_server(void)
+{
+       struct device_information *dip;
+       struct thread_information *tip;
+       struct sockaddr_in addr;
+       socklen_t socklen;
+       int fd, opt, i, j;
+
+       fd = socket(AF_INET, SOCK_STREAM, 0);
+       if (fd < 0) {
+               perror("server: socket");
+               return 1;
+       }
+
+       opt = 1;
+       if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0) {
+               perror("setsockopt");
+               close(fd);
+               return 1;
+       }
+
+       memset(&addr, 0, sizeof(addr));
+       addr.sin_family = AF_INET;
+       addr.sin_addr.s_addr = htonl(INADDR_ANY);
+       addr.sin_port = htons(net_port);
+
+       if (bind(fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
+               perror("bind");
+               close(fd);
+               return 1;
+       }
+
+       if (listen(fd, 1) < 0) {
+               perror("listen");
+               close(fd);
+               return 1;
+       }
+
+repeat:
+       /*
+        * Back to default dispositions while we sit in accept(), so a
+        * signal terminates an idle server. SIG_DFL is the portable way
+        * to ask for that, a NULL handler is not.
+        */
+       signal(SIGINT, SIG_DFL);
+       signal(SIGHUP, SIG_DFL);
+       signal(SIGTERM, SIG_DFL);
+       signal(SIGALRM, SIG_DFL);
+
+       printf("blktrace: waiting for incoming connection...\n");
+
+       socklen = sizeof(addr);
+       net_in_fd = accept(fd, (struct sockaddr *) &addr, &socklen);
+       if (net_in_fd < 0) {
+               perror("accept");
+               close(fd);
+               return 1;
+       }
+
+       /*
+        * With a client connected, signals go through handle_sigint
+        * instead, and the receive loop below ends via is_done()
+        */
+       signal(SIGINT, handle_sigint);
+       signal(SIGHUP, handle_sigint);
+       signal(SIGTERM, handle_sigint);
+       signal(SIGALRM, handle_sigint);
+
+       printf("blktrace: connection from %s\n", inet_ntoa(addr.sin_addr));
+
+       while (!is_done()) {
+               if (net_server_loop(&addr.sin_addr))
+                       break;
+       }
+
+       for_each_dip(dip, i)
+               for_each_tip(dip, tip, j)
+                       tip_ftrunc_final(tip);
+
+       show_stats();
+
+       if (is_done())
+               return 0;
+
+       /*
+        * cleanup for next run
+        */
+       for_each_dip(dip, i) {
+               for_each_tip(dip, tip, j) {
+                       /*
+                        * entries past a failed open never got a stream
+                        */
+                       if (tip->ofile)
+                               fclose(tip->ofile);
+               }
+
+               free(dip->threads);
+               free(dip->path);
+       }
+
+       free(device_information);
+       device_information = NULL;
+       ncpus = ndevs = 0;
+
+       close(net_in_fd);
+       net_in_fd = -1;
+       stat_shown = 0;
+       goto repeat;
+}
+
+/*
+ * Setup outgoing network connection where we will transmit data
+ */
+static int net_setup_client(void)
+{
+       struct sockaddr_in addr;
+       int fd;
+
+       fd = socket(AF_INET, SOCK_STREAM, 0);
+       if (fd < 0) {
+               perror("client: socket");
+               return 1;
+       }
+
+       memset(&addr, 0, sizeof(addr));
+       addr.sin_family = AF_INET;
+       addr.sin_port = htons(net_port);
+
+       /*
+        * hostname may be a dotted quad or a name, only the latter needs
+        * a resolver lookup
+        */
+       if (inet_aton(hostname, &addr.sin_addr) != 1) {
+               struct hostent *hent = gethostbyname(hostname);
+               if (!hent) {
+                       /*
+                        * gethostbyname() reports through h_errno, not
+                        * errno, so perror() would print the wrong reason
+                        */
+                       herror("gethostbyname");
+                       close(fd);
+                       return 1;
                }
-               printf("  Total:  %20lld events\n", events_processed);
+
+               memcpy(&addr.sin_addr, hent->h_addr, 4);
+               /*
+                * NOTE(review): hostname's capacity isn't visible from
+                * here, confirm the canonical name always fits
+                */
+               strcpy(hostname, hent->h_name);
        }
+
+       printf("blktrace: connecting to %s\n", hostname);
+
+       if (connect(fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
+               perror("client: connect");
+               close(fd);
+               return 1;
+       }
+
+       printf("blktrace: connected!\n");
+       net_out_fd = fd;
+       return 0;
 }
 
 static char usage_str[] = \
@@ -635,18 +1562,17 @@ static char usage_str[] = \
        "\t-A Give trace mask as a single value. See documentation\n" \
        "\t-b Sub buffer size in KiB\n" \
        "\t-n Number of sub buffers\n" \
-       "\t-v Print program version info\n\n";
+       "\t-l Run in network listen mode (blktrace server)\n" \
+       "\t-h Run in network client mode, connecting to the given host\n" \
+       "\t-p Network port to use (default 8462)\n" \
+       "\t-s Make the network client use sendfile() to transfer data\n" \
+       "\t-V Print program version info\n\n";
 
 static void show_usage(char *program)
 {
        fprintf(stderr, "Usage: %s %s %s",program, blktrace_version, usage_str);
 }
 
-static void handle_sigint(__attribute__((__unused__)) int sig)
-{
-       done = 1;
-}
-
 int main(int argc, char *argv[])
 {
        static char default_relay_path[] = "/relay";
@@ -702,35 +1628,55 @@ int main(int argc, char *argv[])
                                return 1;
                        }
                        break;
-               case 'v':
+               case 'V':
                        printf("%s version %s\n", argv[0], blktrace_version);
                        return 0;
                case 'b':
-                       buf_size = atoi(optarg);
-                       if (buf_size <= 0) {
+                       buf_size = strtoul(optarg, NULL, 10);
+                       if (buf_size <= 0 || buf_size > 16*1024) {
                                fprintf(stderr,
-                                       "Invalid buffer size (%d)\n", buf_size);
+                                       "Invalid buffer size (%lu)\n",buf_size);
                                return 1;
                        }
                        buf_size <<= 10;
                        break;
                case 'n':
-                       buf_nr = atoi(optarg);
+                       buf_nr = strtoul(optarg, NULL, 10);
                        if (buf_nr <= 0) {
                                fprintf(stderr,
-                                       "Invalid buffer nr (%d)\n", buf_nr);
+                                       "Invalid buffer nr (%lu)\n", buf_nr);
                                return 1;
                        }
                        break;
                case 'D':
                        output_dir = optarg;
                        break;
+               case 'h':
+                       net_mode = Net_client;
+                       strcpy(hostname, optarg);
+                       break;
+               case 'l':
+                       net_mode = Net_server;
+                       break;
+               case 'p':
+                       net_port = atoi(optarg);
+                       break;
+               case 's':
+                       net_use_sendfile = 1;
+                       break;
                default:
                        show_usage(argv[0]);
                        return 1;
                }
        }
 
+       setlocale(LC_NUMERIC, "en_US");
+
+       page_size = getpagesize();
+
+       if (net_mode == Net_server)
+               return net_server();
+
        while (optind < argc) {
                if (resize_devices(argv[optind++]) != 0)
                        return 1;
@@ -752,7 +1698,7 @@ int main(int argc, char *argv[])
                fprintf(stderr,"%s does not appear to be a valid path\n",
                        relay_path);
                return 1;
-       } else if (st.f_type != RELAYFS_TYPE) {
+       } else if (st.f_type != (long) RELAYFS_TYPE) {
                fprintf(stderr,"%s does not appear to be a relay filesystem\n",
                        relay_path);
                return 1;
@@ -766,32 +1712,36 @@ int main(int argc, char *argv[])
                return 0;
        }
 
-       setlocale(LC_NUMERIC, "en_US");
-
        ncpus = sysconf(_SC_NPROCESSORS_ONLN);
        if (ncpus < 0) {
                fprintf(stderr, "sysconf(_SC_NPROCESSORS_ONLN) failed\n");
                return 1;
        }
 
-       if (start_devices() != 0)
-               return 1;
-
        signal(SIGINT, handle_sigint);
        signal(SIGHUP, handle_sigint);
        signal(SIGTERM, handle_sigint);
        signal(SIGALRM, handle_sigint);
 
+       if (net_mode == Net_client && net_setup_client())
+               return 1;
+
+       if (start_devices() != 0)
+               return 1;
+
        atexit(stop_all_tracing);
 
        if (stop_watch)
                alarm(stop_watch);
 
-       while (!is_done())
-               sleep(1);
+       wait_for_threads();
+
+       if (!is_trace_stopped()) {
+               trace_stopped = 1;
+               stop_all_threads();
+               stop_all_traces();
+       }
 
-       stop_all_threads();
-       stop_all_traces();
        show_stats();
 
        return 0;