blktrace: don't stop tracer if not setup trace successfully
[blktrace.git] / blktrace.c
index 29e55b84e6e87d6a95462657b4827f0943d77c16..d0d271f9add5106b4481321058787f1ba1aa82a4 100644 (file)
@@ -2,6 +2,10 @@
  * block queue tracing application
  *
  * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
+ * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
+ *
+ * Rewrite to have a single thread per CPU (managing all devices on that CPU)
+ *     Alan D. Brunelle <alan.brunelle@hp.com> - January 2009
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  */
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/stat.h>
+
+#include <errno.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <sched.h>
 #include <unistd.h>
-#include <locale.h>
+#include <poll.h>
 #include <signal.h>
-#include <fcntl.h>
-#include <string.h>
+#include <pthread.h>
+#include <locale.h>
 #include <sys/ioctl.h>
-#include <sys/param.h>
-#include <sys/statfs.h>
-#include <sys/poll.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/vfs.h>
 #include <sys/mman.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/resource.h>
 #include <sys/socket.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sched.h>
-#include <ctype.h>
-#include <getopt.h>
-#include <errno.h>
 #include <netinet/in.h>
 #include <arpa/inet.h>
 #include <netdb.h>
 #include <sys/sendfile.h>
 
+#include "btt/list.h"
 #include "blktrace.h"
-#include "barrier.h"
-
-static char blktrace_version[] = "0.99";
 
 /*
  * You may want to increase this even more, if you are logging at a high
  * rate and see skipped/missed events
  */
-#define BUF_SIZE       (512 * 1024)
-#define BUF_NR         (4)
+#define BUF_SIZE               (512 * 1024)
+#define BUF_NR                 (4)
+
+#define FILE_VBUF_SIZE         (128 * 1024)
+
+#define DEBUGFS_TYPE           (0x64626720)
+#define TRACE_NET_PORT         (8462)
+
+enum {
+       Net_none = 0,
+       Net_server,
+       Net_client,
+};
+
+enum thread_status {
+       Th_running,
+       Th_leaving,
+       Th_error
+};
+
+/*
+ * Generic stats collected: nevents can be _roughly_ estimated by data_read
+ * (discounting pdu...)
+ *
+ * These fields are updated w/ pdc_dr_update & pdc_nev_update below.
+ */
+struct pdc_stats {
+       unsigned long long data_read;
+       unsigned long long nevents;
+};
+
+struct devpath {
+       struct list_head head;
+       char *path;                     /* path to device special file */
+       char *buts_name;                /* name returned from bt kernel code */
+       struct pdc_stats *stats;
+       int fd, ncpus;
+       unsigned long long drops;
+
+       /*
+        * For piped output only:
+        *
+        * Each tracer will have a tracer_devpath_head that it will add new
+        * data onto. It's list is protected above (tracer_devpath_head.mutex)
+        * and it will signal the processing thread using the dp_cond,
+        * dp_mutex & dp_entries variables above.
+        */
+       struct tracer_devpath_head *heads;
+
+       /*
+        * For network server mode only:
+        */
+       struct cl_host *ch;
+       u32 cl_id;
+       time_t cl_connect_time;
+       int setup_done; /* ioctl BLKTRACESETUP done */
+       struct io_info *ios;
+};
+
+/*
+ * For piped output to stdout we will have each tracer thread (one per dev)
+ * tack buffers read from the relay queues on a per-device list.
+ *
+ * The main thread will then collect trace buffers from each of lists in turn.
+ *
+ * We will use a mutex to guard each of the trace_buf list. The tracers
+ * can then signal the main thread using <dp_cond,dp_mutex> and
+ * dp_entries. (When dp_entries is 0, and a tracer adds an entry it will
+ * signal. When dp_entries is 0, the main thread will wait for that condition
+ * to be signalled.)
+ *
+ * adb: It may be better just to have a large buffer per tracer per dev,
+ * and then use it as a ring-buffer. This would certainly cut down a lot
+ * of malloc/free thrashing, at the cost of more memory movements (potentially).
+ */
+struct trace_buf {
+       struct list_head head;
+       struct devpath *dpp;
+       void *buf;
+       int cpu, len;
+};
+
+struct tracer_devpath_head {
+       pthread_mutex_t mutex;
+       struct list_head head;
+       struct trace_buf *prev;
+};
+
+/*
+ * Used to handle the mmap() interfaces for output file (containing traces)
+ */
+struct mmap_info {
+       void *fs_buf;
+       unsigned long long fs_size, fs_max_size, fs_off, fs_buf_len;
+       unsigned long buf_size, buf_nr;
+       int pagesize;
+};
+
+/*
+ * Each thread doing work on a (client) side of blktrace will have one
+ * of these. The ios array contains input/output information, pfds holds
+ * poll() data. The volatile's provide flags to/from the main executing
+ * thread.
+ */
+struct tracer {
+       struct list_head head;
+       struct io_info *ios;
+       struct pollfd *pfds;
+       pthread_t thread;
+       int cpu, nios;
+       volatile int status, is_done;
+};
+
+/*
+ * networking stuff follows. we include a magic number so we know whether
+ * to endianness convert or not.
+ *
+ * The len field is overloaded:
+ *     0 - Indicates an "open" - allowing the server to set up for a dev/cpu
+ *     1 - Indicates a "close" - Shut down connection orderly
+ *
+ * The cpu field is overloaded on close: it will contain the number of drops.
+ */
+struct blktrace_net_hdr {
+       u32 magic;              /* same as trace magic */
+       char buts_name[32];     /* trace name */
+       u32 cpu;                /* for which cpu */
+       u32 max_cpus;
+       u32 len;                /* length of following trace data */
+       u32 cl_id;              /* id for set of client per-cpu connections */
+       u32 buf_size;           /* client buf_size for this trace  */
+       u32 buf_nr;             /* client buf_nr for this trace  */
+       u32 page_size;          /* client page_size for this trace  */
+};
+
+/*
+ * Each host encountered has one of these. The head is used to link this
+ * on to the network server's ch_list. Connections associated with this
+ * host are linked on conn_list, and any devices traced on that host
+ * are connected on the devpaths list.
+ */
+struct cl_host {
+       struct list_head head;
+       struct list_head conn_list;
+       struct list_head devpaths;
+       struct net_server_s *ns;
+       char *hostname;
+       struct in_addr cl_in_addr;
+       int connects, ndevs, cl_opens;
+};
+
+/*
+ * Each connection (client to server socket ('fd')) has one of these. A
+ * back reference to the host ('ch'), and lists headers (for the host
+ * list, and the network server conn_list) are also included.
+ */
+struct cl_conn {
+       struct list_head ch_head, ns_head;
+       struct cl_host *ch;
+       int fd, ncpus;
+       time_t connect_time;
+};
+
+/*
+ * The network server requires some poll structures to be maintained -
+ * one per conection currently on conn_list. The nchs/ch_list values
+ * are for each host connected to this server. The addr field is used
+ * for scratch as new connections are established.
+ */
+struct net_server_s {
+       struct list_head conn_list;
+       struct list_head ch_list;
+       struct pollfd *pfds;
+       int listen_fd, connects, nchs;
+       struct sockaddr_in addr;
+};
+
+/*
+ * This structure is (generically) used to providide information
+ * for a read-to-write set of values.
+ *
+ * ifn & ifd represent input information
+ *
+ * ofn, ofd, ofp, obuf & mmap_info are used for output file (optionally).
+ */
+struct io_info {
+       struct devpath *dpp;
+       FILE *ofp;
+       char *obuf;
+       struct cl_conn *nc;     /* Server network connection */
+
+       /*
+        * mmap controlled output files
+        */
+       struct mmap_info mmap_info;
+
+       /*
+        * Client network fields
+        */
+       unsigned int ready;
+       unsigned long long data_queued;
+
+       /*
+        * Input/output file descriptors & names
+        */
+       int ifd, ofd;
+       char ifn[MAXPATHLEN + 64];
+       char ofn[MAXPATHLEN + 64];
+};
+
+static char blktrace_version[] = "2.0.0";
+
+/*
+ * Linkage to blktrace helper routines (trace conversions)
+ */
+int data_is_native = -1;
+
+static int ndevs;
+static int max_cpus;
+static int ncpus;
+static cpu_set_t *online_cpus;
+static int pagesize;
+static int act_mask = ~0U;
+static int kill_running_trace;
+static int stop_watch;
+static int piped_output;
+
+static char *debugfs_path = "/sys/kernel/debug";
+static char *output_name;
+static char *output_dir;
+
+static unsigned long buf_size = BUF_SIZE;
+static unsigned long buf_nr = BUF_NR;
+
+static FILE *pfp;
+
+static LIST_HEAD(devpaths);
+static LIST_HEAD(tracers);
+
+static volatile int done;
+
+/*
+ * tracer threads add entries, the main thread takes them off and processes
+ * them. These protect the dp_entries variable.
+ */
+static pthread_cond_t dp_cond = PTHREAD_COND_INITIALIZER;
+static pthread_mutex_t dp_mutex = PTHREAD_MUTEX_INITIALIZER;
+static volatile int dp_entries;
 
-#define OFILE_BUF      (128 * 1024)
+/*
+ * These synchronize master / thread interactions.
+ */
+static pthread_cond_t mt_cond = PTHREAD_COND_INITIALIZER;
+static pthread_mutex_t mt_mutex = PTHREAD_MUTEX_INITIALIZER;
+static volatile int nthreads_running;
+static volatile int nthreads_leaving;
+static volatile int nthreads_error;
+static volatile int tracers_run;
+
+/*
+ * network cmd line params
+ */
+static struct sockaddr_in hostname_addr;
+static char hostname[MAXHOSTNAMELEN];
+static int net_port = TRACE_NET_PORT;
+static int net_use_sendfile = 1;
+static int net_mode;
+static int *cl_fds;
 
-#define RELAYFS_TYPE   0xF0B4A981
+static int (*handle_pfds)(struct tracer *, int, int);
+static int (*handle_list)(struct tracer_devpath_head *, struct list_head *);
 
-#define S_OPTS "d:a:A:r:o:kw:Vb:n:D:lh:p:s"
+#define S_OPTS "d:a:A:r:o:kw:vVb:n:D:lh:p:sI:"
 static struct option l_opts[] = {
        {
                .name = "dev",
@@ -67,6 +337,12 @@ static struct option l_opts[] = {
                .flag = NULL,
                .val = 'd'
        },
+       {
+               .name = "input-devs",
+               .has_arg = required_argument,
+               .flag = NULL,
+               .val = 'I'
+       },
        {
                .name = "act-mask",
                .has_arg = required_argument,
@@ -103,6 +379,12 @@ static struct option l_opts[] = {
                .flag = NULL,
                .val = 'w'
        },
+       {
+               .name = "version",
+               .has_arg = no_argument,
+               .flag = NULL,
+               .val = 'v'
+       },
        {
                .name = "version",
                .has_arg = no_argument,
@@ -146,7 +428,7 @@ static struct option l_opts[] = {
                .val = 'p'
        },
        {
-               .name = "sendfile",
+               .name = "no-sendfile",
                .has_arg = no_argument,
                .flag = NULL,
                .val = 's'
@@ -156,1403 +438,1672 @@ static struct option l_opts[] = {
        }
 };
 
-struct tip_subbuf {
-       void *buf;
-       unsigned int len;
-       unsigned int max_len;
-};
+static char usage_str[] = "\n\n" \
+       "-d <dev>             | --dev=<dev>\n" \
+        "[ -r <debugfs path>  | --relay=<debugfs path> ]\n" \
+        "[ -o <file>          | --output=<file>]\n" \
+        "[ -D <dir>           | --output-dir=<dir>\n" \
+        "[ -w <time>          | --stopwatch=<time>]\n" \
+        "[ -a <action field>  | --act-mask=<action field>]\n" \
+        "[ -A <action mask>   | --set-mask=<action mask>]\n" \
+        "[ -b <size>          | --buffer-size]\n" \
+        "[ -n <number>        | --num-sub-buffers=<number>]\n" \
+        "[ -l                 | --listen]\n" \
+        "[ -h <hostname>      | --host=<hostname>]\n" \
+        "[ -p <port number>   | --port=<port number>]\n" \
+        "[ -s                 | --no-sendfile]\n" \
+        "[ -I <devs file>     | --input-devs=<devs file>]\n" \
+        "[ -v <version>       | --version]\n" \
+        "[ -V <version>       | --version]\n" \
 
-#define FIFO_SIZE      (1024)  /* should be plenty big! */
-#define CL_SIZE                (128)   /* cache line, any bigger? */
+       "\t-d Use specified device. May also be given last after options\n" \
+       "\t-r Path to mounted debugfs, defaults to /sys/kernel/debug\n" \
+       "\t-o File(s) to send output to\n" \
+       "\t-D Directory to prepend to output file names\n" \
+       "\t-w Stop after defined time, in seconds\n" \
+       "\t-a Only trace specified actions. See documentation\n" \
+       "\t-A Give trace mask as a single value. See documentation\n" \
+       "\t-b Sub buffer size in KiB (default 512)\n" \
+       "\t-n Number of sub buffers (default 4)\n" \
+       "\t-l Run in network listen mode (blktrace server)\n" \
+       "\t-h Run in network client mode, connecting to the given host\n" \
+       "\t-p Network port to use (default 8462)\n" \
+       "\t-s Make the network client NOT use sendfile() to transfer data\n" \
+       "\t-I Add devices found in <devs file>\n" \
+       "\t-v Print program version info\n" \
+       "\t-V Print program version info\n\n";
 
-struct tip_subbuf_fifo {
-       int tail __attribute__((aligned(CL_SIZE)));
-       int head __attribute__((aligned(CL_SIZE)));
-       struct tip_subbuf *q[FIFO_SIZE];
-};
+static void clear_events(struct pollfd *pfd)
+{
+       pfd->events = 0;
+       pfd->revents = 0;
+}
 
-struct thread_information {
-       int cpu;
-       pthread_t thread;
+static inline int net_client_use_sendfile(void)
+{
+       return net_mode == Net_client && net_use_sendfile;
+}
 
-       int fd;
-       void *fd_buf;
-       char fn[MAXPATHLEN + 64];
+static inline int net_client_use_send(void)
+{
+       return net_mode == Net_client && !net_use_sendfile;
+}
 
-       struct in_addr cl_in_addr;
+static inline int use_tracer_devpaths(void)
+{
+       return piped_output || net_client_use_send();
+}
 
-       FILE *ofile;
-       char *ofile_buffer;
-       off_t ofile_offset;
-       int ofile_stdout;
-       int ofile_mmap;
+static inline int in_addr_eq(struct in_addr a, struct in_addr b)
+{
+       return a.s_addr == b.s_addr;
+}
 
-       int (*get_subbuf)(struct thread_information *, unsigned int);
-       int (*flush_subbuf)(struct thread_information *, struct tip_subbuf *);
-       int (*read_data)(struct thread_information *, void *, unsigned int);
+static inline void pdc_dr_update(struct devpath *dpp, int cpu, int data_read)
+{
+       dpp->stats[cpu].data_read += data_read;
+}
 
-       unsigned long events_processed;
-       unsigned long long data_read;
-       unsigned long long data_queued;
-       struct device_information *device;
+static inline void pdc_nev_update(struct devpath *dpp, int cpu, int nevents)
+{
+       dpp->stats[cpu].nevents += nevents;
+}
 
-       int exited;
+static void show_usage(char *prog)
+{
+       fprintf(stderr, "Usage: %s %s", prog, usage_str);
+}
 
-       /*
-        * piped fifo buffers
-        */
-       struct tip_subbuf_fifo fifo;
-       struct tip_subbuf *leftover_ts;
+/*
+ * Create a timespec 'msec' milliseconds into the future
+ */
+static inline void make_timespec(struct timespec *tsp, long delta_msec)
+{
+       struct timeval now;
 
-       /*
-        * mmap controlled output files
-        */
-       unsigned long long fs_size;
-       unsigned long long fs_max_size;
-       unsigned long fs_off;
-       void *fs_buf;
-       unsigned long fs_buf_len;
-};
+       gettimeofday(&now, NULL);
+       tsp->tv_sec = now.tv_sec;
+       tsp->tv_nsec = 1000L * now.tv_usec;
 
-struct device_information {
-       int fd;
-       char *path;
-       char buts_name[32];
-       volatile int trace_started;
-       unsigned long drop_count;
-       struct thread_information *threads;
-};
+       tsp->tv_nsec += (delta_msec * 1000000L);
+       if (tsp->tv_nsec > 1000000000L) {
+               long secs = tsp->tv_nsec / 1000000000L;
 
-static int ncpus;
-static struct thread_information *thread_information;
-static int ndevs;
-static struct device_information *device_information;
+               tsp->tv_sec += secs;
+               tsp->tv_nsec -= (secs * 1000000000L);
+       }
+}
 
-/* command line option globals */
-static char *relay_path;
-static char *output_name;
-static char *output_dir;
-static int act_mask = ~0U;
-static int kill_running_trace;
-static unsigned long buf_size = BUF_SIZE;
-static unsigned long buf_nr = BUF_NR;
-static unsigned int page_size;
+/*
+ * Add a timer to ensure wait ends
+ */
+static void t_pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex)
+{
+       struct timespec ts;
 
-#define is_done()      (*(volatile int *)(&done))
-static volatile int done;
+       make_timespec(&ts, 50);
+       pthread_cond_timedwait(cond, mutex, &ts);
+}
 
-#define is_trace_stopped()     (*(volatile int *)(&trace_stopped))
-static volatile int trace_stopped;
+static void unblock_tracers(void)
+{
+       pthread_mutex_lock(&mt_mutex);
+       tracers_run = 1;
+       pthread_cond_broadcast(&mt_cond);
+       pthread_mutex_unlock(&mt_mutex);
+}
 
-#define is_stat_shown()        (*(volatile int *)(&stat_shown))
-static volatile int stat_shown;
+static void tracer_wait_unblock(struct tracer *tp)
+{
+       pthread_mutex_lock(&mt_mutex);
+       while (!tp->is_done && !tracers_run)
+               pthread_cond_wait(&mt_cond, &mt_mutex);
+       pthread_mutex_unlock(&mt_mutex);
+}
 
-int data_is_native = -1;
+static void tracer_signal_ready(struct tracer *tp,
+                               enum thread_status th_status,
+                               int status)
+{
+       pthread_mutex_lock(&mt_mutex);
+       tp->status = status;
 
-static void exit_trace(int status);
+       if (th_status == Th_running)
+               nthreads_running++;
+       else if (th_status == Th_error)
+               nthreads_error++;
+       else
+               nthreads_leaving++;
 
-#define dip_tracing(dip)       (*(volatile int *)(&(dip)->trace_started))
-#define dip_set_tracing(dip, v)        ((dip)->trace_started = (v))
+       pthread_cond_signal(&mt_cond);
+       pthread_mutex_unlock(&mt_mutex);
+}
 
-#define __for_each_dip(__d, __i, __e)  \
-       for (__i = 0, __d = device_information; __i < __e; __i++, __d++)
+static void wait_tracers_ready(int ncpus_started)
+{
+       pthread_mutex_lock(&mt_mutex);
+       while ((nthreads_running + nthreads_error) < ncpus_started)
+               t_pthread_cond_wait(&mt_cond, &mt_mutex);
+       pthread_mutex_unlock(&mt_mutex);
+}
 
-#define for_each_dip(__d, __i) __for_each_dip(__d, __i, ndevs)
-#define for_each_tip(__d, __t, __j)    \
-       for (__j = 0, __t = (__d)->threads; __j < ncpus; __j++, __t++)
+static void wait_tracers_leaving(void)
+{
+       pthread_mutex_lock(&mt_mutex);
+       while (nthreads_leaving < nthreads_running)
+               t_pthread_cond_wait(&mt_cond, &mt_mutex);
+       pthread_mutex_unlock(&mt_mutex);
+}
 
-/*
- * networking stuff follows. we include a magic number so we know whether
- * to endianness convert or not
- */
-struct blktrace_net_hdr {
-       u32 magic;              /* same as trace magic */
-       char buts_name[32];     /* trace name */
-       u32 cpu;                /* for which cpu */
-       u32 max_cpus;
-       u32 len;                /* length of following trace data */
-};
+static void init_mmap_info(struct mmap_info *mip)
+{
+       mip->buf_size = buf_size;
+       mip->buf_nr = buf_nr;
+       mip->pagesize = pagesize;
+}
 
-#define TRACE_NET_PORT         (8462)
+static void net_close_connection(int *fd)
+{
+       shutdown(*fd, SHUT_RDWR);
+       close(*fd);
+       *fd = -1;
+}
 
-enum {
-       Net_none = 0,
-       Net_server,
-       Net_client,
-};
+static void dpp_free(struct devpath *dpp)
+{
+       if (dpp->stats)
+               free(dpp->stats);
+       if (dpp->ios)
+               free(dpp->ios);
+       if (dpp->path)
+               free(dpp->path);
+       if (dpp->buts_name)
+               free(dpp->buts_name);
+       free(dpp);
+}
 
-/*
- * network cmd line params
- */
-static char hostname[MAXHOSTNAMELEN];
-static int net_port = TRACE_NET_PORT;
-static int net_mode = 0;
-static int net_use_sendfile;
-
-static int net_in_fd = -1;
-static int net_out_fd = -1;
-
-static void handle_sigint(__attribute__((__unused__)) int sig)
+static int lock_on_cpu(int cpu)
 {
-       struct device_information *dip;
-       int i;
+       cpu_set_t * cpu_mask;
+       size_t size;
 
-       /*
-        * stop trace so we can reap currently produced data
-        */
-       for_each_dip(dip, i) {
-               if (dip->fd == -1)
-                       continue;
-               if (ioctl(dip->fd, BLKTRACESTOP) < 0)
-                       perror("BLKTRACESTOP");
+       cpu_mask = CPU_ALLOC(max_cpus);
+       size = CPU_ALLOC_SIZE(max_cpus);
+
+       CPU_ZERO_S(size, cpu_mask);
+       CPU_SET_S(cpu, size, cpu_mask);
+       if (sched_setaffinity(0, size, cpu_mask) < 0) {
+               CPU_FREE(cpu_mask);             
+               return errno;
        }
 
-       done = 1;
+       CPU_FREE(cpu_mask);             
+       return 0;
 }
 
-static int get_dropped_count(const char *buts_name)
+static int increase_limit(int resource, rlim_t increase)
 {
-       int fd;
-       char tmp[MAXPATHLEN + 64];
-
-       snprintf(tmp, sizeof(tmp), "%s/block/%s/dropped",
-                relay_path, buts_name);
-
-       fd = open(tmp, O_RDONLY);
-       if (fd < 0) {
-               /*
-                * this may be ok, if the kernel doesn't support dropped counts
-                */
-               if (errno == ENOENT)
-                       return 0;
+       struct rlimit rlim;
+       int save_errno = errno;
 
-               fprintf(stderr, "Couldn't open dropped file %s\n", tmp);
-               return -1;
-       }
+       if (!getrlimit(resource, &rlim)) {
+               rlim.rlim_cur += increase;
+               if (rlim.rlim_cur >= rlim.rlim_max)
+                       rlim.rlim_max = rlim.rlim_cur + increase;
 
-       if (read(fd, tmp, sizeof(tmp)) < 0) {
-               perror(tmp);
-               close(fd);
-               return -1;
+               if (!setrlimit(resource, &rlim))
+                       return 1;
        }
 
-       close(fd);
-
-       return atoi(tmp);
+       errno = save_errno;
+       return 0;
 }
 
-static int start_trace(struct device_information *dip)
+static int handle_open_failure(void)
 {
-       struct blk_user_trace_setup buts;
-
-       memset(&buts, 0, sizeof(buts));
-       buts.buf_size = buf_size;
-       buts.buf_nr = buf_nr;
-       buts.act_mask = act_mask;
-
-       if (ioctl(dip->fd, BLKTRACESETUP, &buts) < 0) {
-               perror("BLKTRACESETUP");
-               return 1;
-       }
-
-       if (ioctl(dip->fd, BLKTRACESTART) < 0) {
-               perror("BLKTRACESTART");
-               return 1;
-       }
-
-       memcpy(dip->buts_name, buts.name, sizeof(dip->buts_name));
-       dip_set_tracing(dip, 1);
+       if (errno == ENFILE || errno == EMFILE)
+               return increase_limit(RLIMIT_NOFILE, 16);
        return 0;
 }
 
-static void stop_trace(struct device_information *dip)
+static int handle_mem_failure(size_t length)
 {
-       if (dip_tracing(dip) || kill_running_trace) {
-               dip_set_tracing(dip, 0);
+       if (errno == ENFILE)
+               return handle_open_failure();
+       else if (errno == ENOMEM)
+               return increase_limit(RLIMIT_MEMLOCK, 2 * length);
+       return 0;
+}
 
-               /*
-                * should be stopped, just don't complain if it isn't
-                */
-               ioctl(dip->fd, BLKTRACESTOP);
+static FILE *my_fopen(const char *path, const char *mode)
+{
+       FILE *fp;
 
-               if (ioctl(dip->fd, BLKTRACETEARDOWN) < 0)
-                       perror("BLKTRACETEARDOWN");
+       do {
+               fp = fopen(path, mode);
+       } while (fp == NULL && handle_open_failure());
 
-               close(dip->fd);
-               dip->fd = -1;
-       }
+       return fp;
 }
 
-static void stop_all_traces(void)
+static int my_open(const char *path, int flags)
 {
-       struct device_information *dip;
-       int i;
+       int fd;
 
-       for_each_dip(dip, i) {
-               dip->drop_count = get_dropped_count(dip->buts_name);
-               stop_trace(dip);
-       }
+       do {
+               fd = open(path, flags);
+       } while (fd < 0 && handle_open_failure());
+
+       return fd;
 }
 
-static void wait_for_data(struct thread_information *tip, int events)
+static int my_socket(int domain, int type, int protocol)
 {
-       struct pollfd pfd = { .fd = tip->fd, .events = events };
+       int fd;
 
        do {
-               if (poll(&pfd, 1, 100) < 0) {
-                       perror("poll");
-                       break;
-               }
-               if (pfd.revents & events)
-                       break;
-               if (tip->ofile_stdout)
-                       break;
-       } while (!is_done());
+               fd = socket(domain, type, protocol);
+       } while (fd < 0 && handle_open_failure());
+
+       return fd;
 }
 
-static int read_data_file(struct thread_information *tip, void *buf,
-                         unsigned int len)
+static int my_accept(int sockfd, struct sockaddr *addr, socklen_t *addrlen)
 {
-       int ret = 0;
+       int fd;
 
        do {
-               wait_for_data(tip, POLLIN);
+               fd = accept(sockfd, addr, addrlen);
+       } while (fd < 0 && handle_open_failure());
 
-               ret = read(tip->fd, buf, len);
-               if (!ret)
-                       continue;
-               else if (ret > 0)
-                       return ret;
-               else {
-                       if (errno != EAGAIN) {
-                               perror(tip->fn);
-                               fprintf(stderr,"Thread %d failed read of %s\n",
-                                       tip->cpu, tip->fn);
-                               break;
-                       }
-                       continue;
-               }
-       } while (!is_done());
+       return fd;
+}
 
-       return ret;
+static void *my_mmap(void *addr, size_t length, int prot, int flags, int fd,
+                    off_t offset)
+{
+       void *new;
 
+       do {
+               new = mmap(addr, length, prot, flags, fd, offset);
+       } while (new == MAP_FAILED && handle_mem_failure(length));
+
+       return new;
 }
 
-static int read_data_net(struct thread_information *tip, void *buf,
-                        unsigned int len)
+static int my_mlock(struct tracer *tp,
+                   const void *addr, size_t len)
 {
-       unsigned int bytes_left = len;
-       int ret = 0;
+       int ret, retry = 0;
 
        do {
-               ret = recv(net_in_fd, buf, bytes_left, MSG_WAITALL);
-
-               if (!ret)
-                       continue;
-               else if (ret < 0) {
-                       if (errno != EAGAIN) {
-                               perror(tip->fn);
-                               fprintf(stderr, "server: failed read\n");
-                               return 0;
-                       }
-                       continue;
-               } else {
-                       buf += ret;
-                       bytes_left -= ret;
-               }
-       } while (!is_done() && bytes_left);
+               ret = mlock(addr, len);
+               if ((retry >= 10) && tp && tp->is_done)
+                       break;
+               retry++;
+       } while (ret < 0 && handle_mem_failure(len));
 
-       return len - bytes_left;
+       return ret;
 }
 
-static inline struct tip_subbuf *
-subbuf_fifo_dequeue(struct thread_information *tip)
+static int setup_mmap(int fd, unsigned int maxlen,
+                     struct mmap_info *mip,
+                     struct tracer *tp)
 {
-       const int head = tip->fifo.head;
-       const int next = (head + 1) & (FIFO_SIZE - 1);
+       if (mip->fs_off + maxlen > mip->fs_buf_len) {
+               unsigned long nr = max(16, mip->buf_nr);
 
-       if (head != tip->fifo.tail) {
-               struct tip_subbuf *ts = tip->fifo.q[head];
+               if (mip->fs_buf) {
+                       munlock(mip->fs_buf, mip->fs_buf_len);
+                       munmap(mip->fs_buf, mip->fs_buf_len);
+                       mip->fs_buf = NULL;
+               }
+
+               mip->fs_off = mip->fs_size & (mip->pagesize - 1);
+               mip->fs_buf_len = (nr * mip->buf_size) - mip->fs_off;
+               mip->fs_max_size += mip->fs_buf_len;
+
+               if (ftruncate(fd, mip->fs_max_size) < 0) {
+                       perror("setup_mmap: ftruncate");
+                       return 1;
+               }
 
-               store_barrier();
-               tip->fifo.head = next;
-               return ts;
+               mip->fs_buf = my_mmap(NULL, mip->fs_buf_len, PROT_WRITE,
+                                     MAP_SHARED, fd,
+                                     mip->fs_size - mip->fs_off);
+               if (mip->fs_buf == MAP_FAILED) {
+                       perror("setup_mmap: mmap");
+                       return 1;
+               }
+               if (my_mlock(tp, mip->fs_buf, mip->fs_buf_len) < 0) {
+                       perror("setup_mlock: mlock");
+                       return 1;
+               }
        }
 
-       return NULL;
+       return 0;
+}
+
+static int __stop_trace(int fd)
+{
+       /*
+        * Should be stopped, don't complain if it isn't
+        */
+       ioctl(fd, BLKTRACESTOP);
+       return ioctl(fd, BLKTRACETEARDOWN);
 }
 
-static inline int subbuf_fifo_queue(struct thread_information *tip,
-                                   struct tip_subbuf *ts)
+static int write_data(char *buf, int len)
 {
-       const int tail = tip->fifo.tail;
-       const int next = (tail + 1) & (FIFO_SIZE - 1);
+       int ret;
 
-       if (next != tip->fifo.head) {
-               tip->fifo.q[tail] = ts;
-               store_barrier();
-               tip->fifo.tail = next;
-               return 0;
+rewrite:
+       ret = fwrite(buf, len, 1, pfp);
+       if (ferror(pfp) || ret != 1) {
+               if (errno == EINTR) {
+                       clearerr(pfp);
+                       goto rewrite;
+               }
+
+               if (!piped_output || (errno != EPIPE && errno != EBADF)) {
+                       fprintf(stderr, "write(%d) failed: %d/%s\n",
+                               len, errno, strerror(errno));
+               }
+               goto err;
        }
 
-       fprintf(stderr, "fifo too small!\n");
+       fflush(pfp);
+       return 0;
+
+err:
+       clearerr(pfp);
        return 1;
 }
 
 /*
- * For file output, truncate and mmap the file appropriately
+ * Returns the number of bytes read (successfully)
  */
-static int mmap_subbuf(struct thread_information *tip, unsigned int maxlen)
+static int __net_recv_data(int fd, void *buf, unsigned int len)
 {
-       int ofd = fileno(tip->ofile);
-       int ret;
-
-       /*
-        * extend file, if we have to. use chunks of 16 subbuffers.
-        */
-       if (tip->fs_off + buf_size > tip->fs_buf_len) {
-               if (tip->fs_buf) {
-                       munlock(tip->fs_buf, tip->fs_buf_len);
-                       munmap(tip->fs_buf, tip->fs_buf_len);
-                       tip->fs_buf = NULL;
-               }
-
-               tip->fs_off = tip->fs_size & (page_size - 1);
-               tip->fs_buf_len = (16 * buf_size) - tip->fs_off;
-               tip->fs_max_size += tip->fs_buf_len;
+       unsigned int bytes_left = len;
 
-               if (ftruncate(ofd, tip->fs_max_size) < 0) {
-                       perror("ftruncate");
-                       return -1;
-               }
+       while (bytes_left && !done) {
+               int ret = recv(fd, buf, bytes_left, MSG_WAITALL);
 
-               tip->fs_buf = mmap(NULL, tip->fs_buf_len, PROT_WRITE,
-                                  MAP_SHARED, ofd, tip->fs_size - tip->fs_off);
-               if (tip->fs_buf == MAP_FAILED) {
-                       perror("mmap");
-                       return -1;
+               if (ret == 0)
+                       break;
+               else if (ret < 0) {
+                       if (errno == EAGAIN) {
+                               usleep(50);
+                               continue;
+                       }
+                       perror("server: net_recv_data: recv failed");
+                       break;
+               } else {
+                       buf += ret;
+                       bytes_left -= ret;
                }
-               mlock(tip->fs_buf, tip->fs_buf_len);
        }
 
-       ret = tip->read_data(tip, tip->fs_buf + tip->fs_off, maxlen);
-       if (ret >= 0) {
-               tip->data_read += ret;
-               tip->fs_size += ret;
-               tip->fs_off += ret;
-               return 0;
-       }
+       return len - bytes_left;
+}
 
-       return -1;
+static int net_recv_data(int fd, void *buf, unsigned int len)
+{
+       return __net_recv_data(fd, buf, len);
 }
 
 /*
- * Use the copy approach for pipes and network
+ * Returns number of bytes written
  */
-static int get_subbuf(struct thread_information *tip, unsigned int maxlen)
+static int net_send_data(int fd, void *buf, unsigned int buf_len)
 {
-       struct tip_subbuf *ts = malloc(sizeof(*ts));
        int ret;
+       unsigned int bytes_left = buf_len;
 
-       ts->buf = malloc(buf_size);
-       ts->max_len = maxlen;
+       while (bytes_left) {
+               ret = send(fd, buf, bytes_left, 0);
+               if (ret < 0) {
+                       perror("send");
+                       break;
+               }
 
-       ret = tip->read_data(tip, ts->buf, ts->max_len);
-       if (ret > 0) {
-               ts->len = ret;
-               tip->data_read += ret;
-               if (subbuf_fifo_queue(tip, ts))
-                       return -1;
+               buf += ret;
+               bytes_left -= ret;
        }
 
-       return ret;
+       return buf_len - bytes_left;
 }
 
-static int get_subbuf_sendfile(struct thread_information *tip,
-                              unsigned int maxlen)
+static int net_send_header(int fd, int cpu, char *buts_name, int len)
 {
-       struct tip_subbuf *ts;
-       struct stat sb;
-       unsigned int ready;
-
-       wait_for_data(tip, POLLMSG);
+       struct blktrace_net_hdr hdr;
 
-       /*
-        * hack to get last data out, we can't use sendfile for that
-        */
-       if (is_done())
-               return get_subbuf(tip, maxlen);
+       memset(&hdr, 0, sizeof(hdr));
 
-       if (fstat(tip->fd, &sb) < 0) {
-               perror("trace stat");
-               return -1;
-       }
-       ready = sb.st_size - tip->data_queued;
-       if (!ready) {
-               usleep(1000);
-               return 0;
-       }
+       hdr.magic = BLK_IO_TRACE_MAGIC;
+       memset(hdr.buts_name, 0, sizeof(hdr.buts_name));
+       strncpy(hdr.buts_name, buts_name, sizeof(hdr.buts_name));
+       hdr.buts_name[sizeof(hdr.buts_name) - 1] = '\0';
+       hdr.cpu = cpu;
+       hdr.max_cpus = max_cpus;
+       hdr.len = len;
+       hdr.cl_id = getpid();
+       hdr.buf_size = buf_size;
+       hdr.buf_nr = buf_nr;
+       hdr.page_size = pagesize;
 
-       ts = malloc(sizeof(*ts));
-       ts->buf = NULL;
-       ts->max_len = 0;
-       ts->len = ready;
-       tip->data_queued += ready;
+       return net_send_data(fd, &hdr, sizeof(hdr)) != sizeof(hdr);
+}
 
-       if (subbuf_fifo_queue(tip, ts))
-               return -1;
+static void net_send_open_close(int fd, int cpu, char *buts_name, int len)
+{
+       struct blktrace_net_hdr ret_hdr;
 
-       return ready;
+       net_send_header(fd, cpu, buts_name, len);
+       net_recv_data(fd, &ret_hdr, sizeof(ret_hdr));
 }
 
-static void close_thread(struct thread_information *tip)
+static void net_send_open(int fd, int cpu, char *buts_name)
 {
-       if (tip->fd != -1)
-               close(tip->fd);
-       if (tip->ofile)
-               fclose(tip->ofile);
-       if (tip->ofile_buffer)
-               free(tip->ofile_buffer);
-       if (tip->fd_buf)
-               free(tip->fd_buf);
-
-       tip->fd = -1;
-       tip->ofile = NULL;
-       tip->ofile_buffer = NULL;
-       tip->fd_buf = NULL;
+       net_send_open_close(fd, cpu, buts_name, 0);
 }
 
-static void tip_ftrunc_final(struct thread_information *tip)
+static void net_send_close(int fd, char *buts_name, int drops)
 {
        /*
-        * truncate to right size and cleanup mmap
+        * Overload CPU w/ number of drops
+        *
+        * XXX: Need to clear/set done around call - done=1 (which
+        * is true here) stops reads from happening... :-(
         */
-       if (tip->ofile_mmap) {
-               int ofd = fileno(tip->ofile);
-
-               if (tip->fs_buf)
-                       munmap(tip->fs_buf, tip->fs_buf_len);
-
-               ftruncate(ofd, tip->fs_size);
-       }
+       done = 0;
+       net_send_open_close(fd, drops, buts_name, 1);
+       done = 1;
 }
 
-static void *thread_main(void *arg)
+static void ack_open_close(int fd, char *buts_name)
 {
-       struct thread_information *tip = arg;
-       pid_t pid = getpid();
-       cpu_set_t cpu_mask;
+       net_send_header(fd, 0, buts_name, 2);
+}
 
-       CPU_ZERO(&cpu_mask);
-       CPU_SET((tip->cpu), &cpu_mask);
+static void net_send_drops(int fd)
+{
+       struct list_head *p;
 
-       if (sched_setaffinity(pid, sizeof(cpu_mask), &cpu_mask) == -1) {
-               perror("sched_setaffinity");
-               exit_trace(1);
-       }
+       __list_for_each(p, &devpaths) {
+               struct devpath *dpp = list_entry(p, struct devpath, head);
 
-       snprintf(tip->fn, sizeof(tip->fn), "%s/block/%s/trace%d",
-                       relay_path, tip->device->buts_name, tip->cpu);
-       tip->fd = open(tip->fn, O_RDONLY);
-       if (tip->fd < 0) {
-               perror(tip->fn);
-               fprintf(stderr,"Thread %d failed open of %s\n", tip->cpu,
-                       tip->fn);
-               exit_trace(1);
+               net_send_close(fd, dpp->buts_name, dpp->drops);
        }
+}
 
-       while (!is_done()) {
-               if (tip->get_subbuf(tip, buf_size) < 0)
-                       break;
-       }
+/*
+ * Returns:
+ *      0: "EOF"
+ *      1: OK
+ *     -1: Error
+ */
+static int net_get_header(struct cl_conn *nc, struct blktrace_net_hdr *bnh)
+{
+       int bytes_read;
+       int fl = fcntl(nc->fd, F_GETFL);
 
-       /*
-        * trace is stopped, pull data until we get a short read
-        */
-       while (tip->get_subbuf(tip, buf_size) > 0)
-               ;
+       fcntl(nc->fd, F_SETFL, fl | O_NONBLOCK);
+       bytes_read = __net_recv_data(nc->fd, bnh, sizeof(*bnh));
+       fcntl(nc->fd, F_SETFL, fl & ~O_NONBLOCK);
 
-       tip_ftrunc_final(tip);
-       tip->exited = 1;
-       return NULL;
+       if (bytes_read == sizeof(*bnh))
+               return 1;
+       else if (bytes_read == 0)
+               return 0;
+       else
+               return -1;
 }
 
-static int write_data_net(int fd, void *buf, unsigned int buf_len)
+static int net_setup_addr(void)
 {
-       unsigned int bytes_left = buf_len;
-       int ret;
+       struct sockaddr_in *addr = &hostname_addr;
 
-       while (bytes_left) {
-               ret = send(fd, buf, bytes_left, 0);
-               if (ret < 0) {
-                       perror("send");
+       memset(addr, 0, sizeof(*addr));
+       addr->sin_family = AF_INET;
+       addr->sin_port = htons(net_port);
+
+       if (inet_aton(hostname, &addr->sin_addr) != 1) {
+               struct hostent *hent;
+retry:
+               hent = gethostbyname(hostname);
+               if (!hent) {
+                       if (h_errno == TRY_AGAIN) {
+                               usleep(100);
+                               goto retry;
+                       } else if (h_errno == NO_RECOVERY) {
+                               fprintf(stderr, "gethostbyname(%s)"
+                                       "non-recoverable error encountered\n",
+                                       hostname);
+                       } else {
+                               /*
+                                * HOST_NOT_FOUND, NO_ADDRESS or NO_DATA
+                                */
+                               fprintf(stderr, "Host %s not found\n",
+                                       hostname);
+                       }
                        return 1;
                }
 
-               buf += ret;
-               bytes_left -= ret;
+               memcpy(&addr->sin_addr, hent->h_addr, 4);
+               memset(hostname, 0, sizeof(hostname));
+               strncpy(hostname, hent->h_name, sizeof(hostname));
+               hostname[sizeof(hostname) - 1] = '\0';
        }
 
        return 0;
 }
 
-static int net_send_header(struct thread_information *tip, unsigned int len)
+static int net_setup_client(void)
 {
-       struct blktrace_net_hdr hdr;
+       int fd;
+       struct sockaddr_in *addr = &hostname_addr;
 
-       hdr.magic = BLK_IO_TRACE_MAGIC;
-       strcpy(hdr.buts_name, tip->device->buts_name);
-       hdr.cpu = tip->cpu;
-       hdr.max_cpus = ncpus;
-       hdr.len = len;
+       fd = my_socket(AF_INET, SOCK_STREAM, 0);
+       if (fd < 0) {
+               perror("client: socket");
+               return -1;
+       }
+
+       if (connect(fd, (struct sockaddr *)addr, sizeof(*addr)) < 0) {
+               if (errno == ECONNREFUSED)
+                       fprintf(stderr,
+                               "\nclient: Connection to %s refused, "
+                               "perhaps the server is not started?\n\n",
+                               hostname);
+               else
+                       perror("client: connect");
 
-       return write_data_net(net_out_fd, &hdr, sizeof(hdr));
+               close(fd);
+               return -1;
+       }
+
+       return fd;
 }
 
-/*
- * send header with 0 length to signal end-of-run
- */
-static void net_client_send_close(void)
+static int open_client_connections(void)
 {
-       struct device_information *dip;
-       struct blktrace_net_hdr hdr;
-       int i;
-
-       hdr.magic = BLK_IO_TRACE_MAGIC;
-       hdr.cpu = 0;
-       hdr.max_cpus = ncpus;
-       hdr.len = 0;
+       int cpu;
+       size_t alloc_size = CPU_ALLOC_SIZE(max_cpus);
 
-       for_each_dip(dip, i) {
-               strcpy(hdr.buts_name, dip->buts_name);
-               hdr.cpu += get_dropped_count(dip->buts_name);
+       cl_fds = calloc(ncpus, sizeof(*cl_fds));
+       for (cpu = 0; cpu < max_cpus; cpu++) {
+               if (!CPU_ISSET_S(cpu, alloc_size, online_cpus))
+                       continue;
+               cl_fds[cpu] = net_setup_client();
+               if (cl_fds[cpu] < 0)
+                       goto err;
        }
+       return 0;
 
-       write_data_net(net_out_fd, &hdr, sizeof(hdr));
+err:
+       while (cpu > 0)
+               close(cl_fds[cpu--]);
+       free(cl_fds);
+       return 1;
 }
 
-static int flush_subbuf_net(struct thread_information *tip,
-                           struct tip_subbuf *ts)
+static void close_client_connections(void)
 {
-       if (net_send_header(tip, ts->len))
-               return -1;
-       if (write_data_net(net_out_fd, ts->buf, ts->len))
-               return -1;
-
-       free(ts->buf);
-       free(ts);
-       return 1;
+       if (cl_fds) {
+               int cpu, *fdp;
+               size_t alloc_size = CPU_ALLOC_SIZE(max_cpus);
+
+               for (cpu = 0, fdp = cl_fds; cpu < max_cpus; cpu++, fdp++) {
+                       if (!CPU_ISSET_S(cpu, alloc_size, online_cpus))
+                               continue;
+                       if (*fdp >= 0) {
+                               net_send_drops(*fdp);
+                               net_close_connection(fdp);
+                       }
+               }
+               free(cl_fds);
+       }
 }
 
-static int net_sendfile(struct thread_information *tip, struct tip_subbuf *ts)
+static int setup_buts(void)
 {
-       int ret = sendfile(net_out_fd, tip->fd, NULL, ts->len);
+       struct list_head *p;
+       int ret = 0;
 
-       if (ret < 0) {
-               perror("sendfile");
-               return 1;
-       } else if (ret < (int) ts->len) {
-               fprintf(stderr, "short sendfile send (%d of %d)\n", ret, ts->len);
-               return 1;
+       __list_for_each(p, &devpaths) {
+               struct blk_user_trace_setup buts;
+               struct devpath *dpp = list_entry(p, struct devpath, head);
+
+               memset(&buts, 0, sizeof(buts));
+               buts.buf_size = buf_size;
+               buts.buf_nr = buf_nr;
+               buts.act_mask = act_mask;
+
+               if (ioctl(dpp->fd, BLKTRACESETUP, &buts) >= 0) {
+                       dpp->ncpus = max_cpus;
+                       dpp->buts_name = strdup(buts.name);
+                       dpp->setup_done = 1;
+                       if (dpp->stats)
+                               free(dpp->stats);
+                       dpp->stats = calloc(dpp->ncpus, sizeof(*dpp->stats));
+                       memset(dpp->stats, 0, dpp->ncpus * sizeof(*dpp->stats));
+               } else {
+                       fprintf(stderr, "BLKTRACESETUP(2) %s failed: %d/%s\n",
+                               dpp->path, errno, strerror(errno));
+                       ret++;
+               }
        }
 
-       return 0;
+       return ret;
 }
 
-static int flush_subbuf_sendfile(struct thread_information *tip,
-                                struct tip_subbuf *ts)
+static void start_buts(void)
 {
-       int ret = -1;
+       struct list_head *p;
 
-       /*
-        * currently we cannot use sendfile() on the last bytes read, as they
-        * may not be a full subbuffer. get_subbuf_sendfile() falls back to
-        * the read approach for those, so use send() to ship them out
-        */
-       if (ts->buf)
-               return flush_subbuf_net(tip, ts);
-       
-       if (net_send_header(tip, ts->len))
-               goto err;
-       if (net_sendfile(tip, ts))
-               goto err;
+       __list_for_each(p, &devpaths) {
+               struct devpath *dpp = list_entry(p, struct devpath, head);
 
-       tip->data_read += ts->len;
-       tip->ofile_offset += buf_size;
-       ret = 1;
-err:
-       free(ts);
-       return ret;
+               if (ioctl(dpp->fd, BLKTRACESTART) < 0) {
+                       fprintf(stderr, "BLKTRACESTART %s failed: %d/%s\n",
+                               dpp->path, errno, strerror(errno));
+               }
+       }
 }
 
-static int write_data(struct thread_information *tip, void *buf,
-                     unsigned int buf_len)
+static int get_drops(struct devpath *dpp)
 {
-       int ret;
+       int fd, drops = 0;
+       char fn[MAXPATHLEN + 64], tmp[256];
+
+       snprintf(fn, sizeof(fn), "%s/block/%s/dropped", debugfs_path,
+                dpp->buts_name);
 
-       if (!buf_len)
+       fd = my_open(fn, O_RDONLY);
+       if (fd < 0) {
+               /*
+                * This may be ok: the kernel may not support
+                * dropped counts.
+                */
+               if (errno != ENOENT)
+                       fprintf(stderr, "Could not open %s: %d/%s\n",
+                               fn, errno, strerror(errno));
                return 0;
+       } else if (read(fd, tmp, sizeof(tmp)) < 0) {
+               fprintf(stderr, "Could not read %s: %d/%s\n",
+                       fn, errno, strerror(errno));
+       } else
+               drops = atoi(tmp);
+       close(fd);
 
-       while (1) {
-               ret = fwrite(buf, buf_len, 1, tip->ofile);
-               if (ret == 1)
-                       break;
+       return drops;
+}
 
-               if (ret < 0) {
-                       perror("write");
-                       return 1;
-               }
-       }
+static void get_all_drops(void)
+{
+       struct list_head *p;
 
-       if (tip->ofile_stdout)
-               fflush(tip->ofile);
+       __list_for_each(p, &devpaths) {
+               struct devpath *dpp = list_entry(p, struct devpath, head);
 
-       return 0;
+               dpp->drops = get_drops(dpp);
+       }
 }
 
-static int flush_subbuf_file(struct thread_information *tip,
-                            struct tip_subbuf *ts)
+static inline struct trace_buf *alloc_trace_buf(int cpu, int bufsize)
 {
-       unsigned int offset = 0;
-       struct blk_io_trace *t;
-       int pdu_len, events = 0;
+       struct trace_buf *tbp;
 
-       /*
-        * surplus from last run
-        */
-       if (tip->leftover_ts) {
-               struct tip_subbuf *prev_ts = tip->leftover_ts;
+       tbp = malloc(sizeof(*tbp) + bufsize);
+       INIT_LIST_HEAD(&tbp->head);
+       tbp->len = 0;
+       tbp->buf = (void *)(tbp + 1);
+       tbp->cpu = cpu;
+       tbp->dpp = NULL;        /* Will be set when tbp is added */
 
-               if (prev_ts->len + ts->len > prev_ts->max_len) {
-                       prev_ts->max_len += ts->len;
-                       prev_ts->buf = realloc(prev_ts->buf, prev_ts->max_len);
-               }
+       return tbp;
+}
 
-               memcpy(prev_ts->buf + prev_ts->len, ts->buf, ts->len);
-               prev_ts->len += ts->len;
+static void free_tracer_heads(struct devpath *dpp)
+{
+       int cpu;
+       struct tracer_devpath_head *hd;
 
-               free(ts->buf);
-               free(ts);
+       for (cpu = 0, hd = dpp->heads; cpu < max_cpus; cpu++, hd++) {
+               if (hd->prev)
+                       free(hd->prev);
 
-               ts = prev_ts;
-               tip->leftover_ts = NULL;
+               pthread_mutex_destroy(&hd->mutex);
        }
+       free(dpp->heads);
+}
 
-       while (offset + sizeof(*t) <= ts->len) {
-               t = ts->buf + offset;
+static int setup_tracer_devpaths(void)
+{
+       struct list_head *p;
 
-               if (verify_trace(t)) {
-                       write_data(tip, ts->buf, offset);
-                       return -1;
-               }
+       if (net_client_use_send())
+               if (open_client_connections())
+                       return 1;
 
-               pdu_len = t->pdu_len;
+       __list_for_each(p, &devpaths) {
+               int cpu;
+               struct tracer_devpath_head *hd;
+               struct devpath *dpp = list_entry(p, struct devpath, head);
 
-               if (offset + sizeof(*t) + pdu_len > ts->len)
-                       break;
-
-               offset += sizeof(*t) + pdu_len;
-               tip->events_processed++;
-               tip->data_read += sizeof(*t) + pdu_len;
-               events++;
-       }
-
-       if (write_data(tip, ts->buf, offset))
-               return -1;
-
-       /*
-        * leftover bytes, save them for next time
-        */
-       if (offset != ts->len) {
-               tip->leftover_ts = ts;
-               ts->len -= offset;
-               memmove(ts->buf, ts->buf + offset, ts->len);
-       } else {
-               free(ts->buf);
-               free(ts);
+               dpp->heads = calloc(max_cpus, sizeof(struct tracer_devpath_head));
+               for (cpu = 0, hd = dpp->heads; cpu < max_cpus; cpu++, hd++) {
+                       INIT_LIST_HEAD(&hd->head);
+                       pthread_mutex_init(&hd->mutex, NULL);
+                       hd->prev = NULL;
+               }
        }
 
-       return events;
+       return 0;
 }
 
-static int write_tip_events(struct thread_information *tip)
+static inline void add_trace_buf(struct devpath *dpp, int cpu,
+                                               struct trace_buf **tbpp)
 {
-       struct tip_subbuf *ts = subbuf_fifo_dequeue(tip);
+       struct trace_buf *tbp = *tbpp;
+       struct tracer_devpath_head *hd = &dpp->heads[cpu];
 
-       if (ts)
-               return tip->flush_subbuf(tip, ts);
+       tbp->dpp = dpp;
 
-       return 0;
+       pthread_mutex_lock(&hd->mutex);
+       list_add_tail(&tbp->head, &hd->head);
+       pthread_mutex_unlock(&hd->mutex);
+
+       *tbpp = alloc_trace_buf(cpu, buf_size);
 }
 
-/*
- * scans the tips we know and writes out the subbuffers we accumulate
- */
-static void get_and_write_events(void)
+static inline void incr_entries(int entries_handled)
 {
-       struct device_information *dip;
-       struct thread_information *tip;
-       int i, j, events, ret, tips_running;
-
-       while (!is_done()) {
-               events = 0;
+       pthread_mutex_lock(&dp_mutex);
+       if (dp_entries == 0)
+               pthread_cond_signal(&dp_cond);
+       dp_entries += entries_handled;
+       pthread_mutex_unlock(&dp_mutex);
+}
 
-               for_each_dip(dip, i) {
-                       for_each_tip(dip, tip, j) {
-                               ret = write_tip_events(tip);
-                               if (ret > 0)
-                                       events += ret;
-                       }
-               }
+static void decr_entries(int handled)
+{
+       pthread_mutex_lock(&dp_mutex);
+       dp_entries -= handled;
+       pthread_mutex_unlock(&dp_mutex);
+}
 
-               if (!events)
-                       usleep(100000);
-       }
+static int wait_empty_entries(void)
+{
+       pthread_mutex_lock(&dp_mutex);
+       while (!done && dp_entries == 0)
+               t_pthread_cond_wait(&dp_cond, &dp_mutex);
+       pthread_mutex_unlock(&dp_mutex);
 
-       /*
-        * reap stored events
-        */
-       do {
-               events = 0;
-               tips_running = 0;
-               for_each_dip(dip, i) {
-                       for_each_tip(dip, tip, j) {
-                               ret = write_tip_events(tip);
-                               if (ret > 0)
-                                       events += ret;
-                               tips_running += !tip->exited;
-                       }
-               }
-               usleep(10);
-       } while (events || tips_running);
+       return !done;
 }
 
-static void wait_for_threads(void)
+static int add_devpath(char *path)
 {
+       int fd;
+       struct devpath *dpp;
+       struct list_head *p;
+
        /*
-        * for piped or network output, poll and fetch data for writeout.
-        * for files, we just wait around for trace threads to exit
+        * Verify device is not duplicated
         */
-       if ((output_name && !strcmp(output_name, "-")) ||
-           net_mode == Net_client)
-               get_and_write_events();
-       else {
-               struct device_information *dip;
-               struct thread_information *tip;
-               int i, j, tips_running;
-
-               do {
-                       tips_running = 0;
-                       usleep(100000);
-
-                       for_each_dip(dip, i)
-                               for_each_tip(dip, tip, j)
-                                       tips_running += !tip->exited;
-               } while (tips_running);
+       __list_for_each(p, &devpaths) {
+              struct devpath *tmp = list_entry(p, struct devpath, head);
+              if (!strcmp(tmp->path, path))
+                       return 0;
+       }
+       /*
+        * Verify device is valid before going too far
+        */
+       fd = my_open(path, O_RDONLY | O_NONBLOCK);
+       if (fd < 0) {
+               fprintf(stderr, "Invalid path %s specified: %d/%s\n",
+                       path, errno, strerror(errno));
+               return 1;
        }
 
-       if (net_mode == Net_client)
-               net_client_send_close();
+       dpp = malloc(sizeof(*dpp));
+       memset(dpp, 0, sizeof(*dpp));
+       dpp->path = strdup(path);
+       dpp->fd = fd;
+       ndevs++;
+       list_add_tail(&dpp->head, &devpaths);
+
+       return 0;
 }
 
-static int fill_ofname(struct thread_information *tip, char *dst,
-                      char *buts_name)
+static void rel_devpaths(void)
 {
-       struct stat sb;
-       int len = 0;
-       time_t t;
+       struct list_head *p, *q;
 
-       if (output_dir)
-               len = sprintf(dst, "%s/", output_dir);
+       list_for_each_safe(p, q, &devpaths) {
+               struct devpath *dpp = list_entry(p, struct devpath, head);
 
-       if (net_mode == Net_server) {
-               len += sprintf(dst + len, "%s-", inet_ntoa(tip->cl_in_addr));
-               time(&t);
-               len += strftime(dst + len, 64, "%F-%T/", gmtime(&t));
-       }
+               list_del(&dpp->head);
+               if (dpp->setup_done)
+                       __stop_trace(dpp->fd);
+               close(dpp->fd);
 
-       if (stat(dst, &sb) < 0) {
-               if (errno != ENOENT) {
-                       perror("stat");
-                       return 1;
-               }
-               if (mkdir(dst, 0755) < 0) {
-                       perror(dst);
-                       fprintf(stderr, "Can't make output dir\n");
-                       return 1;
-               }
+               if (dpp->heads)
+                       free_tracer_heads(dpp);
+
+               dpp_free(dpp);
+               ndevs--;
        }
+}
 
-       if (output_name)
-               sprintf(dst + len, "%s.blktrace.%d", output_name, tip->cpu);
-       else
-               sprintf(dst + len, "%s.blktrace.%d", buts_name, tip->cpu);
+static int flush_subbuf_net(struct trace_buf *tbp)
+{
+       int fd = cl_fds[tbp->cpu];
+       struct devpath *dpp = tbp->dpp;
+
+       if (net_send_header(fd, tbp->cpu, dpp->buts_name, tbp->len))
+               return 1;
+       else if (net_send_data(fd, tbp->buf, tbp->len) != tbp->len)
+               return 1;
 
        return 0;
 }
 
-static void fill_ops(struct thread_information *tip)
+static int
+handle_list_net(__attribute__((__unused__))struct tracer_devpath_head *hd,
+               struct list_head *list)
 {
-       /*
-        * setup ops
-        */
-       if (net_mode == Net_client) {
-               if (net_use_sendfile) {
-                       tip->get_subbuf = get_subbuf_sendfile;
-                       tip->flush_subbuf = flush_subbuf_sendfile;
-               } else {
-                       tip->get_subbuf = get_subbuf;
-                       tip->flush_subbuf = flush_subbuf_net;
+       struct trace_buf *tbp;
+       struct list_head *p, *q;
+       int entries_handled = 0;
+
+       list_for_each_safe(p, q, list) {
+               tbp = list_entry(p, struct trace_buf, head);
+
+               list_del(&tbp->head);
+               entries_handled++;
+
+               if (cl_fds[tbp->cpu] >= 0) {
+                       if (flush_subbuf_net(tbp)) {
+                               close(cl_fds[tbp->cpu]);
+                               cl_fds[tbp->cpu] = -1;
+                       }
                }
-       } else {
-               if (tip->ofile_mmap)
-                       tip->get_subbuf = mmap_subbuf;
-               else
-                       tip->get_subbuf = get_subbuf;
 
-               tip->flush_subbuf = flush_subbuf_file;
-       }
-                       
-       if (net_mode == Net_server)
-               tip->read_data = read_data_net;
-       else
-               tip->read_data = read_data_file;
-}
-
-static int tip_open_output(struct device_information *dip,
-                          struct thread_information *tip)
-{
-       int pipeline = output_name && !strcmp(output_name, "-");
-       int mode, vbuf_size;
-       char op[128];
-
-       if (net_mode == Net_client) {
-               tip->ofile = NULL;
-               tip->ofile_stdout = 0;
-               tip->ofile_mmap = 0;
-               goto done;
-       } else if (pipeline) {
-               tip->ofile = fdopen(STDOUT_FILENO, "w");
-               tip->ofile_stdout = 1;
-               tip->ofile_mmap = 0;
-               mode = _IOLBF;
-               vbuf_size = 512;
-       } else {
-               if (fill_ofname(tip, op, dip->buts_name))
-                       return 1;
-               tip->ofile = fopen(op, "w+");
-               tip->ofile_stdout = 0;
-               tip->ofile_mmap = 1;
-               mode = _IOFBF;
-               vbuf_size = OFILE_BUF;
+               free(tbp);
        }
 
-       if (tip->ofile == NULL) {
-               perror(op);
-               return 1;
-       }
+       return entries_handled;
+}
 
-       tip->ofile_buffer = malloc(vbuf_size);
-       if (setvbuf(tip->ofile, tip->ofile_buffer, mode, vbuf_size)) {
-               perror("setvbuf");
-               close_thread(tip);
-               return 1;
+/*
+ * Tack 'tbp's buf onto the tail of 'prev's buf
+ */
+static struct trace_buf *tb_combine(struct trace_buf *prev,
+                                   struct trace_buf *tbp)
+{
+       unsigned long tot_len;
+
+       tot_len = prev->len + tbp->len;
+       if (tot_len > buf_size) {
+               /*
+                * tbp->head isn't connected (it was 'prev'
+                * so it had been taken off of the list
+                * before). Therefore, we can realloc
+                * the whole structures, as the other fields
+                * are "static".
+                */
+               prev = realloc(prev, sizeof(*prev) + tot_len);
+               prev->buf = (void *)(prev + 1);
        }
 
-done:
-       fill_ops(tip);
-       return 0;
+       memcpy(prev->buf + prev->len, tbp->buf, tbp->len);
+       prev->len = tot_len;
+
+       free(tbp);
+       return prev;
 }
 
-static int start_threads(struct device_information *dip)
+static int handle_list_file(struct tracer_devpath_head *hd,
+                           struct list_head *list)
 {
-       struct thread_information *tip;
-       int j;
+       int off, t_len, nevents;
+       struct blk_io_trace *t;
+       struct list_head *p, *q;
+       int entries_handled = 0;
+       struct trace_buf *tbp, *prev;
 
-       for_each_tip(dip, tip, j) {
-               tip->cpu = j;
-               tip->device = dip;
-               tip->events_processed = 0;
-               tip->fd = -1;
-               memset(&tip->fifo, 0, sizeof(tip->fifo));
-               tip->leftover_ts = NULL;
+       prev = hd->prev;
+       list_for_each_safe(p, q, list) {
+               tbp = list_entry(p, struct trace_buf, head);
+               list_del(&tbp->head);
+               entries_handled++;
 
-               if (tip_open_output(dip, tip))
-                       return 1;
+               /*
+                * If there was some leftover before, tack this new
+                * entry onto the tail of the previous one.
+                */
+               if (prev)
+                       tbp = tb_combine(prev, tbp);
 
-               if (pthread_create(&tip->thread, NULL, thread_main, tip)) {
-                       perror("pthread_create");
-                       close_thread(tip);
-                       return 1;
+               /*
+                * See how many whole traces there are - send them
+                * all out in one go.
+                */
+               off = 0;
+               nevents = 0;
+               while (off + (int)sizeof(*t) <= tbp->len) {
+                       t = (struct blk_io_trace *)(tbp->buf + off);
+                       t_len = sizeof(*t) + t->pdu_len;
+                       if (off + t_len > tbp->len)
+                               break;
+
+                       off += t_len;
+                       nevents++;
                }
+               if (nevents)
+                       pdc_nev_update(tbp->dpp, tbp->cpu, nevents);
+
+               /*
+                * Write any full set of traces, any remaining data is kept
+                * for the next pass.
+                */
+               if (off) {
+                       if (write_data(tbp->buf, off) || off == tbp->len) {
+                               free(tbp);
+                               prev = NULL;
+                       }
+                       else {
+                               /*
+                                * Move valid data to beginning of buffer
+                                */
+                               tbp->len -= off;
+                               memmove(tbp->buf, tbp->buf + off, tbp->len);
+                               prev = tbp;
+                       }
+               } else
+                       prev = tbp;
        }
+       hd->prev = prev;
 
-       return 0;
+       return entries_handled;
 }
 
-static void stop_threads(struct device_information *dip)
+static void __process_trace_bufs(void)
 {
-       struct thread_information *tip;
-       unsigned long ret;
-       int i;
+       int cpu;
+       struct list_head *p;
+       struct list_head list;
+       int handled = 0;
+
+       __list_for_each(p, &devpaths) {
+               struct devpath *dpp = list_entry(p, struct devpath, head);
+               struct tracer_devpath_head *hd = dpp->heads;
+
+               for (cpu = 0; cpu < max_cpus; cpu++, hd++) {
+                       pthread_mutex_lock(&hd->mutex);
+                       if (list_empty(&hd->head)) {
+                               pthread_mutex_unlock(&hd->mutex);
+                               continue;
+                       }
+
+                       list_replace_init(&hd->head, &list);
+                       pthread_mutex_unlock(&hd->mutex);
 
-       for_each_tip(dip, tip, i) {
-               (void) pthread_join(tip->thread, (void *) &ret);
-               close_thread(tip);
+                       handled += handle_list(hd, &list);
+               }
        }
+
+       if (handled)
+               decr_entries(handled);
 }
 
-static void stop_all_threads(void)
+static void process_trace_bufs(void)
 {
-       struct device_information *dip;
-       int i;
-
-       for_each_dip(dip, i)
-               stop_threads(dip);
+       while (wait_empty_entries())
+               __process_trace_bufs();
 }
 
-static void stop_all_tracing(void)
+static void clean_trace_bufs(void)
 {
-       struct device_information *dip;
-       int i;
+       /*
+        * No mutex needed here: we're only reading from the lists,
+        * tracers are done
+        */
+       while (dp_entries)
+               __process_trace_bufs();
+}
 
-       for_each_dip(dip, i)
-               stop_trace(dip);
+static inline void read_err(int cpu, char *ifn)
+{
+       if (errno != EAGAIN)
+               fprintf(stderr, "Thread %d failed read of %s: %d/%s\n",
+                       cpu, ifn, errno, strerror(errno));
 }
 
-static void exit_trace(int status)
+static int net_sendfile(struct io_info *iop)
 {
-       if (!is_trace_stopped()) {
-               trace_stopped = 1;
-               stop_all_threads();
-               stop_all_tracing();
+       int ret;
+
+       ret = sendfile(iop->ofd, iop->ifd, NULL, iop->ready);
+       if (ret < 0) {
+               perror("sendfile");
+               return 1;
+       } else if (ret < (int)iop->ready) {
+               fprintf(stderr, "short sendfile send (%d of %d)\n",
+                       ret, iop->ready);
+               return 1;
        }
 
-       exit(status);
+       return 0;
 }
 
-static int resize_devices(char *path)
+static inline int net_sendfile_data(struct tracer *tp, struct io_info *iop)
 {
-       int size = (ndevs + 1) * sizeof(struct device_information);
+       struct devpath *dpp = iop->dpp;
 
-       device_information = realloc(device_information, size);
-       if (!device_information) {
-               fprintf(stderr, "Out of memory, device %s (%d)\n", path, size);
+       if (net_send_header(iop->ofd, tp->cpu, dpp->buts_name, iop->ready))
                return 1;
-       }
-       device_information[ndevs].path = path;
-       ndevs++;
-       return 0;
+       return net_sendfile(iop);
 }
 
-static int open_devices(void)
+static int fill_ofname(char *dst, int dstlen, char *subdir, char *buts_name,
+                      int cpu)
 {
-       struct device_information *dip;
-       int i;
+       int len;
+       struct stat sb;
 
-       for_each_dip(dip, i) {
-               dip->fd = open(dip->path, O_RDONLY | O_NONBLOCK);
-               if (dip->fd < 0) {
-                       perror(dip->path);
+       if (output_dir)
+               len = snprintf(dst, dstlen, "%s/", output_dir);
+       else
+               len = snprintf(dst, dstlen, "./");
+
+       if (subdir)
+               len += snprintf(dst + len, dstlen - len, "%s", subdir);
+
+       if (stat(dst, &sb) < 0) {
+               if (errno != ENOENT) {
+                       fprintf(stderr,
+                               "Destination dir %s stat failed: %d/%s\n",
+                               dst, errno, strerror(errno));
+                       return 1;
+               }
+               /*
+                * There is no synchronization between multiple threads
+                * trying to create the directory at once.  It's harmless
+                * to let them try, so just detect the problem and move on.
+                */
+               if (mkdir(dst, 0755) < 0 && errno != EEXIST) {
+                       fprintf(stderr,
+                               "Destination dir %s can't be made: %d/%s\n",
+                               dst, errno, strerror(errno));
                        return 1;
                }
        }
 
+       if (output_name)
+               snprintf(dst + len, dstlen - len, "%s.blktrace.%d",
+                        output_name, cpu);
+       else
+               snprintf(dst + len, dstlen - len, "%s.blktrace.%d",
+                        buts_name, cpu);
+
        return 0;
 }
 
-static int start_devices(void)
+static int set_vbuf(struct io_info *iop, int mode, size_t size)
 {
-       struct device_information *dip;
-       int i, j, size;
-
-       size = ncpus * sizeof(struct thread_information);
-       thread_information = malloc(size * ndevs);
-       if (!thread_information) {
-               fprintf(stderr, "Out of memory, threads (%d)\n", size * ndevs);
+       iop->obuf = malloc(size);
+       if (setvbuf(iop->ofp, iop->obuf, mode, size) < 0) {
+               fprintf(stderr, "setvbuf(%s, %d) failed: %d/%s\n",
+                       iop->dpp->path, (int)size, errno,
+                       strerror(errno));
+               free(iop->obuf);
                return 1;
        }
 
-       for_each_dip(dip, i) {
-               if (start_trace(dip)) {
-                       close(dip->fd);
-                       fprintf(stderr, "Failed to start trace on %s\n",
-                               dip->path);
-                       break;
-               }
-       }
+       return 0;
+}
 
-       if (i != ndevs) {
-               __for_each_dip(dip, j, i)
-                       stop_trace(dip);
+static int iop_open(struct io_info *iop, int cpu)
+{
+       char hostdir[MAXPATHLEN + 64];
 
-               return 1;
-       }
+       iop->ofd = -1;
+       if (net_mode == Net_server) {
+               struct cl_conn *nc = iop->nc;
+               int len;
 
-       for_each_dip(dip, i) {
-               dip->threads = thread_information + (i * ncpus);
-               if (start_threads(dip)) {
-                       fprintf(stderr, "Failed to start worker threads\n");
-                       break;
-               }
+               len = snprintf(hostdir, sizeof(hostdir), "%s-",
+                              nc->ch->hostname);
+               len += strftime(hostdir + len, sizeof(hostdir) - len, "%F-%T/",
+                               gmtime(&iop->dpp->cl_connect_time));
+       } else {
+               hostdir[0] = 0;
        }
 
-       if (i != ndevs) {
-               __for_each_dip(dip, j, i)
-                       stop_threads(dip);
-               for_each_dip(dip, i)
-                       stop_trace(dip);
+       if (fill_ofname(iop->ofn, sizeof(iop->ofn), hostdir,
+                       iop->dpp->buts_name, cpu))
+               return 1;
+
+       iop->ofp = my_fopen(iop->ofn, "w+");
+       if (iop->ofp == NULL) {
+               fprintf(stderr, "Open output file %s failed: %d/%s\n",
+                       iop->ofn, errno, strerror(errno));
+               return 1;
+       }
 
+       if (set_vbuf(iop, _IOLBF, FILE_VBUF_SIZE)) {
+               fprintf(stderr, "set_vbuf for file %s failed: %d/%s\n",
+                       iop->ofn, errno, strerror(errno));
+               fclose(iop->ofp);
                return 1;
        }
 
+       iop->ofd = fileno(iop->ofp);
        return 0;
 }
 
-static void show_stats(void)
+static void close_iop(struct io_info *iop)
 {
-       struct device_information *dip;
-       struct thread_information *tip;
-       unsigned long long events_processed, data_read;
-       unsigned long total_drops;
-       int i, j, no_stdout = 0;
+       struct mmap_info *mip = &iop->mmap_info;
+
+       if (mip->fs_buf)
+               munmap(mip->fs_buf, mip->fs_buf_len);
+
+       if (!piped_output) {
+               if (ftruncate(fileno(iop->ofp), mip->fs_size) < 0) {
+                       fprintf(stderr,
+                               "Ignoring err: ftruncate(%s): %d/%s\n",
+                               iop->ofn, errno, strerror(errno));
+               }
+       }
+
+       if (iop->ofp)
+               fclose(iop->ofp);
+       if (iop->obuf)
+               free(iop->obuf);
+}
 
-       if (is_stat_shown())
-               return;
+static void close_ios(struct tracer *tp)
+{
+       while (tp->nios > 0) {
+               struct io_info *iop = &tp->ios[--tp->nios];
 
-       if (output_name && !strcmp(output_name, "-"))
-               no_stdout = 1;
+               iop->dpp->drops = get_drops(iop->dpp);
+               if (iop->ifd >= 0)
+                       close(iop->ifd);
 
-       stat_shown = 1;
+               if (iop->ofp)
+                       close_iop(iop);
+               else if (iop->ofd >= 0) {
+                       struct devpath *dpp = iop->dpp;
 
-       total_drops = 0;
-       for_each_dip(dip, i) {
-               if (!no_stdout)
-                       printf("Device: %s\n", dip->path);
-               events_processed = 0;
-               data_read = 0;
-               for_each_tip(dip, tip, j) {
-                       if (!no_stdout)
-                               printf("  CPU%3d: %20lu events, %8llu KiB data\n",
-                                       tip->cpu, tip->events_processed,
-                                       (tip->data_read + 1023) >> 10);
-                       events_processed += tip->events_processed;
-                       data_read += tip->data_read;
+                       net_send_close(iop->ofd, dpp->buts_name, dpp->drops);
+                       net_close_connection(&iop->ofd);
                }
-               total_drops += dip->drop_count;
-               if (!no_stdout)
-                       printf("  Total:  %20llu events (dropped %lu), %8llu KiB data\n",
-                                       events_processed, dip->drop_count,
-                                       (data_read + 1023) >> 10);
        }
 
-       if (total_drops)
-               fprintf(stderr, "You have dropped events, consider using a larger buffer size (-b)\n");
+       free(tp->ios);
+       free(tp->pfds);
 }
 
-static struct device_information *net_get_dip(char *buts_name,
-                                             struct in_addr *cl_in_addr)
+static int open_ios(struct tracer *tp)
 {
-       struct device_information *dip;
-       int i;
+       struct pollfd *pfd;
+       struct io_info *iop;
+       struct list_head *p;
+
+       tp->ios = calloc(ndevs, sizeof(struct io_info));
+       memset(tp->ios, 0, ndevs * sizeof(struct io_info));
+
+       tp->pfds = calloc(ndevs, sizeof(struct pollfd));
+       memset(tp->pfds, 0, ndevs * sizeof(struct pollfd));
+
+       tp->nios = 0;
+       iop = tp->ios;
+       pfd = tp->pfds;
+       __list_for_each(p, &devpaths) {
+               struct devpath *dpp = list_entry(p, struct devpath, head);
+
+               iop->dpp = dpp;
+               iop->ofd = -1;
+               snprintf(iop->ifn, sizeof(iop->ifn), "%s/block/%s/trace%d",
+                       debugfs_path, dpp->buts_name, tp->cpu);
+
+               iop->ifd = my_open(iop->ifn, O_RDONLY | O_NONBLOCK);
+               if (iop->ifd < 0) {
+                       fprintf(stderr, "Thread %d failed open %s: %d/%s\n",
+                               tp->cpu, iop->ifn, errno, strerror(errno));
+                       return 1;
+               }
 
-       for (i = 0; i < ndevs; i++) {
-               dip = &device_information[i];
+               init_mmap_info(&iop->mmap_info);
+
+               pfd->fd = iop->ifd;
+               pfd->events = POLLIN;
+
+               if (piped_output)
+                       ;
+               else if (net_client_use_sendfile()) {
+                       iop->ofd = net_setup_client();
+                       if (iop->ofd < 0)
+                               goto err;
+                       net_send_open(iop->ofd, tp->cpu, dpp->buts_name);
+               } else if (net_mode == Net_none) {
+                       if (iop_open(iop, tp->cpu))
+                               goto err;
+               } else {
+                       /*
+                        * This ensures that the server knows about all
+                        * connections & devices before _any_ closes
+                        */
+                       net_send_open(cl_fds[tp->cpu], tp->cpu, dpp->buts_name);
+               }
 
-               if (!strcmp(dip->buts_name, buts_name))
-                       return dip;
+               pfd++;
+               iop++;
+               tp->nios++;
        }
 
-       device_information = realloc(device_information, (ndevs + 1) * sizeof(*dip));
-       dip = &device_information[ndevs];
-       memset(dip, 0, sizeof(*dip));
-       dip->fd = -1;
-       strcpy(dip->buts_name, buts_name);
-       dip->path = strdup(buts_name);
-       ndevs++;
-       dip->threads = malloc(ncpus * sizeof(struct thread_information));
-       memset(dip->threads, 0, ncpus * sizeof(struct thread_information));
+       return 0;
 
-       /*
-        * open all files
-        */
-       for (i = 0; i < ncpus; i++) {
-               struct thread_information *tip = &dip->threads[i];
+err:
+       close(iop->ifd);        /* tp->nios _not_ bumped */
+       close_ios(tp);
+       return 1;
+}
 
-               tip->cpu = i;
-               tip->device = dip;
-               tip->fd = -1;
-               tip->cl_in_addr = *cl_in_addr;
+static int handle_pfds_file(struct tracer *tp, int nevs, int force_read)
+{
+       struct mmap_info *mip;
+       int i, ret, nentries = 0;
+       struct pollfd *pfd = tp->pfds;
+       struct io_info *iop = tp->ios;
+
+       for (i = 0; nevs > 0 && i < ndevs; i++, pfd++, iop++) {
+               if (pfd->revents & POLLIN || force_read) {
+                       mip = &iop->mmap_info;
+
+                       ret = setup_mmap(iop->ofd, buf_size, mip, tp);
+                       if (ret < 0) {
+                               pfd->events = 0;
+                               break;
+                       }
 
-               if (tip_open_output(dip, tip))
-                       return NULL;
+                       ret = read(iop->ifd, mip->fs_buf + mip->fs_off,
+                                  buf_size);
+                       if (ret > 0) {
+                               pdc_dr_update(iop->dpp, tp->cpu, ret);
+                               mip->fs_size += ret;
+                               mip->fs_off += ret;
+                               nentries++;
+                       } else if (ret == 0) {
+                               /*
+                                * Short reads after we're done stop us
+                                * from trying reads.
+                                */
+                               if (tp->is_done)
+                                       clear_events(pfd);
+                       } else {
+                               read_err(tp->cpu, iop->ifn);
+                               if (errno != EAGAIN || tp->is_done)
+                                       clear_events(pfd);
+                       }
+                       nevs--;
+               }
        }
 
-       return dip;
+       return nentries;
 }
 
-static struct thread_information *net_get_tip(struct blktrace_net_hdr *bnh,
-                                             struct in_addr *cl_in_addr)
+static int handle_pfds_netclient(struct tracer *tp, int nevs, int force_read)
 {
-       struct device_information *dip;
+       struct stat sb;
+       int i, nentries = 0;
+       struct pollfd *pfd = tp->pfds;
+       struct io_info *iop = tp->ios;
+
+       for (i = 0; i < ndevs; i++, pfd++, iop++) {
+               if (pfd->revents & POLLIN || force_read) {
+                       if (fstat(iop->ifd, &sb) < 0) {
+                               perror(iop->ifn);
+                               pfd->events = 0;
+                       } else if (sb.st_size > (off_t)iop->data_queued) {
+                               iop->ready = sb.st_size - iop->data_queued;
+                               iop->data_queued = sb.st_size;
+
+                               if (!net_sendfile_data(tp, iop)) {
+                                       pdc_dr_update(iop->dpp, tp->cpu,
+                                                     iop->ready);
+                                       nentries++;
+                               } else
+                                       clear_events(pfd);
+                       }
+                       if (--nevs == 0)
+                               break;
+               }
+       }
+
+       if (nentries)
+               incr_entries(nentries);
 
-       ncpus = bnh->max_cpus;
-       dip = net_get_dip(bnh->buts_name, cl_in_addr);
-       return &dip->threads[bnh->cpu];
+       return nentries;
 }
 
-static int net_get_header(struct blktrace_net_hdr *bnh)
+static int handle_pfds_entries(struct tracer *tp, int nevs, int force_read)
 {
-       int fl = fcntl(net_in_fd, F_GETFL);
-       int bytes_left, ret;
-       void *p = bnh;
-
-       fcntl(net_in_fd, F_SETFL, fl | O_NONBLOCK);
-       bytes_left = sizeof(*bnh);
-       while (bytes_left && !is_done()) {
-               ret = recv(net_in_fd, p, bytes_left, MSG_WAITALL);
-               if (ret < 0) {
-                       if (errno != EAGAIN) {
-                               perror("recv header");
-                               return 1;
+       int i, nentries = 0;
+       struct trace_buf *tbp;
+       struct pollfd *pfd = tp->pfds;
+       struct io_info *iop = tp->ios;
+
+       tbp = alloc_trace_buf(tp->cpu, buf_size);
+       for (i = 0; i < ndevs; i++, pfd++, iop++) {
+               if (pfd->revents & POLLIN || force_read) {
+                       tbp->len = read(iop->ifd, tbp->buf, buf_size);
+                       if (tbp->len > 0) {
+                               pdc_dr_update(iop->dpp, tp->cpu, tbp->len);
+                               add_trace_buf(iop->dpp, tp->cpu, &tbp);
+                               nentries++;
+                       } else if (tbp->len == 0) {
+                               /*
+                                * Short reads after we're done stop us
+                                * from trying reads.
+                                */
+                               if (tp->is_done)
+                                       clear_events(pfd);
+                       } else {
+                               read_err(tp->cpu, iop->ifn);
+                               if (errno != EAGAIN || tp->is_done)
+                                       clear_events(pfd);
                        }
-                       usleep(1000);
-                       continue;
-               } else if (!ret) {
-                       usleep(1000);
-                       continue;
-               } else {
-                       p += ret;
-                       bytes_left -= ret;
+                       if (!piped_output && --nevs == 0)
+                               break;
                }
        }
-       fcntl(net_in_fd, F_SETFL, fl & ~O_NONBLOCK);
-       return bytes_left;
+       free(tbp);
+
+       if (nentries)
+               incr_entries(nentries);
+
+       return nentries;
 }
 
-static int net_server_loop(struct in_addr *cl_in_addr)
+static void *thread_main(void *arg)
 {
-       struct thread_information *tip;
-       struct blktrace_net_hdr bnh;
+       int ret, ndone, to_val;
+       struct tracer *tp = arg;
 
-       if (net_get_header(&bnh))
-               return 1;
+       ret = lock_on_cpu(tp->cpu);
+       if (ret)
+               goto err;
 
-       if (data_is_native == -1 && check_data_endianness(bnh.magic)) {
-               fprintf(stderr, "server: received data is bad\n");
-               return 1;
-       }
+       ret = open_ios(tp);
+       if (ret)
+               goto err;
 
-       if (!data_is_native) {
-               bnh.magic = be32_to_cpu(bnh.magic);
-               bnh.cpu = be32_to_cpu(bnh.cpu);
-               bnh.len = be32_to_cpu(bnh.len);
-       }
+       if (piped_output)
+               to_val = 50;            /* Frequent partial handles */
+       else
+               to_val = 500;           /* 1/2 second intervals */
 
-       if ((bnh.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
-               fprintf(stderr, "server: bad data magic\n");
-               return 1;
+
+       tracer_signal_ready(tp, Th_running, 0);
+       tracer_wait_unblock(tp);
+
+       while (!tp->is_done) {
+               ndone = poll(tp->pfds, ndevs, to_val);
+               if (ndone || piped_output)
+                       (void)handle_pfds(tp, ndone, piped_output);
+               else if (ndone < 0 && errno != EINTR)
+                       fprintf(stderr, "Thread %d poll failed: %d/%s\n",
+                               tp->cpu, errno, strerror(errno));
        }
 
        /*
-        * len == 0 means that the other end signalled end-of-run
+        * Trace is stopped, pull data until we get a short read
         */
-       if (!bnh.len) {
-               /*
-                * overload cpu count with dropped events
-                */
-               struct device_information *dip;
+       while (handle_pfds(tp, ndevs, 1) > 0)
+               ;
 
-               dip = net_get_dip(bnh.buts_name, cl_in_addr);
-               dip->drop_count = bnh.cpu;
+       close_ios(tp);
+       tracer_signal_ready(tp, Th_leaving, 0);
+       return NULL;
 
-               fprintf(stderr, "server: end of run\n");
-               return 1;
-       }
+err:
+       tracer_signal_ready(tp, Th_error, ret);
+       return NULL;
+}
 
-       tip = net_get_tip(&bnh, cl_in_addr);
-       if (!tip)
-               return 1;
+static int start_tracer(int cpu)
+{
+       struct tracer *tp;
 
-       if (mmap_subbuf(tip, bnh.len))
+       tp = malloc(sizeof(*tp));
+       memset(tp, 0, sizeof(*tp));
+
+       INIT_LIST_HEAD(&tp->head);
+       tp->status = 0;
+       tp->cpu = cpu;
+
+       if (pthread_create(&tp->thread, NULL, thread_main, tp)) {
+               fprintf(stderr, "FAILED to start thread on CPU %d: %d/%s\n",
+                       cpu, errno, strerror(errno));
+               free(tp);
                return 1;
+       }
 
+       list_add_tail(&tp->head, &tracers);
        return 0;
 }
 
-static int get_connection(int fd, struct sockaddr_in *addr)
+static int create_output_files(int cpu)
 {
-       struct pollfd pfd = { .fd = fd, .events = POLLIN };
-       socklen_t socklen;
+       char fname[MAXPATHLEN + 64];
+       struct list_head *p;
+       FILE *f;
 
-       printf("blktrace: waiting for incoming connection...\n");
+       __list_for_each(p, &devpaths) {
+               struct devpath *dpp = list_entry(p, struct devpath, head);
 
-       if (poll(&pfd, 1, -1) < 0) {
-               perror("poll for connection");
-               return 1;
+               if (fill_ofname(fname, sizeof(fname), NULL, dpp->buts_name,
+                               cpu))
+                       return 1;
+               f = my_fopen(fname, "w+");
+               if (!f)
+                       return 1;
+               fclose(f);
        }
-       if ((pfd.revents & POLLIN) == 0)
-               return 1;
+       return 0;
+}
 
-       socklen = sizeof(*addr);
-       net_in_fd = accept(fd, (struct sockaddr *) addr, &socklen);
-       if (net_in_fd < 0) {
-               perror("accept");
-               return 1;
+static void start_tracers(void)
+{
+       int cpu, started = 0;
+       struct list_head *p;
+       size_t alloc_size = CPU_ALLOC_SIZE(max_cpus);
+
+       for (cpu = 0; cpu < max_cpus; cpu++) {
+               if (!CPU_ISSET_S(cpu, alloc_size, online_cpus)) {
+                       /*
+                        * Create fake empty output files so that other tools
+                        * like blkparse don't have to bother with sparse CPU
+                        * number space.
+                        */
+                       if (create_output_files(cpu))
+                               break;
+                       continue;
+               }
+               if (start_tracer(cpu))
+                       break;
+               started++;
        }
 
-       printf("blktrace: connection from %s\n", inet_ntoa(addr->sin_addr));
-       return 0;
+       wait_tracers_ready(started);
+
+       __list_for_each(p, &tracers) {
+               struct tracer *tp = list_entry(p, struct tracer, head);
+               if (tp->status)
+                       fprintf(stderr,
+                               "FAILED to start thread on CPU %d: %d/%s\n",
+                               tp->cpu, tp->status, strerror(tp->status));
+       }
 }
 
-/*
- * Start here when we are in server mode - just fetch data from the network
- * and dump to files
- */
-static int net_server(void)
+static void stop_tracers(void)
 {
-       struct device_information *dip;
-       struct thread_information *tip;
-       struct sockaddr_in addr;
-       int fd, opt, i, j;
+       struct list_head *p;
 
-       fd = socket(AF_INET, SOCK_STREAM, 0);
-       if (fd < 0) {
-               perror("server: socket");
-               return 1;
+       /*
+        * Stop the tracing - makes the tracer threads clean up quicker.
+        */
+       __list_for_each(p, &devpaths) {
+               struct devpath *dpp = list_entry(p, struct devpath, head);
+               (void)ioctl(dpp->fd, BLKTRACESTOP);
        }
 
-       opt = 1;
-       if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0) {
-               perror("setsockopt");
-               return 1;
+       /*
+        * Tell each tracer to quit
+        */
+       __list_for_each(p, &tracers) {
+               struct tracer *tp = list_entry(p, struct tracer, head);
+               tp->is_done = 1;
        }
+       pthread_cond_broadcast(&mt_cond);
+}
 
-       memset(&addr, 0, sizeof(addr));
-       addr.sin_family = AF_INET;
-       addr.sin_addr.s_addr = htonl(INADDR_ANY);
-       addr.sin_port = htons(net_port);
+static void del_tracers(void)
+{
+       struct list_head *p, *q;
 
-       if (bind(fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
-               perror("bind");
-               return 1;
-       }
+       list_for_each_safe(p, q, &tracers) {
+               struct tracer *tp = list_entry(p, struct tracer, head);
 
-       if (listen(fd, 1) < 0) {
-               perror("listen");
-               return 1;
+               list_del(&tp->head);
+               free(tp);
        }
+}
 
-repeat:
-       if (get_connection(fd, &addr))
-               return 0;
+static void wait_tracers(void)
+{
+       struct list_head *p;
 
-       while (!is_done()) {
-               if (net_server_loop(&addr.sin_addr))
-                       break;
-       }
+       if (use_tracer_devpaths())
+               process_trace_bufs();
 
-       for_each_dip(dip, i)
-               for_each_tip(dip, tip, j)
-                       tip_ftrunc_final(tip);
+       wait_tracers_leaving();
 
-       show_stats();
+       __list_for_each(p, &tracers) {
+               int ret;
+               struct tracer *tp = list_entry(p, struct tracer, head);
 
-       if (is_done())
-               return 0;
+               ret = pthread_join(tp->thread, NULL);
+               if (ret)
+                       fprintf(stderr, "Thread join %d failed %d\n",
+                               tp->cpu, ret);
+       }
 
-       /*
-        * cleanup for next run
-        */
-       for_each_dip(dip, i) {
-               for_each_tip(dip, tip, j)
-                       fclose(tip->ofile);
+       if (use_tracer_devpaths())
+               clean_trace_bufs();
 
-               free(dip->threads);
-               free(dip->path);
-       }
+       get_all_drops();
+}
 
-       free(device_information);
-       device_information = NULL;
-       ncpus = ndevs = 0;
+static void exit_tracing(void)
+{
+       signal(SIGINT, SIG_IGN);
+       signal(SIGHUP, SIG_IGN);
+       signal(SIGTERM, SIG_IGN);
+       signal(SIGALRM, SIG_IGN);
+
+       stop_tracers();
+       wait_tracers();
+       del_tracers();
+       rel_devpaths();
+}
 
-       close(net_in_fd);
-       net_in_fd = -1;
-       stat_shown = 0;
-       goto repeat;
+static void handle_sigint(__attribute__((__unused__)) int sig)
+{
+       done = 1;
+       stop_tracers();
 }
 
-/*
- * Setup outgoing network connection where we will transmit data
- */
-static int net_setup_client(void)
+static void show_stats(struct list_head *devpaths)
 {
-       struct sockaddr_in addr;
-       int fd;
+       FILE *ofp;
+       struct list_head *p;
+       unsigned long long nevents, data_read;
+       unsigned long long total_drops = 0;
+       unsigned long long total_events = 0;
+
+       if (piped_output)
+               ofp = my_fopen("/dev/null", "w");
+       else
+               ofp = stdout;
 
-       fd = socket(AF_INET, SOCK_STREAM, 0);
-       if (fd < 0) {
-               perror("client: socket");
-               return 1;
-       }
+       __list_for_each(p, devpaths) {
+               int cpu;
+               struct pdc_stats *sp;
+               struct devpath *dpp = list_entry(p, struct devpath, head);
 
-       memset(&addr, 0, sizeof(addr));
-       addr.sin_family = AF_INET;
-       addr.sin_port = htons(net_port);
+               if (net_mode == Net_server)
+                       printf("server: end of run for %s:%s\n",
+                               dpp->ch->hostname, dpp->buts_name);
 
-       if (inet_aton(hostname, &addr.sin_addr) != 1) {
-               struct hostent *hent = gethostbyname(hostname);
-               if (!hent) {
-                       perror("gethostbyname");
-                       return 1;
-               }
+               data_read = 0;
+               nevents = 0;
+
+               fprintf(ofp, "=== %s ===\n", dpp->buts_name);
+               for (cpu = 0, sp = dpp->stats; cpu < dpp->ncpus; cpu++, sp++) {
+                       /*
+                        * Estimate events if not known...
+                        */
+                       if (sp->nevents == 0) {
+                               sp->nevents = sp->data_read /
+                                               sizeof(struct blk_io_trace);
+                       }
 
-               memcpy(&addr.sin_addr, hent->h_addr, 4);
-               strcpy(hostname, hent->h_name);
-       }
+                       fprintf(ofp,
+                               "  CPU%3d: %20llu events, %8llu KiB data\n",
+                               cpu, sp->nevents, (sp->data_read + 1023) >> 10);
 
-       printf("blktrace: connecting to %s\n", hostname);
+                       data_read += sp->data_read;
+                       nevents += sp->nevents;
+               }
 
-       if (connect(fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
-               perror("client: connect");
-               return 1;
+               fprintf(ofp, "  Total:  %20llu events (dropped %llu),"
+                            " %8llu KiB data\n", nevents,
+                            dpp->drops, (data_read + 1024) >> 10);
+
+               total_drops += dpp->drops;
+               total_events += (nevents + dpp->drops);
        }
 
-       printf("blktrace: connected!\n");
-       net_out_fd = fd;
-       return 0;
-}
+       fflush(ofp);
+       if (piped_output)
+               fclose(ofp);
 
-static char usage_str[] = \
-       "-d <dev> [ -r relay path ] [ -o <output> ] [-k ] [ -w time ]\n" \
-       "[ -a action ] [ -A action mask ] [ -v ]\n\n" \
-       "\t-d Use specified device. May also be given last after options\n" \
-       "\t-r Path to mounted relayfs, defaults to /relay\n" \
-       "\t-o File(s) to send output to\n" \
-       "\t-D Directory to prepend to output file names\n" \
-       "\t-k Kill a running trace\n" \
-       "\t-w Stop after defined time, in seconds\n" \
-       "\t-a Only trace specified actions. See documentation\n" \
-       "\t-A Give trace mask as a single value. See documentation\n" \
-       "\t-b Sub buffer size in KiB\n" \
-       "\t-n Number of sub buffers\n" \
-       "\t-l Run in network listen mode (blktrace server)\n" \
-       "\t-h Run in network client mode, connecting to the given host\n" \
-       "\t-p Network port to use (default 8462)\n" \
-       "\t-s Make the network client use sendfile() to transfer data\n" \
-       "\t-V Print program version info\n\n";
+       if (total_drops) {
+               double drops_ratio = 1.0;
 
-static void show_usage(char *program)
-{
-       fprintf(stderr, "Usage: %s %s %s",program, blktrace_version, usage_str);
+               if (total_events)
+                       drops_ratio = (double)total_drops/(double)total_events;
+
+               fprintf(stderr, "\nYou have %llu (%5.1lf%%) dropped events\n"
+                               "Consider using a larger buffer size (-b) "
+                               "and/or more buffers (-n)\n",
+                       total_drops, 100.0 * drops_ratio);
+       }
 }
 
-int main(int argc, char *argv[])
+static int handle_args(int argc, char *argv[])
 {
-       static char default_relay_path[] = "/relay";
+       int c, i;
        struct statfs st;
-       int i, c;
-       int stop_watch = 0;
        int act_mask_tmp = 0;
 
        while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) >= 0) {
@@ -1560,7 +2111,7 @@ int main(int argc, char *argv[])
                case 'a':
                        i = find_mask_map(optarg);
                        if (i < 0) {
-                               fprintf(stderr,"Invalid action mask %s\n",
+                               fprintf(stderr, "Invalid action mask %s\n",
                                        optarg);
                                return 1;
                        }
@@ -1568,7 +2119,7 @@ int main(int argc, char *argv[])
                        break;
 
                case 'A':
-                       if ((sscanf(optarg, "%x", &i) != 1) || 
+                       if ((sscanf(optarg, "%x", &i) != 1) ||
                                                        !valid_act_opt(i)) {
                                fprintf(stderr,
                                        "Invalid set action mask %s/0x%x\n",
@@ -1579,12 +2130,33 @@ int main(int argc, char *argv[])
                        break;
 
                case 'd':
-                       if (resize_devices(optarg) != 0)
+                       if (add_devpath(optarg) != 0)
+                               return 1;
+                       break;
+
+               case 'I': {
+                       char dev_line[256];
+                       FILE *ifp = my_fopen(optarg, "r");
+
+                       if (!ifp) {
+                               fprintf(stderr,
+                                       "Invalid file for devices %s\n",
+                                       optarg);
                                return 1;
+                       }
+
+                       while (fscanf(ifp, "%s\n", dev_line) == 1) {
+                               if (add_devpath(dev_line) != 0) {
+                                       fclose(ifp);
+                                       return 1;
+                               }
+                       }
+                       fclose(ifp);
                        break;
+               }
 
                case 'r':
-                       relay_path = optarg;
+                       debugfs_path = optarg;
                        break;
 
                case 'o':
@@ -1603,13 +2175,15 @@ int main(int argc, char *argv[])
                        }
                        break;
                case 'V':
+               case 'v':
                        printf("%s version %s\n", argv[0], blktrace_version);
-                       return 0;
+                       exit(0);
+                       /*NOTREACHED*/
                case 'b':
                        buf_size = strtoul(optarg, NULL, 10);
                        if (buf_size <= 0 || buf_size > 16*1024) {
-                               fprintf(stderr,
-                                       "Invalid buffer size (%lu)\n",buf_size);
+                               fprintf(stderr, "Invalid buffer size (%lu)\n",
+                                       buf_size);
                                return 1;
                        }
                        buf_size <<= 10;
@@ -1627,7 +2201,9 @@ int main(int argc, char *argv[])
                        break;
                case 'h':
                        net_mode = Net_client;
-                       strcpy(hostname, optarg);
+                       memset(hostname, 0, sizeof(hostname));
+                       strncpy(hostname, optarg, sizeof(hostname));
+                       hostname[sizeof(hostname) - 1] = '\0';
                        break;
                case 'l':
                        net_mode = Net_server;
@@ -1636,88 +2212,634 @@ int main(int argc, char *argv[])
                        net_port = atoi(optarg);
                        break;
                case 's':
-                       net_use_sendfile = 1;
+                       net_use_sendfile = 0;
                        break;
                default:
                        show_usage(argv[0]);
-                       return 1;
+                       exit(1);
+                       /*NOTREACHED*/
                }
        }
 
-       setlocale(LC_NUMERIC, "en_US");
-
-       page_size = getpagesize();
-
-       if (net_mode == Net_server)
-               return net_server();
-
-       while (optind < argc) {
-               if (resize_devices(argv[optind++]) != 0)
+       while (optind < argc)
+               if (add_devpath(argv[optind++]) != 0)
                        return 1;
-       }
 
-       if (ndevs == 0) {
+       if (net_mode != Net_server && ndevs == 0) {
                show_usage(argv[0]);
                return 1;
        }
 
-       if (!relay_path)
-               relay_path = default_relay_path;
+       if (statfs(debugfs_path, &st) < 0) {
+               fprintf(stderr, "Invalid debug path %s: %d/%s\n",
+                       debugfs_path, errno, strerror(errno));
+               return 1;
+       }
+
+       if (st.f_type != (long)DEBUGFS_TYPE) {
+               fprintf(stderr, "Debugfs is not mounted at %s\n", debugfs_path);
+               return 1;
+       }
 
        if (act_mask_tmp != 0)
                act_mask = act_mask_tmp;
 
-       if (statfs(relay_path, &st) < 0) {
-               perror("statfs");
-               fprintf(stderr,"%s does not appear to be a valid path\n",
-                       relay_path);
-               return 1;
-       } else if (st.f_type != (long) RELAYFS_TYPE) {
-               fprintf(stderr,"%s does not appear to be a relay filesystem\n",
-                       relay_path);
+       if (net_mode == Net_client && net_setup_addr())
                return 1;
+
+       /*
+        * Set up for appropriate PFD handler based upon output name.
+        */
+       if (net_client_use_sendfile())
+               handle_pfds = handle_pfds_netclient;
+       else if (net_client_use_send())
+               handle_pfds = handle_pfds_entries;
+       else if (output_name && (strcmp(output_name, "-") == 0)) {
+               piped_output = 1;
+               handle_pfds = handle_pfds_entries;
+               pfp = stdout;
+               if (setvbuf(pfp, NULL, _IONBF, 0)) {
+                       perror("setvbuf stdout");
+                       return 1;
+               }
+       } else
+               handle_pfds = handle_pfds_file;
+       return 0;
+}
+
+static void ch_add_connection(struct net_server_s *ns, struct cl_host *ch,
+                             int fd)
+{
+       struct cl_conn *nc;
+
+       nc = malloc(sizeof(*nc));
+       memset(nc, 0, sizeof(*nc));
+
+       time(&nc->connect_time);
+       nc->ch = ch;
+       nc->fd = fd;
+       nc->ncpus = -1;
+
+       list_add_tail(&nc->ch_head, &ch->conn_list);
+       ch->connects++;
+
+       list_add_tail(&nc->ns_head, &ns->conn_list);
+       ns->connects++;
+       ns->pfds = realloc(ns->pfds, (ns->connects+1) * sizeof(struct pollfd));
+}
+
+static void ch_rem_connection(struct net_server_s *ns, struct cl_host *ch,
+                             struct cl_conn *nc)
+{
+       net_close_connection(&nc->fd);
+
+       list_del(&nc->ch_head);
+       ch->connects--;
+
+       list_del(&nc->ns_head);
+       ns->connects--;
+       ns->pfds = realloc(ns->pfds, (ns->connects+1) * sizeof(struct pollfd));
+
+       free(nc);
+}
+
+static struct cl_host *net_find_client_host(struct net_server_s *ns,
+                                           struct in_addr cl_in_addr)
+{
+       struct list_head *p;
+
+       __list_for_each(p, &ns->ch_list) {
+               struct cl_host *ch = list_entry(p, struct cl_host, head);
+
+               if (in_addr_eq(ch->cl_in_addr, cl_in_addr))
+                       return ch;
        }
 
-       if (open_devices() != 0)
-               return 1;
+       return NULL;
+}
 
-       if (kill_running_trace) {
-               stop_all_traces();
+static struct cl_host *net_add_client_host(struct net_server_s *ns,
+                                          struct sockaddr_in *addr)
+{
+       struct cl_host *ch;
+
+       ch = malloc(sizeof(*ch));
+       memset(ch, 0, sizeof(*ch));
+
+       ch->ns = ns;
+       ch->cl_in_addr = addr->sin_addr;
+       list_add_tail(&ch->head, &ns->ch_list);
+       ns->nchs++;
+
+       ch->hostname = strdup(inet_ntoa(addr->sin_addr));
+       printf("server: connection from %s\n", ch->hostname);
+
+       INIT_LIST_HEAD(&ch->conn_list);
+       INIT_LIST_HEAD(&ch->devpaths);
+
+       return ch;
+}
+
+static void device_done(struct devpath *dpp, int ncpus)
+{
+       int cpu;
+       struct io_info *iop;
+
+       for (cpu = 0, iop = dpp->ios; cpu < ncpus; cpu++, iop++)
+               close_iop(iop);
+
+       list_del(&dpp->head);
+       dpp_free(dpp);
+}
+
+static void net_ch_remove(struct cl_host *ch, int ncpus)
+{
+       struct list_head *p, *q;
+       struct net_server_s *ns = ch->ns;
+
+       list_for_each_safe(p, q, &ch->devpaths) {
+               struct devpath *dpp = list_entry(p, struct devpath, head);
+               device_done(dpp, ncpus);
+       }
+
+       list_for_each_safe(p, q, &ch->conn_list) {
+               struct cl_conn *nc = list_entry(p, struct cl_conn, ch_head);
+
+               ch_rem_connection(ns, ch, nc);
+       }
+
+       list_del(&ch->head);
+       ns->nchs--;
+
+       if (ch->hostname)
+               free(ch->hostname);
+       free(ch);
+}
+
+static void net_add_connection(struct net_server_s *ns)
+{
+       int fd;
+       struct cl_host *ch;
+       socklen_t socklen = sizeof(ns->addr);
+
+       fd = my_accept(ns->listen_fd, (struct sockaddr *)&ns->addr, &socklen);
+       if (fd < 0) {
+               /*
+                * This is OK: we just won't accept this connection,
+                * nothing fatal.
+                */
+               perror("accept");
+       } else {
+               ch = net_find_client_host(ns, ns->addr.sin_addr);
+               if (!ch)
+                       ch = net_add_client_host(ns, &ns->addr);
+
+               ch_add_connection(ns, ch, fd);
+       }
+}
+
+static struct devpath *nc_add_dpp(struct cl_conn *nc,
+                                 struct blktrace_net_hdr *bnh,
+                                 time_t connect_time)
+{
+       int cpu;
+       struct io_info *iop;
+       struct devpath *dpp;
+
+       dpp = malloc(sizeof(*dpp));
+       memset(dpp, 0, sizeof(*dpp));
+
+       dpp->buts_name = strdup(bnh->buts_name);
+       dpp->path = strdup(bnh->buts_name);
+       dpp->fd = -1;
+       dpp->ch = nc->ch;
+       dpp->cl_id = bnh->cl_id;
+       dpp->cl_connect_time = connect_time;
+       dpp->ncpus = nc->ncpus;
+       dpp->stats = calloc(dpp->ncpus, sizeof(*dpp->stats));
+       memset(dpp->stats, 0, dpp->ncpus * sizeof(*dpp->stats));
+
+       list_add_tail(&dpp->head, &nc->ch->devpaths);
+       nc->ch->ndevs++;
+
+       dpp->ios = calloc(nc->ncpus, sizeof(*iop));
+       memset(dpp->ios, 0, ndevs * sizeof(*iop));
+
+       for (cpu = 0, iop = dpp->ios; cpu < nc->ncpus; cpu++, iop++) {
+               iop->dpp = dpp;
+               iop->nc = nc;
+               init_mmap_info(&iop->mmap_info);
+
+               if (iop_open(iop, cpu))
+                       goto err;
+       }
+
+       return dpp;
+
+err:
+       /*
+        * Need to unravel what's been done...
+        */
+       while (cpu >= 0)
+               close_iop(&dpp->ios[cpu--]);
+       dpp_free(dpp);
+
+       return NULL;
+}
+
+static struct devpath *nc_find_dpp(struct cl_conn *nc,
+                                  struct blktrace_net_hdr *bnh)
+{
+       struct list_head *p;
+       time_t connect_time = nc->connect_time;
+
+       __list_for_each(p, &nc->ch->devpaths) {
+               struct devpath *dpp = list_entry(p, struct devpath, head);
+
+               if (!strcmp(dpp->buts_name, bnh->buts_name))
+                       return dpp;
+
+               if (dpp->cl_id == bnh->cl_id)
+                       connect_time = dpp->cl_connect_time;
+       }
+
+       return nc_add_dpp(nc, bnh, connect_time);
+}
+
+static void net_client_read_data(struct cl_conn *nc, struct devpath *dpp,
+                                struct blktrace_net_hdr *bnh)
+{
+       int ret;
+       struct io_info *iop = &dpp->ios[bnh->cpu];
+       struct mmap_info *mip = &iop->mmap_info;
+
+       if (setup_mmap(iop->ofd, bnh->len, &iop->mmap_info, NULL)) {
+               fprintf(stderr, "ncd(%s:%d): mmap failed\n",
+                       nc->ch->hostname, nc->fd);
+               exit(1);
+       }
+
+       ret = net_recv_data(nc->fd, mip->fs_buf + mip->fs_off, bnh->len);
+       if (ret > 0) {
+               pdc_dr_update(dpp, bnh->cpu, ret);
+               mip->fs_size += ret;
+               mip->fs_off += ret;
+       } else if (ret < 0)
+               exit(1);
+}
+
+/*
+ * Returns 1 if we closed a host - invalidates other polling information
+ * that may be present.
+ */
+static int net_client_data(struct cl_conn *nc)
+{
+       int ret;
+       struct devpath *dpp;
+       struct blktrace_net_hdr bnh;
+
+       ret = net_get_header(nc, &bnh);
+       if (ret == 0)
                return 0;
+
+       if (ret < 0) {
+               fprintf(stderr, "ncd(%d): header read failed\n", nc->fd);
+               exit(1);
        }
 
-       ncpus = sysconf(_SC_NPROCESSORS_ONLN);
-       if (ncpus < 0) {
-               fprintf(stderr, "sysconf(_SC_NPROCESSORS_ONLN) failed\n");
-               return 1;
+       if (data_is_native == -1 && check_data_endianness(bnh.magic)) {
+               fprintf(stderr, "ncd(%d): received data is bad\n", nc->fd);
+               exit(1);
        }
 
-       signal(SIGINT, handle_sigint);
-       signal(SIGHUP, handle_sigint);
-       signal(SIGTERM, handle_sigint);
-       signal(SIGALRM, handle_sigint);
+       if (!data_is_native) {
+               bnh.magic = be32_to_cpu(bnh.magic);
+               bnh.cpu = be32_to_cpu(bnh.cpu);
+               bnh.max_cpus = be32_to_cpu(bnh.max_cpus);
+               bnh.len = be32_to_cpu(bnh.len);
+               bnh.cl_id = be32_to_cpu(bnh.cl_id);
+               bnh.buf_size = be32_to_cpu(bnh.buf_size);
+               bnh.buf_nr = be32_to_cpu(bnh.buf_nr);
+               bnh.page_size = be32_to_cpu(bnh.page_size);
+       }
 
-       if (net_mode == Net_client && net_setup_client())
-               return 1;
+       if ((bnh.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
+               fprintf(stderr, "ncd(%s:%d): bad data magic\n",
+                       nc->ch->hostname, nc->fd);
+               exit(1);
+       }
 
-       if (start_devices() != 0)
-               return 1;
+       if (nc->ncpus == -1)
+               nc->ncpus = bnh.max_cpus;
+
+       /*
+        * len == 0 means the other end is sending us a new connection/dpp
+        * len == 1 means that the other end signalled end-of-run
+        */
+       dpp = nc_find_dpp(nc, &bnh);
+       if (bnh.len == 0) {
+               /*
+                * Just adding in the dpp above is enough
+                */
+               ack_open_close(nc->fd, dpp->buts_name);
+               nc->ch->cl_opens++;
+       } else if (bnh.len == 1) {
+               /*
+                * overload cpu count with dropped events
+                */
+               dpp->drops = bnh.cpu;
+
+               ack_open_close(nc->fd, dpp->buts_name);
+               if (--nc->ch->cl_opens == 0) {
+                       show_stats(&nc->ch->devpaths);
+                       net_ch_remove(nc->ch, nc->ncpus);
+                       return 1;
+               }
+       } else
+               net_client_read_data(nc, dpp, &bnh);
+
+       return 0;
+}
+
+static void handle_client_data(struct net_server_s *ns, int events)
+{
+       struct cl_conn *nc;
+       struct pollfd *pfd;
+       struct list_head *p, *q;
+
+       pfd = &ns->pfds[1];
+       list_for_each_safe(p, q, &ns->conn_list) {
+               if (pfd->revents & POLLIN) {
+                       nc = list_entry(p, struct cl_conn, ns_head);
+
+                       if (net_client_data(nc) || --events == 0)
+                               break;
+               }
+               pfd++;
+       }
+}
 
-       atexit(stop_all_tracing);
+static void net_setup_pfds(struct net_server_s *ns)
+{
+       struct pollfd *pfd;
+       struct list_head *p;
 
-       if (stop_watch)
-               alarm(stop_watch);
+       ns->pfds[0].fd = ns->listen_fd;
+       ns->pfds[0].events = POLLIN;
 
-       wait_for_threads();
+       pfd = &ns->pfds[1];
+       __list_for_each(p, &ns->conn_list) {
+               struct cl_conn *nc = list_entry(p, struct cl_conn, ns_head);
 
-       if (!is_trace_stopped()) {
-               trace_stopped = 1;
-               stop_all_threads();
-               stop_all_traces();
+               pfd->fd = nc->fd;
+               pfd->events = POLLIN;
+               pfd++;
        }
+}
+
+static int net_server_handle_connections(struct net_server_s *ns)
+{
+       int events;
+
+       printf("server: waiting for connections...\n");
+
+       while (!done) {
+               net_setup_pfds(ns);
+               events = poll(ns->pfds, ns->connects + 1, -1);
+               if (events < 0) {
+                       if (errno != EINTR) {
+                               perror("FATAL: poll error");
+                               return 1;
+                       }
+               } else if (events > 0) {
+                       if (ns->pfds[0].revents & POLLIN) {
+                               net_add_connection(ns);
+                               events--;
+                       }
 
-       show_stats();
+                       if (events)
+                               handle_client_data(ns, events);
+               }
+       }
 
        return 0;
 }
 
+static int net_server(void)
+{
+       int fd, opt;
+       int ret = 1;
+       struct net_server_s net_server;
+       struct net_server_s *ns = &net_server;
+
+       memset(ns, 0, sizeof(*ns));
+       INIT_LIST_HEAD(&ns->ch_list);
+       INIT_LIST_HEAD(&ns->conn_list);
+       ns->pfds = malloc(sizeof(struct pollfd));
+
+       fd = my_socket(AF_INET, SOCK_STREAM, 0);
+       if (fd < 0) {
+               perror("server: socket");
+               goto out;
+       }
+
+       opt = 1;
+       if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0) {
+               perror("setsockopt");
+               goto out;
+       }
+
+       memset(&ns->addr, 0, sizeof(ns->addr));
+       ns->addr.sin_family = AF_INET;
+       ns->addr.sin_addr.s_addr = htonl(INADDR_ANY);
+       ns->addr.sin_port = htons(net_port);
+
+       if (bind(fd, (struct sockaddr *) &ns->addr, sizeof(ns->addr)) < 0) {
+               perror("bind");
+               goto out;
+       }
+
+       if (listen(fd, 1) < 0) {
+               perror("listen");
+               goto out;
+       }
+
+       /*
+        * The actual server looping is done here:
+        */
+       ns->listen_fd = fd;
+       ret = net_server_handle_connections(ns);
+
+       /*
+        * Clean up and return...
+        */
+out:
+       free(ns->pfds);
+       return ret;
+}
+
+static int run_tracers(void)
+{
+       atexit(exit_tracing);
+       if (net_mode == Net_client)
+               printf("blktrace: connecting to %s\n", hostname);
+
+       if (setup_buts())
+               return 1;
+
+       if (use_tracer_devpaths()) {
+               if (setup_tracer_devpaths())
+                       return 1;
+
+               if (piped_output)
+                       handle_list = handle_list_file;
+               else
+                       handle_list = handle_list_net;
+       }
+
+       start_tracers();
+       if (nthreads_running == ncpus) {
+               unblock_tracers();
+               start_buts();
+               if (net_mode == Net_client)
+                       printf("blktrace: connected!\n");
+               if (stop_watch)
+                       alarm(stop_watch);
+       } else
+               stop_tracers();
+
+       wait_tracers();
+       if (nthreads_running == ncpus)
+               show_stats(&devpaths);
+       if (net_client_use_send())
+               close_client_connections();
+       del_tracers();
+
+       return 0;
+}
+
+static cpu_set_t *get_online_cpus(void)
+{
+       FILE *cpus;
+       cpu_set_t *set;
+       size_t alloc_size;
+       int cpuid, prevcpuid = -1;
+       char nextch;
+       int n, ncpu, curcpu = 0;
+       int *cpu_nums;
+
+       ncpu = sysconf(_SC_NPROCESSORS_CONF);
+       if (ncpu < 0)
+               return NULL;
+
+       cpu_nums = malloc(sizeof(int)*ncpu);
+       if (!cpu_nums) {
+               errno = ENOMEM;
+               return NULL;
+       }
+
+       /*
+        * There is no way to easily get maximum CPU number. So we have to
+        * parse the file first to find it out and then create appropriate
+        * cpuset
+        */
+       cpus = my_fopen("/sys/devices/system/cpu/online", "r");
+       for (;;) {
+               n = fscanf(cpus, "%d%c", &cpuid, &nextch);
+               if (n <= 0)
+                       break;
+               if (n == 2 && nextch == '-') {
+                       prevcpuid = cpuid;
+                       continue;
+               }
+               if (prevcpuid == -1)
+                       prevcpuid = cpuid;
+               while (prevcpuid <= cpuid) {
+                       /* More CPUs listed than configured? */
+                       if (curcpu >= ncpu) {
+                               errno = EINVAL;
+                               return NULL;
+                       }
+                       cpu_nums[curcpu++] = prevcpuid++;
+               }
+               prevcpuid = -1;
+       }
+       fclose(cpus);
+
+       ncpu = curcpu;
+       max_cpus = cpu_nums[ncpu - 1] + 1;
+
+       /* Now that we have maximum cpu number, create a cpuset */
+       set = CPU_ALLOC(max_cpus);
+       if (!set) {
+               errno = ENOMEM;
+               return NULL;
+       }
+       alloc_size = CPU_ALLOC_SIZE(max_cpus);
+       CPU_ZERO_S(alloc_size, set);
+
+       for (curcpu = 0; curcpu < ncpu; curcpu++)
+               CPU_SET_S(cpu_nums[curcpu], alloc_size, set);
+
+       free(cpu_nums);
+
+       return set;
+}
+
+int main(int argc, char *argv[])
+{
+       int ret = 0;
+
+       setlocale(LC_NUMERIC, "en_US");
+       pagesize = getpagesize();
+       online_cpus = get_online_cpus();
+       if (!online_cpus) {
+               fprintf(stderr, "cannot get online cpus %d/%s\n",
+                       errno, strerror(errno));
+               ret = 1;
+               goto out;
+       } else if (handle_args(argc, argv)) {
+               ret = 1;
+               goto out;
+       }
+
+       ncpus = CPU_COUNT_S(CPU_ALLOC_SIZE(max_cpus), online_cpus);
+       if (ndevs > 1 && output_name && strcmp(output_name, "-") != 0) {
+               fprintf(stderr, "-o not supported with multiple devices\n");
+               ret = 1;
+               goto out;
+       }
+
+       signal(SIGINT, handle_sigint);
+       signal(SIGHUP, handle_sigint);
+       signal(SIGTERM, handle_sigint);
+       signal(SIGALRM, handle_sigint);
+       signal(SIGPIPE, SIG_IGN);
+
+       if (kill_running_trace) {
+               struct devpath *dpp;
+               struct list_head *p;
+
+               __list_for_each(p, &devpaths) {
+                       dpp = list_entry(p, struct devpath, head);
+                       if (__stop_trace(dpp->fd)) {
+                               fprintf(stderr,
+                                       "BLKTRACETEARDOWN %s failed: %d/%s\n",
+                                       dpp->path, errno, strerror(errno));
+                       }
+               }
+       } else if (net_mode == Net_server) {
+               if (output_name) {
+                       fprintf(stderr, "-o ignored in server mode\n");
+                       output_name = NULL;
+               }
+               ret = net_server();
+       } else
+               ret = run_tracers();
+
+out:
+       if (pfp)
+               fclose(pfp);
+       rel_devpaths();
+       return ret;
+}