blktrace: Use number of online CPUs
[blktrace.git] / blktrace.c
index 00ce0459f3ecdbbb2d708d9b7020223b485f1538..b445524b1f0b8b38ae9d13f030d7cdf171558a78 100644 (file)
@@ -71,6 +71,12 @@ enum {
        Net_client,
 };
 
+enum thread_status {
+       Th_running,
+       Th_leaving,
+       Th_error
+};
+
 /*
  * Generic stats collected: nevents can be _roughly_ estimated by data_read
  * (discounting pdu...)
@@ -87,7 +93,7 @@ struct devpath {
        char *path;                     /* path to device special file */
        char *buts_name;                /* name returned from bt kernel code */
        struct pdc_stats *stats;
-       int fd, idx, ncpus;
+       int fd, ncpus;
        unsigned long long drops;
 
        /*
@@ -159,10 +165,8 @@ struct tracer {
        struct io_info *ios;
        struct pollfd *pfds;
        pthread_t thread;
-       pthread_mutex_t mutex;
-       pthread_cond_t cond;
        int cpu, nios;
-       volatile int running, status, is_done;
+       volatile int status, is_done;
 };
 
 /*
@@ -269,23 +273,27 @@ static char blktrace_version[] = "2.0.0";
  */
 int data_is_native = -1;
 
+static int ndevs;
 static int ncpus;
 static int pagesize;
 static int act_mask = ~0U;
+static int kill_running_trace;
+static int stop_watch;
+static int piped_output;
+
 static char *debugfs_path = "/sys/kernel/debug";
 static char *output_name;
 static char *output_dir;
-static int kill_running_trace;
-static int stop_watch;
+
 static unsigned long buf_size = BUF_SIZE;
 static unsigned long buf_nr = BUF_NR;
+
+static FILE *pfp;
+
 static LIST_HEAD(devpaths);
 static LIST_HEAD(tracers);
-static int ndevs;
+
 static volatile int done;
-static FILE *pfp;
-static int piped_output;
-static int ntracers;
 
 /*
  * tracer threads add entries, the main thread takes them off and processes
@@ -296,12 +304,14 @@ static pthread_mutex_t dp_mutex = PTHREAD_MUTEX_INITIALIZER;
 static volatile int dp_entries;
 
 /*
- * This synchronizes the starting of trace gathering amongst all tracer
- * threads.
+ * These synchronize master / thread interactions.
  */
-static pthread_cond_t ub_cond = PTHREAD_COND_INITIALIZER;
-static pthread_mutex_t ub_mutex = PTHREAD_MUTEX_INITIALIZER;
-static volatile int unblock_tracers;
+static pthread_cond_t mt_cond = PTHREAD_COND_INITIALIZER;
+static pthread_mutex_t mt_mutex = PTHREAD_MUTEX_INITIALIZER;
+static volatile int nthreads_running;
+static volatile int nthreads_leaving;
+static volatile int nthreads_error;
+static volatile int tracers_run;
 
 /*
  * network cmd line params
@@ -425,24 +435,39 @@ static struct option l_opts[] = {
        }
 };
 
-static char usage_str[] = \
-       "-d <dev> [ -r debugfs path ] [ -o <output> ] [-k ] [ -w time ]\n" \
-       "[ -a action ] [ -A action mask ] [ -I  <devs file> ] [ -v ]\n\n" \
+static char usage_str[] = "\n\n" \
+       "-d <dev>             | --dev=<dev>\n" \
+        "[ -r <debugfs path>  | --relay=<debugfs path> ]\n" \
+        "[ -o <file>          | --output=<file>]\n" \
+        "[ -D <dir>           | --output-dir=<dir>\n" \
+        "[ -w <time>          | --stopwatch=<time>]\n" \
+        "[ -a <action field>  | --act-mask=<action field>]\n" \
+        "[ -A <action mask>   | --set-mask=<action mask>]\n" \
+        "[ -b <size>          | --buffer-size]\n" \
+        "[ -n <number>        | --num-sub-buffers=<number>]\n" \
+        "[ -l                 | --listen]\n" \
+        "[ -h <hostname>      | --host=<hostname>]\n" \
+        "[ -p <port number>   | --port=<port number>]\n" \
+        "[ -s                 | --no-sendfile]\n" \
+        "[ -I <devs file>     | --input-devs=<devs file>]\n" \
+        "[ -v <version>       | --version]\n" \
+        "[ -V <version>       | --version]\n" \
+
        "\t-d Use specified device. May also be given last after options\n" \
        "\t-r Path to mounted debugfs, defaults to /sys/kernel/debug\n" \
        "\t-o File(s) to send output to\n" \
        "\t-D Directory to prepend to output file names\n" \
-       "\t-k Kill a running trace\n" \
        "\t-w Stop after defined time, in seconds\n" \
        "\t-a Only trace specified actions. See documentation\n" \
        "\t-A Give trace mask as a single value. See documentation\n" \
-       "\t-b Sub buffer size in KiB\n" \
-       "\t-n Number of sub buffers\n" \
+       "\t-b Sub buffer size in KiB (default 512)\n" \
+       "\t-n Number of sub buffers (default 4)\n" \
        "\t-l Run in network listen mode (blktrace server)\n" \
        "\t-h Run in network client mode, connecting to the given host\n" \
        "\t-p Network port to use (default 8462)\n" \
        "\t-s Make the network client NOT use sendfile() to transfer data\n" \
        "\t-I Add devices found in <devs file>\n" \
+       "\t-v Print program version info\n" \
        "\t-V Print program version info\n\n";
 
 static void clear_events(struct pollfd *pfd)
@@ -483,7 +508,88 @@ static inline void pdc_nev_update(struct devpath *dpp, int cpu, int nevents)
 
 static void show_usage(char *prog)
 {
-       fprintf(stderr, "Usage: %s %s %s", prog, blktrace_version, usage_str);
+       fprintf(stderr, "Usage: %s %s", prog, usage_str);
+}
+
+/*
+ * Create a timespec 'msec' milliseconds into the future
+ */
+static inline void make_timespec(struct timespec *tsp, long delta_msec)
+{
+       struct timeval now;
+
+       gettimeofday(&now, NULL);
+       tsp->tv_sec = now.tv_sec;
+       tsp->tv_nsec = 1000L * now.tv_usec;
+
+       tsp->tv_nsec += (delta_msec * 1000000L);
+       if (tsp->tv_nsec > 1000000000L) {
+               long secs = tsp->tv_nsec / 1000000000L;
+
+               tsp->tv_sec += secs;
+               tsp->tv_nsec -= (secs * 1000000000L);
+       }
+}
+
+/*
+ * Add a timer to ensure wait ends
+ */
+static void t_pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex)
+{
+       struct timespec ts;
+
+       make_timespec(&ts, 50);
+       pthread_cond_timedwait(cond, mutex, &ts);
+}
+
+static void unblock_tracers(void)
+{
+       pthread_mutex_lock(&mt_mutex);
+       tracers_run = 1;
+       pthread_cond_broadcast(&mt_cond);
+       pthread_mutex_unlock(&mt_mutex);
+}
+
+static void tracer_wait_unblock(struct tracer *tp)
+{
+       pthread_mutex_lock(&mt_mutex);
+       while (!tp->is_done && !tracers_run)
+               pthread_cond_wait(&mt_cond, &mt_mutex);
+       pthread_mutex_unlock(&mt_mutex);
+}
+
+static void tracer_signal_ready(struct tracer *tp,
+                               enum thread_status th_status,
+                               int status)
+{
+       pthread_mutex_lock(&mt_mutex);
+       tp->status = status;
+
+       if (th_status == Th_running)
+               nthreads_running++;
+       else if (th_status == Th_error)
+               nthreads_error++;
+       else
+               nthreads_leaving++;
+
+       pthread_cond_signal(&mt_cond);
+       pthread_mutex_unlock(&mt_mutex);
+}
+
+static void wait_tracers_ready(int ncpus_started)
+{
+       pthread_mutex_lock(&mt_mutex);
+       while ((nthreads_running + nthreads_error) < ncpus_started)
+               t_pthread_cond_wait(&mt_cond, &mt_mutex);
+       pthread_mutex_unlock(&mt_mutex);
+}
+
+static void wait_tracers_leaving(void)
+{
+       pthread_mutex_lock(&mt_mutex);
+       while (nthreads_leaving < nthreads_running)
+               t_pthread_cond_wait(&mt_cond, &mt_mutex);
+       pthread_mutex_unlock(&mt_mutex);
 }
 
 static void init_mmap_info(struct mmap_info *mip)
@@ -515,36 +621,22 @@ static void dpp_free(struct devpath *dpp)
 
 static int lock_on_cpu(int cpu)
 {
-       cpu_set_t cpu_mask;
+       cpu_set_t * cpu_mask;
+       size_t size;
+       cpu_mask = CPU_ALLOC(ncpus);
+       size = CPU_ALLOC_SIZE(ncpus);
 
-       CPU_ZERO(&cpu_mask);
-       CPU_SET(cpu, &cpu_mask);
-       if (sched_setaffinity(getpid(), sizeof(cpu_mask), &cpu_mask) < 0)
+       CPU_ZERO_S(size, cpu_mask);
+       CPU_SET_S(cpu, size, cpu_mask);
+       if (sched_setaffinity(0, size, cpu_mask) < 0) {
+               CPU_FREE(cpu_mask);             
                return errno;
+       }
 
+       CPU_FREE(cpu_mask);             
        return 0;
 }
 
-/*
- * Create a timespec 'msec' milliseconds into the future
- */
-static inline void make_timespec(struct timespec *tsp, long delta_msec)
-{
-       struct timeval now;
-
-       gettimeofday(&now, NULL);
-       tsp->tv_sec = now.tv_sec;
-       tsp->tv_nsec = 1000L * now.tv_usec;
-
-       tsp->tv_nsec += (delta_msec * 1000000L);
-       if (tsp->tv_nsec > 1000000000L) {
-               long secs = tsp->tv_nsec / 1000000000L;
-
-               tsp->tv_sec += secs;
-               tsp->tv_nsec -= (secs * 1000000000L);
-       }
-}
-
 static int increase_limit(int resource, rlim_t increase)
 {
        struct rlimit rlim;
@@ -635,17 +727,59 @@ static void *my_mmap(void *addr, size_t length, int prot, int flags, int fd,
        return new;
 }
 
-static int my_mlock(const void *addr, size_t len)
+static int my_mlock(struct tracer *tp,
+                   const void *addr, size_t len)
 {
-       int ret;
+       int ret, retry = 0;
 
        do {
                ret = mlock(addr, len);
+               if ((retry >= 10) && tp && tp->is_done)
+                       break;
+               retry++;
        } while (ret < 0 && handle_mem_failure(len));
 
        return ret;
 }
 
+static int setup_mmap(int fd, unsigned int maxlen,
+                     struct mmap_info *mip,
+                     struct tracer *tp)
+{
+       if (mip->fs_off + maxlen > mip->fs_buf_len) {
+               unsigned long nr = max(16, mip->buf_nr);
+
+               if (mip->fs_buf) {
+                       munlock(mip->fs_buf, mip->fs_buf_len);
+                       munmap(mip->fs_buf, mip->fs_buf_len);
+                       mip->fs_buf = NULL;
+               }
+
+               mip->fs_off = mip->fs_size & (mip->pagesize - 1);
+               mip->fs_buf_len = (nr * mip->buf_size) - mip->fs_off;
+               mip->fs_max_size += mip->fs_buf_len;
+
+               if (ftruncate(fd, mip->fs_max_size) < 0) {
+                       perror("setup_mmap: ftruncate");
+                       return 1;
+               }
+
+               mip->fs_buf = my_mmap(NULL, mip->fs_buf_len, PROT_WRITE,
+                                     MAP_SHARED, fd,
+                                     mip->fs_size - mip->fs_off);
+               if (mip->fs_buf == MAP_FAILED) {
+                       perror("setup_mmap: mmap");
+                       return 1;
+               }
+               if (my_mlock(tp, mip->fs_buf, mip->fs_buf_len) < 0) {
+                       perror("setup_mlock: mlock");
+                       return 1;
+               }
+       }
+
+       return 0;
+}
+
 static int __stop_trace(int fd)
 {
        /*
@@ -695,11 +829,12 @@ static int __net_recv_data(int fd, void *buf, unsigned int len)
                if (ret == 0)
                        break;
                else if (ret < 0) {
-                       if (errno != EAGAIN) {
-                               perror("server: net_recv_data: recv failed");
-                               break;
-                       } else
-                               break;
+                       if (errno == EAGAIN) {
+                               usleep(50);
+                               continue;
+                       }
+                       perror("server: net_recv_data: recv failed");
+                       break;
                } else {
                        buf += ret;
                        bytes_left -= ret;
@@ -743,8 +878,9 @@ static int net_send_header(int fd, int cpu, char *buts_name, int len)
        memset(&hdr, 0, sizeof(hdr));
 
        hdr.magic = BLK_IO_TRACE_MAGIC;
+       memset(hdr.buts_name, 0, sizeof(hdr.buts_name));
        strncpy(hdr.buts_name, buts_name, sizeof(hdr.buts_name));
-       hdr.buts_name[sizeof(hdr.buts_name)-1] = '\0';
+       hdr.buts_name[sizeof(hdr.buts_name) - 1] = '\0';
        hdr.cpu = cpu;
        hdr.max_cpus = ncpus;
        hdr.len = len;
@@ -800,9 +936,9 @@ static void net_send_drops(int fd)
 
 /*
  * Returns:
- *      0: "EOF"
- *      1: OK
- *     -1: Error
+ *      0: "EOF"
+ *      1: OK
+ *     -1: Error
  */
 static int net_get_header(struct cl_conn *nc, struct blktrace_net_hdr *bnh)
 {
@@ -817,7 +953,8 @@ static int net_get_header(struct cl_conn *nc, struct blktrace_net_hdr *bnh)
                return 1;
        else if (bytes_read == 0)
                return 0;
-       return -1;
+       else
+               return -1;
 }
 
 static int net_setup_addr(void)
@@ -851,7 +988,9 @@ retry:
                }
 
                memcpy(&addr->sin_addr, hent->h_addr, 4);
-               strcpy(hostname, hent->h_name);
+               memset(hostname, 0, sizeof(hostname));
+               strncpy(hostname, hent->h_name, sizeof(hostname));
+               hostname[sizeof(hostname) - 1] = '\0';
        }
 
        return 0;
@@ -876,6 +1015,7 @@ static int net_setup_client(void)
                                hostname);
                else
                        perror("client: connect");
+
                close(fd);
                return -1;
        }
@@ -930,22 +1070,30 @@ static void setup_buts(void)
                buts.buf_nr = buf_nr;
                buts.act_mask = act_mask;
 
-               if (ioctl(dpp->fd, BLKTRACESETUP, &buts) < 0) {
+               if (ioctl(dpp->fd, BLKTRACESETUP, &buts) >= 0) {
+                       dpp->ncpus = ncpus;
+                       dpp->buts_name = strdup(buts.name);
+                       if (dpp->stats)
+                               free(dpp->stats);
+                       dpp->stats = calloc(dpp->ncpus, sizeof(*dpp->stats));
+                       memset(dpp->stats, 0, dpp->ncpus * sizeof(*dpp->stats));
+               } else
                        fprintf(stderr, "BLKTRACESETUP(2) %s failed: %d/%s\n",
                                dpp->path, errno, strerror(errno));
-                       continue;
-               } else if (ioctl(dpp->fd, BLKTRACESTART) < 0) {
+       }
+}
+
+static void start_buts(void)
+{
+       struct list_head *p;
+
+       __list_for_each(p, &devpaths) {
+               struct devpath *dpp = list_entry(p, struct devpath, head);
+
+               if (ioctl(dpp->fd, BLKTRACESTART) < 0) {
                        fprintf(stderr, "BLKTRACESTART %s failed: %d/%s\n",
                                dpp->path, errno, strerror(errno));
-                       continue;
                }
-
-               dpp->ncpus = ncpus;
-               dpp->buts_name = strdup(buts.name);
-               if (dpp->stats)
-                       free(dpp->stats);
-               dpp->stats = calloc(dpp->ncpus, sizeof(*dpp->stats));
-               memset(dpp->stats, 0, dpp->ncpus * sizeof(*dpp->stats));
        }
 }
 
@@ -983,6 +1131,7 @@ static void get_all_drops(void)
 
        __list_for_each(p, &devpaths) {
                struct devpath *dpp = list_entry(p, struct devpath, head);
+
                dpp->drops = get_drops(dpp);
        }
 }
@@ -1009,6 +1158,7 @@ static void free_tracer_heads(struct devpath *dpp)
        for (cpu = 0, hd = dpp->heads; cpu < ncpus; cpu++, hd++) {
                if (hd->prev)
                        free(hd->prev);
+
                pthread_mutex_destroy(&hd->mutex);
        }
        free(dpp->heads);
@@ -1062,11 +1212,37 @@ static inline void incr_entries(int entries_handled)
        pthread_mutex_unlock(&dp_mutex);
 }
 
+static void decr_entries(int handled)
+{
+       pthread_mutex_lock(&dp_mutex);
+       dp_entries -= handled;
+       pthread_mutex_unlock(&dp_mutex);
+}
+
+static int wait_empty_entries(void)
+{
+       pthread_mutex_lock(&dp_mutex);
+       while (!done && dp_entries == 0)
+               t_pthread_cond_wait(&dp_cond, &dp_mutex);
+       pthread_mutex_unlock(&dp_mutex);
+
+       return !done;
+}
+
 static int add_devpath(char *path)
 {
        int fd;
        struct devpath *dpp;
+       struct list_head *p;
 
+       /*
+        * Verify device is not duplicated
+        */
+       __list_for_each(p, &devpaths) {
+              struct devpath *tmp = list_entry(p, struct devpath, head);
+              if (!strcmp(tmp->path, path))
+                       return 0;
+       }
        /*
         * Verify device is valid before going too far
         */
@@ -1081,7 +1257,7 @@ static int add_devpath(char *path)
        memset(dpp, 0, sizeof(*dpp));
        dpp->path = strdup(path);
        dpp->fd = fd;
-       dpp->idx = ndevs++;
+       ndevs++;
        list_add_tail(&dpp->head, &devpaths);
 
        return 0;
@@ -1113,8 +1289,7 @@ static int flush_subbuf_net(struct trace_buf *tbp)
 
        if (net_send_header(fd, tbp->cpu, dpp->buts_name, tbp->len))
                return 1;
-
-       if (net_send_data(fd, tbp->buf, tbp->len) != tbp->len)
+       else if (net_send_data(fd, tbp->buf, tbp->len) != tbp->len)
                return 1;
 
        return 0;
@@ -1147,6 +1322,34 @@ handle_list_net(__attribute__((__unused__))struct tracer_devpath_head *hd,
        return entries_handled;
 }
 
+/*
+ * Tack 'tbp's buf onto the tail of 'prev's buf
+ */
+static struct trace_buf *tb_combine(struct trace_buf *prev,
+                                   struct trace_buf *tbp)
+{
+       unsigned long tot_len;
+
+       tot_len = prev->len + tbp->len;
+       if (tot_len > buf_size) {
+               /*
+                * tbp->head isn't connected (it was 'prev'
+                * so it had been taken off of the list
+                * before). Therefore, we can realloc
+                * the whole structures, as the other fields
+                * are "static".
+                */
+               prev = realloc(prev, sizeof(*prev) + tot_len);
+               prev->buf = (void *)(prev + 1);
+       }
+
+       memcpy(prev->buf + prev->len, tbp->buf, tbp->len);
+       prev->len = tot_len;
+
+       free(tbp);
+       return prev;
+}
+
 static int handle_list_file(struct tracer_devpath_head *hd,
                            struct list_head *list)
 {
@@ -1166,31 +1369,8 @@ static int handle_list_file(struct tracer_devpath_head *hd,
                 * If there was some leftover before, tack this new
                 * entry onto the tail of the previous one.
                 */
-               if (prev) {
-                       unsigned long tot_len;
-                       struct trace_buf *tmp = tbp;
-
-                       tbp = prev;
-                       prev = NULL;
-
-                       tot_len = tbp->len + tmp->len;
-                       if (tot_len > buf_size) {
-                               /*
-                                * tbp->head isn't connected (it was 'prev'
-                                * so it had been taken off of the list
-                                * before). Therefore, we can realloc
-                                * the whole structures, as the other fields
-                                * are "static".
-                                */
-                               tbp = realloc(tbp->buf, sizeof(*tbp) + tot_len);
-                               tbp->buf = (void *)(tbp + 1);
-                       }
-
-                       memcpy(tbp->buf + tbp->len, tmp->buf, tmp->len);
-                       tbp->len = tot_len;
-
-                       free(tmp);
-               }
+               if (prev)
+                       tbp = tb_combine(prev, tbp);
 
                /*
                 * See how many whole traces there are - send them
@@ -1215,8 +1395,10 @@ static int handle_list_file(struct tracer_devpath_head *hd,
                 * for the next pass.
                 */
                if (off) {
-                       if (write_data(tbp->buf, off) || off == tbp->len)
+                       if (write_data(tbp->buf, off) || off == tbp->len) {
                                free(tbp);
+                               prev = NULL;
+                       }
                        else {
                                /*
                                 * Move valid data to beginning of buffer
@@ -1258,27 +1440,14 @@ static void __process_trace_bufs(void)
                }
        }
 
-       if (handled) {
-               pthread_mutex_lock(&dp_mutex);
-               dp_entries -= handled;
-               pthread_mutex_unlock(&dp_mutex);
-       }
+       if (handled)
+               decr_entries(handled);
 }
 
 static void process_trace_bufs(void)
 {
-       while (!done) {
-               pthread_mutex_lock(&dp_mutex);
-               while (!done && dp_entries == 0) {
-                       struct timespec ts;
-
-                       make_timespec(&ts, 50);
-                       pthread_cond_timedwait(&dp_cond, &dp_mutex, &ts);
-               }
-               pthread_mutex_unlock(&dp_mutex);
-
+       while (wait_empty_entries())
                __process_trace_bufs();
-       }
 }
 
 static void clean_trace_bufs(void)
@@ -1324,78 +1493,6 @@ static inline int net_sendfile_data(struct tracer *tp, struct io_info *iop)
        return net_sendfile(iop);
 }
 
-static int handle_pfds_netclient(struct tracer *tp, int nevs, int force_read)
-{
-       struct stat sb;
-       int i, nentries = 0;
-       struct pdc_stats *sp;
-       struct pollfd *pfd = tp->pfds;
-       struct io_info *iop = tp->ios;
-
-       for (i = 0; nevs > 0 && i < ndevs; i++, pfd++, iop++, sp++) {
-               if (pfd->revents & POLLIN || force_read) {
-                       if (fstat(iop->ifd, &sb) < 0) {
-                               perror(iop->ifn);
-                               pfd->events = 0;
-                       } else if (sb.st_size > (off_t)iop->data_queued) {
-                               iop->ready = sb.st_size - iop->data_queued;
-                               iop->data_queued = sb.st_size;
-                               if (!net_sendfile_data(tp, iop)) {
-                                       pdc_dr_update(iop->dpp, tp->cpu,
-                                                     iop->ready);
-                                       nentries++;
-                               } else
-                                       clear_events(pfd);
-                       }
-                       nevs--;
-               }
-       }
-
-       if (nentries)
-               incr_entries(nentries);
-
-       return nentries;
-}
-
-static int handle_pfds_entries(struct tracer *tp, int nevs, int force_read)
-{
-       int i, nentries = 0;
-       struct trace_buf *tbp;
-       struct pollfd *pfd = tp->pfds;
-       struct io_info *iop = tp->ios;
-
-       tbp = alloc_trace_buf(tp->cpu, buf_size);
-       for (i = 0; i < ndevs; i++, pfd++, iop++) {
-               if (pfd->revents & POLLIN || force_read) {
-                       tbp->len = read(iop->ifd, tbp->buf, buf_size);
-                       if (tbp->len > 0) {
-                               pdc_dr_update(iop->dpp, tp->cpu, tbp->len);
-                               add_trace_buf(iop->dpp, tp->cpu, &tbp);
-                               nentries++;
-                       } else if (tbp->len == 0) {
-                               /*
-                                * Short reads after we're done stop us
-                                * from trying reads.
-                                */
-                               if (tp->is_done)
-                                       clear_events(pfd);
-                       } else {
-                               read_err(tp->cpu, iop->ifn);
-                               if (errno != EAGAIN || tp->is_done)
-                                       clear_events(pfd);
-                       }
-                       if (!piped_output && --nevs == 0)
-                               break;
-               }
-       }
-       free(tbp);
-
-       if (nentries)
-               incr_entries(nentries);
-
-       return nentries;
-}
-
 static int fill_ofname(struct io_info *iop, int cpu)
 {
        int len;
@@ -1422,7 +1519,12 @@ static int fill_ofname(struct io_info *iop, int cpu)
                                iop->ofn, errno, strerror(errno));
                        return 1;
                }
-               if (mkdir(iop->ofn, 0755) < 0) {
+               /*
+                * There is no synchronization between multiple threads
+                * trying to create the directory at once.  It's harmless
+                * to let them try, so just detect the problem and move on.
+                */
+               if (mkdir(iop->ofn, 0755) < 0 && errno != EEXIST) {
                        fprintf(stderr,
                                "Destination dir %s can't be made: %d/%s\n",
                                iop->ofn, errno, strerror(errno));
@@ -1466,6 +1568,7 @@ static int iop_open(struct io_info *iop, int cpu)
                        iop->ofn, errno, strerror(errno));
                return 1;
        }
+
        if (set_vbuf(iop, _IOLBF, FILE_VBUF_SIZE)) {
                fprintf(stderr, "set_vbuf for file %s failed: %d/%s\n",
                        iop->ofn, errno, strerror(errno));
@@ -1477,6 +1580,50 @@ static int iop_open(struct io_info *iop, int cpu)
        return 0;
 }
 
+static void close_iop(struct io_info *iop)
+{
+       struct mmap_info *mip = &iop->mmap_info;
+
+       if (mip->fs_buf)
+               munmap(mip->fs_buf, mip->fs_buf_len);
+
+       if (!piped_output) {
+               if (ftruncate(fileno(iop->ofp), mip->fs_size) < 0) {
+                       fprintf(stderr,
+                               "Ignoring err: ftruncate(%s): %d/%s\n",
+                               iop->ofn, errno, strerror(errno));
+               }
+       }
+
+       if (iop->ofp)
+               fclose(iop->ofp);
+       if (iop->obuf)
+               free(iop->obuf);
+}
+
+static void close_ios(struct tracer *tp)
+{
+       while (tp->nios > 0) {
+               struct io_info *iop = &tp->ios[--tp->nios];
+
+               iop->dpp->drops = get_drops(iop->dpp);
+               if (iop->ifd >= 0)
+                       close(iop->ifd);
+
+               if (iop->ofp)
+                       close_iop(iop);
+               else if (iop->ofd >= 0) {
+                       struct devpath *dpp = iop->dpp;
+
+                       net_send_close(iop->ofd, dpp->buts_name, dpp->drops);
+                       net_close_connection(&iop->ofd);
+               }
+       }
+
+       free(tp->ios);
+       free(tp->pfds);
+}
+
 static int open_ios(struct tracer *tp)
 {
        struct pollfd *pfd;
@@ -1484,9 +1631,9 @@ static int open_ios(struct tracer *tp)
        struct list_head *p;
 
        tp->ios = calloc(ndevs, sizeof(struct io_info));
-       tp->pfds = calloc(ndevs, sizeof(struct pollfd));
-
        memset(tp->ios, 0, ndevs * sizeof(struct io_info));
+
+       tp->pfds = calloc(ndevs, sizeof(struct pollfd));
        memset(tp->pfds, 0, ndevs * sizeof(struct pollfd));
 
        tp->nios = 0;
@@ -1539,86 +1686,10 @@ static int open_ios(struct tracer *tp)
 
 err:
        close(iop->ifd);        /* tp->nios _not_ bumped */
+       close_ios(tp);
        return 1;
 }
 
-static void close_iop(struct io_info *iop)
-{
-       struct mmap_info *mip = &iop->mmap_info;
-
-       if (mip->fs_buf)
-               munmap(mip->fs_buf, mip->fs_buf_len);
-
-       if (!piped_output) {
-               if (ftruncate(fileno(iop->ofp), mip->fs_size) < 0) {
-                       fprintf(stderr,
-                               "Ignoring err: ftruncate(%s): %d/%s\n",
-                               iop->ofn, errno, strerror(errno));
-               }
-       }
-
-       if (iop->ofp)
-               fclose(iop->ofp);
-       if (iop->obuf)
-               free(iop->obuf);
-}
-
-static void close_ios(struct tracer *tp)
-{
-       while (tp->nios > 0) {
-               struct io_info *iop = &tp->ios[--tp->nios];
-
-               iop->dpp->drops = get_drops(iop->dpp);
-               if (iop->ifd >= 0)
-                       close(iop->ifd);
-
-               if (iop->ofp)
-                       close_iop(iop);
-               else if (iop->ofd >= 0) {
-                       struct devpath *dpp = iop->dpp;
-
-                       net_send_close(iop->ofd, dpp->buts_name, dpp->drops);
-                       net_close_connection(&iop->ofd);
-               }
-       }
-
-       free(tp->ios);
-       free(tp->pfds);
-}
-
-static int setup_mmap(int fd, unsigned int maxlen, struct mmap_info *mip)
-{
-       if (mip->fs_off + maxlen > mip->fs_buf_len) {
-               unsigned long nr = max(16, mip->buf_nr);
-
-               if (mip->fs_buf) {
-                       munlock(mip->fs_buf, mip->fs_buf_len);
-                       munmap(mip->fs_buf, mip->fs_buf_len);
-                       mip->fs_buf = NULL;
-               }
-
-               mip->fs_off = mip->fs_size & (mip->pagesize - 1);
-               mip->fs_buf_len = (nr * mip->buf_size) - mip->fs_off;
-               mip->fs_max_size += mip->fs_buf_len;
-
-               if (ftruncate(fd, mip->fs_max_size) < 0) {
-                       perror("__setup_mmap: ftruncate");
-                       return 1;
-               }
-
-               mip->fs_buf = my_mmap(NULL, mip->fs_buf_len, PROT_WRITE,
-                                     MAP_SHARED, fd,
-                                     mip->fs_size - mip->fs_off);
-               if (mip->fs_buf == MAP_FAILED) {
-                       perror("__setup_mmap: mmap");
-                       return 1;
-               }
-               my_mlock(mip->fs_buf, mip->fs_buf_len);
-       }
-
-       return 0;
-}
-
 static int handle_pfds_file(struct tracer *tp, int nevs, int force_read)
 {
        struct mmap_info *mip;
@@ -1630,7 +1701,7 @@ static int handle_pfds_file(struct tracer *tp, int nevs, int force_read)
                if (pfd->revents & POLLIN || force_read) {
                        mip = &iop->mmap_info;
 
-                       ret = setup_mmap(iop->ofd, buf_size, mip);
+                       ret = setup_mmap(iop->ofd, buf_size, mip, tp);
                        if (ret < 0) {
                                pfd->events = 0;
                                break;
@@ -1662,11 +1733,82 @@ static int handle_pfds_file(struct tracer *tp, int nevs, int force_read)
        return nentries;
 }
 
-static void *thread_main(void *arg)
+static int handle_pfds_netclient(struct tracer *tp, int nevs, int force_read)
+{
+       struct stat sb;
+       int i, nentries = 0;
+       struct pollfd *pfd = tp->pfds;
+       struct io_info *iop = tp->ios;
+
+       for (i = 0; i < ndevs; i++, pfd++, iop++) {
+               if (pfd->revents & POLLIN || force_read) {
+                       if (fstat(iop->ifd, &sb) < 0) {
+                               perror(iop->ifn);
+                               pfd->events = 0;
+                       } else if (sb.st_size > (off_t)iop->data_queued) {
+                               iop->ready = sb.st_size - iop->data_queued;
+                               iop->data_queued = sb.st_size;
+
+                               if (!net_sendfile_data(tp, iop)) {
+                                       pdc_dr_update(iop->dpp, tp->cpu,
+                                                     iop->ready);
+                                       nentries++;
+                               } else
+                                       clear_events(pfd);
+                       }
+                       if (--nevs == 0)
+                               break;
+               }
+       }
+
+       if (nentries)
+               incr_entries(nentries);
+
+       return nentries;
+}
+
+static int handle_pfds_entries(struct tracer *tp, int nevs, int force_read)
 {
-       int ret, ndone;
-       int to_val;
+       int i, nentries = 0;
+       struct trace_buf *tbp;
+       struct pollfd *pfd = tp->pfds;
+       struct io_info *iop = tp->ios;
+
+       tbp = alloc_trace_buf(tp->cpu, buf_size);
+       for (i = 0; i < ndevs; i++, pfd++, iop++) {
+               if (pfd->revents & POLLIN || force_read) {
+                       tbp->len = read(iop->ifd, tbp->buf, buf_size);
+                       if (tbp->len > 0) {
+                               pdc_dr_update(iop->dpp, tp->cpu, tbp->len);
+                               add_trace_buf(iop->dpp, tp->cpu, &tbp);
+                               nentries++;
+                       } else if (tbp->len == 0) {
+                               /*
+                                * Short reads after we're done stop us
+                                * from trying reads.
+                                */
+                               if (tp->is_done)
+                                       clear_events(pfd);
+                       } else {
+                               read_err(tp->cpu, iop->ifn);
+                               if (errno != EAGAIN || tp->is_done)
+                                       clear_events(pfd);
+                       }
+                       if (!piped_output && --nevs == 0)
+                               break;
+               }
+       }
+       free(tbp);
+
+       if (nentries)
+               incr_entries(nentries);
 
+       return nentries;
+}
+
+static void *thread_main(void *arg)
+{
+       int ret, ndone, to_val;
        struct tracer *tp = arg;
 
        ret = lock_on_cpu(tp->cpu);
@@ -1674,25 +1816,17 @@ static void *thread_main(void *arg)
                goto err;
 
        ret = open_ios(tp);
-       if (ret) {
-               close_ios(tp);
+       if (ret)
                goto err;
-       }
-
-       pthread_mutex_lock(&tp->mutex);
-       tp->running = 1;
-       pthread_cond_signal(&tp->cond);
-       pthread_mutex_unlock(&tp->mutex);
 
        if (piped_output)
                to_val = 50;            /* Frequent partial handles */
        else
                to_val = 500;           /* 1/2 second intervals */
 
-       pthread_mutex_lock(&ub_mutex);
-       while (!tp->is_done && !unblock_tracers)
-               pthread_cond_wait(&ub_cond, &ub_mutex);
-       pthread_mutex_unlock(&ub_mutex);
+
+       tracer_signal_ready(tp, Th_running, 0);
+       tracer_wait_unblock(tp);
 
        while (!tp->is_done) {
                ndone = poll(tp->pfds, ndevs, to_val);
@@ -1710,13 +1844,11 @@ static void *thread_main(void *arg)
                ;
 
        close_ios(tp);
+       tracer_signal_ready(tp, Th_leaving, 0);
+       return NULL;
 
 err:
-       pthread_mutex_lock(&tp->mutex);
-       tp->running = 0;
-       tp->status = ret;
-       pthread_cond_signal(&tp->cond);
-       pthread_mutex_unlock(&tp->mutex);
+       tracer_signal_ready(tp, Th_error, ret);
        return NULL;
 }
 
@@ -1728,46 +1860,38 @@ static int start_tracer(int cpu)
        memset(tp, 0, sizeof(*tp));
 
        INIT_LIST_HEAD(&tp->head);
-       pthread_mutex_init(&tp->mutex, NULL);
-       pthread_cond_init(&tp->cond, NULL);
-       tp->running = 0;
        tp->status = 0;
        tp->cpu = cpu;
 
        if (pthread_create(&tp->thread, NULL, thread_main, tp)) {
                fprintf(stderr, "FAILED to start thread on CPU %d: %d/%s\n",
                        cpu, errno, strerror(errno));
-               goto err;
-       }
-
-       pthread_mutex_lock(&tp->mutex);
-       while (!tp->running && (tp->status == 0))
-               pthread_cond_wait(&tp->cond, &tp->mutex);
-       pthread_mutex_unlock(&tp->mutex);
-
-       if (tp->status == 0) {
-               list_add_tail(&tp->head, &tracers);
-               return 0;
+               free(tp);
+               return 1;
        }
 
-       fprintf(stderr, "FAILED to start thread on CPU %d\n", cpu);
-
-err:
-       pthread_mutex_destroy(&tp->mutex);
-       pthread_cond_destroy(&tp->cond);
-       free(tp);
-       return 1;
+       list_add_tail(&tp->head, &tracers);
+       return 0;
 }
 
-static int start_tracers(void)
+static void start_tracers(void)
 {
        int cpu;
+       struct list_head *p;
 
        for (cpu = 0; cpu < ncpus; cpu++)
                if (start_tracer(cpu))
                        break;
 
-       return cpu;
+       wait_tracers_ready(cpu);
+
+       __list_for_each(p, &tracers) {
+               struct tracer *tp = list_entry(p, struct tracer, head);
+               if (tp->status)
+                       fprintf(stderr,
+                               "FAILED to start thread on CPU %d: %d/%s\n",
+                               tp->cpu, tp->status, strerror(tp->status));
+       }
 }
 
 static void stop_tracers(void)
@@ -1789,6 +1913,7 @@ static void stop_tracers(void)
                struct tracer *tp = list_entry(p, struct tracer, head);
                tp->is_done = 1;
        }
+       pthread_cond_broadcast(&mt_cond);
 }
 
 static void del_tracers(void)
@@ -1801,7 +1926,6 @@ static void del_tracers(void)
                list_del(&tp->head);
                free(tp);
        }
-       ntracers = 0;
 }
 
 static void wait_tracers(void)
@@ -1811,15 +1935,12 @@ static void wait_tracers(void)
        if (use_tracer_devpaths())
                process_trace_bufs();
 
+       wait_tracers_leaving();
+
        __list_for_each(p, &tracers) {
                int ret;
                struct tracer *tp = list_entry(p, struct tracer, head);
 
-               pthread_mutex_lock(&tp->mutex);
-               while (tp->running)
-                       pthread_cond_wait(&tp->cond, &tp->mutex);
-               pthread_mutex_unlock(&tp->mutex);
-
                ret = pthread_join(tp->thread, NULL);
                if (ret)
                        fprintf(stderr, "Thread join %d failed %d\n",
@@ -1964,9 +2085,13 @@ static int handle_args(int argc, char *argv[])
                                return 1;
                        }
 
-                       while (fscanf(ifp, "%s\n", dev_line) == 1)
-                               if (add_devpath(dev_line) != 0)
+                       while (fscanf(ifp, "%s\n", dev_line) == 1) {
+                               if (add_devpath(dev_line) != 0) {
+                                       fclose(ifp);
                                        return 1;
+                               }
+                       }
+                       fclose(ifp);
                        break;
                }
 
@@ -2016,7 +2141,9 @@ static int handle_args(int argc, char *argv[])
                        break;
                case 'h':
                        net_mode = Net_client;
-                       strcpy(hostname, optarg);
+                       memset(hostname, 0, sizeof(hostname));
+                       strncpy(hostname, optarg, sizeof(hostname));
+                       hostname[sizeof(hostname) - 1] = '\0';
                        break;
                case 'l':
                        net_mode = Net_server;
@@ -2043,12 +2170,17 @@ static int handle_args(int argc, char *argv[])
                return 1;
        }
 
-       if (statfs(debugfs_path, &st) < 0 || st.f_type != (long)DEBUGFS_TYPE) {
+       if (statfs(debugfs_path, &st) < 0) {
                fprintf(stderr, "Invalid debug path %s: %d/%s\n",
                        debugfs_path, errno, strerror(errno));
                return 1;
        }
 
+       if (st.f_type != (long)DEBUGFS_TYPE) {
+               fprintf(stderr, "Debugfs is not mounted at %s\n", debugfs_path);
+               return 1;
+       }
+
        if (act_mask_tmp != 0)
                act_mask = act_mask_tmp;
 
@@ -2066,7 +2198,10 @@ static int handle_args(int argc, char *argv[])
                piped_output = 1;
                handle_pfds = handle_pfds_entries;
                pfp = stdout;
-               setvbuf(pfp, NULL, _IONBF, 0);
+               if (setvbuf(pfp, NULL, _IONBF, 0)) {
+                       perror("setvbuf stdout");
+                       return 1;
+               }
        } else
                handle_pfds = handle_pfds_file;
        return 0;
@@ -2278,7 +2413,7 @@ static void net_client_read_data(struct cl_conn *nc, struct devpath *dpp,
        struct io_info *iop = &dpp->ios[bnh->cpu];
        struct mmap_info *mip = &iop->mmap_info;
 
-       if (setup_mmap(iop->ofd, bnh->len, &iop->mmap_info)) {
+       if (setup_mmap(iop->ofd, bnh->len, &iop->mmap_info, NULL)) {
                fprintf(stderr, "ncd(%s:%d): mmap failed\n",
                        nc->ch->hostname, nc->fd);
                exit(1);
@@ -2483,6 +2618,45 @@ out:
        return ret;
 }
 
+static int run_tracers(void)
+{
+       atexit(exit_tracing);
+       if (net_mode == Net_client)
+               printf("blktrace: connecting to %s\n", hostname);
+
+       setup_buts();
+
+       if (use_tracer_devpaths()) {
+               if (setup_tracer_devpaths())
+                       return 1;
+
+               if (piped_output)
+                       handle_list = handle_list_file;
+               else
+                       handle_list = handle_list_net;
+       }
+
+       start_tracers();
+       if (nthreads_running == ncpus) {
+               unblock_tracers();
+               start_buts();
+               if (net_mode == Net_client)
+                       printf("blktrace: connected!\n");
+               if (stop_watch)
+                       alarm(stop_watch);
+       } else
+               stop_tracers();
+
+       wait_tracers();
+       if (nthreads_running == ncpus)
+               show_stats(&devpaths);
+       if (net_client_use_send())
+               close_client_connections();
+       del_tracers();
+
+       return 0;
+}
+
 int main(int argc, char *argv[])
 {
        int ret = 0;
@@ -2495,9 +2669,13 @@ int main(int argc, char *argv[])
                        errno, strerror(errno));
                ret = 1;
                goto out;
+       } else if (handle_args(argc, argv)) {
+               ret = 1;
+               goto out;
        }
 
-       if (handle_args(argc, argv)) {
+       if (ndevs > 1 && output_name && strcmp(output_name, "-") != 0) {
+               fprintf(stderr, "-o not supported with multiple devices\n");
                ret = 1;
                goto out;
        }
@@ -2525,53 +2703,9 @@ int main(int argc, char *argv[])
                        fprintf(stderr, "-o ignored in server mode\n");
                        output_name = NULL;
                }
-
                ret = net_server();
-       } else {
-               atexit(exit_tracing);
-
-               if (net_mode == Net_client)
-                       printf("blktrace: connecting to %s\n", hostname);
-
-               setup_buts();
-
-               if (use_tracer_devpaths()) {
-                       if (setup_tracer_devpaths())
-                               goto out;
-
-                       if (piped_output)
-                               handle_list = handle_list_file;
-                       else
-                               handle_list = handle_list_net;
-               }
-
-               ntracers = start_tracers();
-               if (ntracers != ncpus)
-                       stop_tracers();
-               else {
-                       /*
-                        * Let tracers go
-                        */
-                       pthread_mutex_lock(&ub_mutex);
-                       unblock_tracers = 1;
-                       pthread_cond_broadcast(&ub_cond);
-                       pthread_mutex_unlock(&ub_mutex);
-
-                       if (net_mode == Net_client)
-                               printf("blktrace: connected!\n");
-
-                       if (stop_watch)
-                               alarm(stop_watch);
-               }
-
-               wait_tracers();
-               if (ntracers == ncpus)
-                       show_stats(&devpaths);
-
-               if (net_client_use_send())
-                       close_client_connections();
-               del_tracers();
-       }
+       } else
+               ret = run_tracers();
 
 out:
        if (pfp)