[PATCH] blktrace: no need to track ts->offset anymore
[blktrace.git] / blktrace.c
index d61b2d9d2fd9623d5e8bc3842ff568439c7ca427..24b48d5b2a214ecc9d86222a6263366afbfbe2b7 100644 (file)
@@ -41,6 +41,7 @@
 #include <netinet/in.h>
 #include <arpa/inet.h>
 #include <netdb.h>
+#include <sys/sendfile.h>
 
 #include "blktrace.h"
 #include "barrier.h"
@@ -58,7 +59,7 @@ static char blktrace_version[] = "0.99";
 
 #define RELAYFS_TYPE   0xF0B4A981
 
-#define S_OPTS "d:a:A:r:o:kw:Vb:n:D:lh:p:"
+#define S_OPTS "d:a:A:r:o:kw:Vb:n:D:lh:p:s"
 static struct option l_opts[] = {
        {
                .name = "dev",
@@ -144,6 +145,12 @@ static struct option l_opts[] = {
                .flag = NULL,
                .val = 'p'
        },
+       {
+               .name = "sendfile",
+               .has_arg = no_argument,
+               .flag = NULL,
+               .val = 's'
+       },
        {
                .name = NULL,
        }
@@ -172,10 +179,17 @@ struct thread_information {
        void *fd_buf;
        char fn[MAXPATHLEN + 64];
 
+       int pfd;
+       size_t *pfd_buf;
+
+       struct in_addr cl_in_addr;
+
        FILE *ofile;
        char *ofile_buffer;
+       off_t ofile_offset;
        int ofile_stdout;
        int ofile_mmap;
+       volatile int sendfile_pending;
 
        int (*get_subbuf)(struct thread_information *, unsigned int);
        int (*flush_subbuf)(struct thread_information *, struct tip_subbuf *);
@@ -276,12 +290,26 @@ enum {
 static char hostname[MAXHOSTNAMELEN];
 static int net_port = TRACE_NET_PORT;
 static int net_mode = 0;
+static int net_use_sendfile;
 
 static int net_in_fd = -1;
 static int net_out_fd = -1;
 
 static void handle_sigint(__attribute__((__unused__)) int sig)
 {
+       struct device_information *dip;
+       int i;
+
+       /*
+        * stop trace so we can reap currently produced data
+        */
+       for_each_dip(dip, i) {
+               if (dip->fd == -1)
+                       continue;
+               if (ioctl(dip->fd, BLKTRACESTOP) < 0)
+                       perror("BLKTRACESTOP");
+       }
+
        done = 1;
 }
 
@@ -345,8 +373,11 @@ static void stop_trace(struct device_information *dip)
        if (dip_tracing(dip) || kill_running_trace) {
                dip_set_tracing(dip, 0);
 
-               if (ioctl(dip->fd, BLKTRACESTOP) < 0)
-                       perror("BLKTRACESTOP");
+               /*
+                * should be stopped, just don't complain if it isn't
+                */
+               ioctl(dip->fd, BLKTRACESTOP);
+
                if (ioctl(dip->fd, BLKTRACETEARDOWN) < 0)
                        perror("BLKTRACETEARDOWN");
 
@@ -437,12 +468,7 @@ static int read_data_net(struct thread_information *tip, void *buf,
 static int read_data(struct thread_information *tip, void *buf,
                     unsigned int len)
 {
-       int ret = tip->read_data(tip, buf, len);
-
-       if (ret > 0)
-               tip->data_read += ret;
-
-       return ret;
+       return tip->read_data(tip, buf, len);
 }
 
 static inline struct tip_subbuf *
@@ -517,6 +543,7 @@ static int mmap_subbuf(struct thread_information *tip, unsigned int maxlen)
 
        ret = read_data(tip, tip->fs_buf + tip->fs_off, maxlen);
        if (ret >= 0) {
+               tip->data_read += ret;
                tip->fs_size += ret;
                tip->fs_off += ret;
                return 0;
@@ -539,24 +566,60 @@ static int get_subbuf(struct thread_information *tip, unsigned int maxlen)
        ret = read_data(tip, ts->buf, ts->max_len);
        if (ret > 0) {
                ts->len = ret;
-               return subbuf_fifo_queue(tip, ts);
+               tip->data_read += ret;
+               if (subbuf_fifo_queue(tip, ts))
+                       return -1;
        }
 
        return ret;
 }
 
+static int get_subbuf_sendfile(struct thread_information *tip,
+                              unsigned int maxlen)
+{
+       struct tip_subbuf *ts;
+
+       wait_for_data(tip);
+
+       /*
+        * hack to get last data out, we can't use sendfile for that
+        */
+       if (is_done())
+               return get_subbuf(tip, maxlen);
+
+       if (tip->sendfile_pending) {
+               usleep(100);
+               return 0;
+       }
+
+       ts = malloc(sizeof(*ts));
+       ts->buf = NULL;
+       ts->max_len = 0;
+
+       if (subbuf_fifo_queue(tip, ts))
+               return -1;
+
+       tip->sendfile_pending++;
+       return buf_size;
+}
+
 static void close_thread(struct thread_information *tip)
 {
        if (tip->fd != -1)
                close(tip->fd);
+       if (tip->pfd != -1)
+               close(tip->pfd);
        if (tip->ofile)
                fclose(tip->ofile);
        if (tip->ofile_buffer)
                free(tip->ofile_buffer);
        if (tip->fd_buf)
                free(tip->fd_buf);
+       if (tip->pfd_buf)
+               free(tip->pfd_buf);
 
        tip->fd = -1;
+       tip->pfd = -1;
        tip->ofile = NULL;
        tip->ofile_buffer = NULL;
        tip->fd_buf = NULL;
@@ -601,11 +664,32 @@ static void *thread_main(void *arg)
                exit_trace(1);
        }
 
+       if (net_mode == Net_client && net_use_sendfile) {
+               char tmp[MAXPATHLEN + 64];
+
+               snprintf(tmp, sizeof(tmp), "%s/block/%s/trace%d.padding",
+                        relay_path, tip->device->buts_name, tip->cpu);
+
+               tip->pfd = open(tmp, O_RDONLY);
+               if (tip->pfd < 0) {
+                       fprintf(stderr, "Couldn't open padding file %s\n", tmp);
+                       exit_trace(1);
+               }
+
+               tip->pfd_buf = malloc(buf_nr * sizeof(size_t));
+       }
+
        while (!is_done()) {
-               if (tip->get_subbuf(tip, buf_size))
+               if (tip->get_subbuf(tip, buf_size) < 0)
                        break;
        }
 
+       /*
+        * trace is stopped, pull data until we get a short read
+        */
+       while (tip->get_subbuf(tip, buf_size) > 0)
+               ;
+
        tip_ftrunc_final(tip);
        tip->exited = 1;
        return NULL;
@@ -630,8 +714,7 @@ static int write_data_net(int fd, void *buf, unsigned int buf_len)
        return 0;
 }
 
-static int flush_subbuf_net(struct thread_information *tip,
-                           struct tip_subbuf *ts)
+static int net_send_header(struct thread_information *tip, unsigned int len)
 {
        struct blktrace_net_hdr hdr;
 
@@ -639,11 +722,31 @@ static int flush_subbuf_net(struct thread_information *tip,
        strcpy(hdr.buts_name, tip->device->buts_name);
        hdr.cpu = tip->cpu;
        hdr.max_cpus = ncpus;
-       hdr.len = ts->len;
+       hdr.len = len;
 
-       if (write_data_net(net_out_fd, &hdr, sizeof(hdr)))
-               return 1;
+       return write_data_net(net_out_fd, &hdr, sizeof(hdr));
+}
 
+/*
+ * send header with 0 length to signal end-of-run
+ */
+static void net_client_send_close(void)
+{
+       struct blktrace_net_hdr hdr;
+
+       hdr.magic = BLK_IO_TRACE_MAGIC;
+       hdr.cpu = 0;
+       hdr.max_cpus = ncpus;
+       hdr.len = 0;
+
+       write_data_net(net_out_fd, &hdr, sizeof(hdr));
+}
+
+static int flush_subbuf_net(struct thread_information *tip,
+                           struct tip_subbuf *ts)
+{
+       if (net_send_header(tip, ts->len))
+               return 1;
        if (write_data_net(net_out_fd, ts->buf, ts->len))
                return 1;
 
@@ -652,6 +755,74 @@ static int flush_subbuf_net(struct thread_information *tip,
        return 0;
 }
 
+static int net_sendfile(struct thread_information *tip, struct tip_subbuf *ts)
+{
+       int ret = sendfile(net_out_fd, tip->fd, NULL, ts->len);
+
+       if (ret < 0) {
+               perror("sendfile");
+               return 1;
+       } else if (ret < (int) ts->len) {
+               fprintf(stderr, "short sendfile send (%d of %d)\n", ret, ts->len);
+               return 1;
+       }
+
+       return 0;
+}
+
+static int get_subbuf_padding(struct thread_information *tip, off_t off)
+{
+       int padding_size = buf_nr * sizeof(size_t);
+       int ret;
+
+       ret = read(tip->pfd, tip->pfd_buf, padding_size);
+       if (ret == padding_size) {
+               int subbuf = (off / buf_size) % buf_nr;
+
+               ret = tip->pfd_buf[subbuf];
+       } else if (ret < 0)
+               perror("tip pad read");
+       else {
+               fprintf(stderr, "bad pad size read\n");
+               ret = -1;
+       }
+
+       return ret;
+}
+
+static int flush_subbuf_sendfile(struct thread_information *tip,
+                                struct tip_subbuf *ts)
+{
+       int pad, ret = 1;
+
+       /*
+        * currently we cannot use sendfile() on the last bytes read, as they
+        * may not be a full subbuffer. get_subbuf_sendfile() falls back to
+        * the read approach for those, so use send() to ship them out
+        */
+       if (ts->buf)
+               return flush_subbuf_net(tip, ts);
+       
+       pad = get_subbuf_padding(tip, tip->ofile_offset);
+       if (pad == -1)
+               goto err;
+
+       ts->len = buf_size - pad;
+
+       if (net_send_header(tip, ts->len))
+               goto err;
+       if (net_sendfile(tip, ts))
+               goto err;
+
+       tip->data_read += ts->len;
+       tip->ofile_offset += buf_size;
+       ret = 0;
+err:
+       tip->sendfile_pending--;
+       free(ts);
+       return ret;
+}
+
 static int write_data(struct thread_information *tip, void *buf,
                      unsigned int buf_len)
 {
@@ -817,19 +988,45 @@ static void wait_for_threads(void)
                                        tips_running += !tip->exited;
                } while (tips_running);
        }
+
+       if (net_mode == Net_client)
+               net_client_send_close();
 }
 
-static void fill_ofname(char *dst, char *buts_name, int cpu)
+static int fill_ofname(struct thread_information *tip, char *dst,
+                      char *buts_name)
 {
+       struct stat sb;
        int len = 0;
+       time_t t;
 
        if (output_dir)
                len = sprintf(dst, "%s/", output_dir);
 
+       if (net_mode == Net_server) {
+               len += sprintf(dst + len, "%s-", inet_ntoa(tip->cl_in_addr));
+               time(&t);
+               len += strftime(dst + len, 64, "%F-%T/", gmtime(&t));
+       }
+
+       if (stat(dst, &sb) < 0) {
+               if (errno != ENOENT) {
+                       perror("stat");
+                       return 1;
+               }
+               if (mkdir(dst, 0755) < 0) {
+                       perror(dst);
+                       fprintf(stderr, "Can't make output dir\n");
+                       return 1;
+               }
+       }
+
        if (output_name)
-               sprintf(dst + len, "%s.blktrace.%d", output_name, cpu);
+               sprintf(dst + len, "%s.blktrace.%d", output_name, tip->cpu);
        else
-               sprintf(dst + len, "%s.blktrace.%d", buts_name, cpu);
+               sprintf(dst + len, "%s.blktrace.%d", buts_name, tip->cpu);
+
+       return 0;
 }
 
 static void fill_ops(struct thread_information *tip)
@@ -837,64 +1034,90 @@ static void fill_ops(struct thread_information *tip)
        /*
         * setup ops
         */
-       if (tip->ofile_mmap && net_mode != Net_client)
-               tip->get_subbuf = mmap_subbuf;
-       else
-               tip->get_subbuf = get_subbuf;
+       if (net_mode == Net_client) {
+               if (net_use_sendfile) {
+                       tip->get_subbuf = get_subbuf_sendfile;
+                       tip->flush_subbuf = flush_subbuf_sendfile;
+               } else {
+                       tip->get_subbuf = get_subbuf;
+                       tip->flush_subbuf = flush_subbuf_net;
+               }
+       } else {
+               if (tip->ofile_mmap)
+                       tip->get_subbuf = mmap_subbuf;
+               else
+                       tip->get_subbuf = get_subbuf;
 
-       if (net_mode == Net_client)
-               tip->flush_subbuf = flush_subbuf_net;
-       else
                tip->flush_subbuf = flush_subbuf_file;
-
+       }
+                       
        if (net_mode == Net_server)
                tip->read_data = read_data_net;
        else
                tip->read_data = read_data_file;
 }
 
+static int tip_open_output(struct device_information *dip,
+                          struct thread_information *tip)
+{
+       int pipeline = output_name && !strcmp(output_name, "-");
+       int mode, vbuf_size;
+       char op[128];
+
+       if (net_mode == Net_client) {
+               tip->ofile = NULL;
+               tip->ofile_stdout = 0;
+               tip->ofile_mmap = 0;
+               goto done;
+       } else if (pipeline) {
+               tip->ofile = fdopen(STDOUT_FILENO, "w");
+               tip->ofile_stdout = 1;
+               tip->ofile_mmap = 0;
+               mode = _IOLBF;
+               vbuf_size = 512;
+       } else {
+               if (fill_ofname(tip, op, dip->buts_name))
+                       return 1;
+               tip->ofile = fopen(op, "w+");
+               tip->ofile_stdout = 0;
+               tip->ofile_mmap = 1;
+               mode = _IOFBF;
+               vbuf_size = OFILE_BUF;
+       }
+
+       if (tip->ofile == NULL) {
+               perror(op);
+               return 1;
+       }
+
+       tip->ofile_buffer = malloc(vbuf_size);
+       if (setvbuf(tip->ofile, tip->ofile_buffer, mode, vbuf_size)) {
+               perror("setvbuf");
+               close_thread(tip);
+               return 1;
+       }
+
+done:
+       fill_ops(tip);
+       return 0;
+}
+
 static int start_threads(struct device_information *dip)
 {
        struct thread_information *tip;
-       int j, pipeline = output_name && !strcmp(output_name, "-");
-       int mode, vbuf_size;
-       char op[64];
+       int j;
 
        for_each_tip(dip, tip, j) {
                tip->cpu = j;
                tip->device = dip;
                tip->events_processed = 0;
+               tip->fd = -1;
+               tip->pfd = -1;
                memset(&tip->fifo, 0, sizeof(tip->fifo));
                tip->leftover_ts = NULL;
 
-               if (pipeline) {
-                       tip->ofile = fdopen(STDOUT_FILENO, "w");
-                       tip->ofile_stdout = 1;
-                       tip->ofile_mmap = 0;
-                       mode = _IOLBF;
-                       vbuf_size = 512;
-               } else {
-                       fill_ofname(op, dip->buts_name, tip->cpu);
-                       tip->ofile = fopen(op, "w+");
-                       tip->ofile_stdout = 0;
-                       tip->ofile_mmap = 1;
-                       mode = _IOFBF;
-                       vbuf_size = OFILE_BUF;
-               }
-
-               if (tip->ofile == NULL) {
-                       perror(op);
-                       return 1;
-               }
-
-               tip->ofile_buffer = malloc(vbuf_size);
-               if (setvbuf(tip->ofile, tip->ofile_buffer, mode, vbuf_size)) {
-                       perror("setvbuf");
-                       close_thread(tip);
+               if (tip_open_output(dip, tip))
                        return 1;
-               }
-
-               fill_ops(tip);
 
                if (pthread_create(&tip->thread, NULL, thread_main, tip)) {
                        perror("pthread_create");
@@ -1051,7 +1274,7 @@ static void show_stats(void)
                        if (!no_stdout)
                                printf("  CPU%3d: %20lu events, %8llu KiB data\n",
                                        tip->cpu, tip->events_processed,
-                                       tip->data_read >> 10);
+                                       (tip->data_read + 1023) >> 10);
                        events_processed += tip->events_processed;
                        data_read += tip->data_read;
                }
@@ -1059,14 +1282,15 @@ static void show_stats(void)
                if (!no_stdout)
                        printf("  Total:  %20llu events (dropped %lu), %8llu KiB data\n",
                                        events_processed, dip->drop_count,
-                                       data_read >> 10);
+                                       (data_read + 1023) >> 10);
        }
 
        if (total_drops)
                fprintf(stderr, "You have dropped events, consider using a larger buffer size (-b)\n");
 }
 
-static struct device_information *net_get_dip(char *buts_name)
+static struct device_information *net_get_dip(char *buts_name,
+                                             struct in_addr *cl_in_addr)
 {
        struct device_information *dip;
        int i;
@@ -1080,8 +1304,10 @@ static struct device_information *net_get_dip(char *buts_name)
 
        device_information = realloc(device_information, (ndevs + 1) * sizeof(*dip));
        dip = &device_information[ndevs];
+       memset(dip, 0, sizeof(*dip));
+       dip->fd = -1;
        strcpy(dip->buts_name, buts_name);
-       strcpy(dip->path, buts_name);
+       dip->path = strdup(buts_name);
        ndevs++;
        dip->threads = malloc(ncpus * sizeof(struct thread_information));
        memset(dip->threads, 0, ncpus * sizeof(struct thread_information));
@@ -1091,31 +1317,27 @@ static struct device_information *net_get_dip(char *buts_name)
         */
        for (i = 0; i < ncpus; i++) {
                struct thread_information *tip = &dip->threads[i];
-               char op[64];
 
                tip->cpu = i;
-               tip->ofile_stdout = 0;
-               tip->ofile_mmap = 1;
                tip->device = dip;
+               tip->fd = -1;
+               tip->pfd = -1;
+               tip->cl_in_addr = *cl_in_addr;
 
-               fill_ofname(op, dip->buts_name, tip->cpu);
-
-               tip->ofile = fopen(op, "w+");
-               if (!tip->ofile) {
-                       perror("fopen");
+               if (tip_open_output(dip, tip))
                        return NULL;
-               }
        }
 
        return dip;
 }
 
-static struct thread_information *net_get_tip(struct blktrace_net_hdr *bnh)
+static struct thread_information *net_get_tip(struct blktrace_net_hdr *bnh,
+                                             struct in_addr *cl_in_addr)
 {
        struct device_information *dip;
 
        ncpus = bnh->max_cpus;
-       dip = net_get_dip(bnh->buts_name);
+       dip = net_get_dip(bnh->buts_name, cl_in_addr);
        return &dip->threads[bnh->cpu];
 }
 
@@ -1145,10 +1367,10 @@ static int net_get_header(struct blktrace_net_hdr *bnh)
                }
        }
        fcntl(net_in_fd, F_SETFL, fl & ~O_NONBLOCK);
-       return 0;
+       return bytes_left;
 }
 
-static int net_server_loop(void)
+static int net_server_loop(struct in_addr *cl_in_addr)
 {
        struct thread_information *tip;
        struct blktrace_net_hdr bnh;
@@ -1162,11 +1384,25 @@ static int net_server_loop(void)
        }
 
        if (!data_is_native) {
+               bnh.magic = be32_to_cpu(bnh.magic);
                bnh.cpu = be32_to_cpu(bnh.cpu);
                bnh.len = be32_to_cpu(bnh.len);
        }
 
-       tip = net_get_tip(&bnh);
+       if ((bnh.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
+               fprintf(stderr, "server: bad data magic\n");
+               return 1;
+       }
+
+       /*
+        * len == 0 means that the other end signalled end-of-run
+        */
+       if (!bnh.len) {
+               fprintf(stderr, "server: end of run\n");
+               return 1;
+       }
+
+       tip = net_get_tip(&bnh, cl_in_addr);
        if (!tip)
                return 1;
 
@@ -1182,6 +1418,8 @@ static int net_server_loop(void)
  */
 static int net_server(void)
 {
+       struct device_information *dip;
+       struct thread_information *tip;
        struct sockaddr_in addr;
        socklen_t socklen;
        int fd, opt, i, j;
@@ -1213,6 +1451,12 @@ static int net_server(void)
                return 1;
        }
 
+repeat:
+       signal(SIGINT, NULL);
+       signal(SIGHUP, NULL);
+       signal(SIGTERM, NULL);
+       signal(SIGALRM, NULL);
+
        printf("blktrace: waiting for incoming connection...\n");
 
        socklen = sizeof(addr);
@@ -1227,22 +1471,41 @@ static int net_server(void)
        signal(SIGTERM, handle_sigint);
        signal(SIGALRM, handle_sigint);
 
-       printf("blktrace: connected!\n");
+       printf("blktrace: connection from %s\n", inet_ntoa(addr.sin_addr));
 
        while (!is_done()) {
-               if (net_server_loop())
+               if (net_server_loop(&addr.sin_addr))
                        break;
        }
 
-       for (i = 0; i < ndevs; i++) {
-               struct device_information *dip = &device_information[i];
+       for_each_dip(dip, i)
+               for_each_tip(dip, tip, j)
+                       tip_ftrunc_final(tip);
+
+       show_stats();
+
+       if (is_done())
+               return 0;
 
-               for (j = 0; j < ncpus; j++)
-                       tip_ftrunc_final(&dip->threads[j]);
+       /*
+        * cleanup for next run
+        */
+       for_each_dip(dip, i) {
+               for_each_tip(dip, tip, j)
+                       fclose(tip->ofile);
+
+               free(dip->threads);
+               free(dip->path);
        }
 
-       show_stats();
-       return 0;
+       free(device_information);
+       device_information = NULL;
+       ncpus = ndevs = 0;
+
+       close(net_in_fd);
+       net_in_fd = -1;
+       stat_shown = 0;
+       goto repeat;
 }
 
 /*
@@ -1299,7 +1562,11 @@ static char usage_str[] = \
        "\t-A Give trace mask as a single value. See documentation\n" \
        "\t-b Sub buffer size in KiB\n" \
        "\t-n Number of sub buffers\n" \
-       "\t-v Print program version info\n\n";
+       "\t-l Run in network listen mode (blktrace server)\n" \
+       "\t-h Run in network client mode, connecting to the given host\n" \
+       "\t-p Network port to use (default 8462)\n" \
+       "\t-s Make the network client use sendfile() to transfer data\n" \
+       "\t-V Print program version info\n\n";
 
 static void show_usage(char *program)
 {
@@ -1394,6 +1661,9 @@ int main(int argc, char *argv[])
                case 'p':
                        net_port = atoi(optarg);
                        break;
+               case 's':
+                       net_use_sendfile = 1;
+                       break;
                default:
                        show_usage(argv[0]);
                        return 1;