t/io_uring: only calculate per-file depth if we have files
[fio.git] / iolog.c
diff --git a/iolog.c b/iolog.c
index 18ae4369dcad96190bac5e3974366f6be41d413d..96af4f33e186dd4ec1a02b851a1c7ae4c0b1eaeb 100644 (file)
--- a/iolog.c
+++ b/iolog.c
@@ -4,7 +4,6 @@
  */
 #include <stdio.h>
 #include <stdlib.h>
-#include <libgen.h>
 #include <assert.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 
 #include "flist.h"
 #include "fio.h"
-#include "verify.h"
 #include "trim.h"
 #include "filelock.h"
 #include "smalloc.h"
 #include "blktrace.h"
+#include "pshared.h"
+#include "lib/roundup.h"
+
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/un.h>
 
 static int iolog_flush(struct io_log *log);
 
 static const char iolog_ver2[] = "fio version 2 iolog";
+static const char iolog_ver3[] = "fio version 3 iolog";
 
 void queue_io_piece(struct thread_data *td, struct io_piece *ipo)
 {
@@ -33,18 +41,24 @@ void queue_io_piece(struct thread_data *td, struct io_piece *ipo)
 
 void log_io_u(const struct thread_data *td, const struct io_u *io_u)
 {
+       struct timespec now;
+
        if (!td->o.write_iolog_file)
                return;
 
-       fprintf(td->iolog_f, "%s %s %llu %lu\n", io_u->file->file_name,
-                                               io_ddir_name(io_u->ddir),
-                                               io_u->offset, io_u->buflen);
+       fio_gettime(&now, NULL);
+       fprintf(td->iolog_f, "%llu %s %s %llu %llu\n",
+               (unsigned long long) utime_since_now(&td->io_log_start_time),
+               io_u->file->file_name, io_ddir_name(io_u->ddir), io_u->offset,
+               io_u->buflen);
+
 }
 
 void log_file(struct thread_data *td, struct fio_file *f,
              enum file_log_act what)
 {
        const char *act[] = { "add", "open", "close" };
+       struct timespec now;
 
        assert(what < 3);
 
@@ -58,14 +72,18 @@ void log_file(struct thread_data *td, struct fio_file *f,
        if (!td->iolog_f)
                return;
 
-       fprintf(td->iolog_f, "%s %s\n", f->file_name, act[what]);
+       fio_gettime(&now, NULL);
+       fprintf(td->iolog_f, "%llu %s %s\n",
+               (unsigned long long) utime_since_now(&td->io_log_start_time),
+               f->file_name, act[what]);
 }
 
 static void iolog_delay(struct thread_data *td, unsigned long delay)
 {
        uint64_t usec = utime_since_now(&td->last_issue);
-       uint64_t this_delay;
+       unsigned long orig_delay = delay;
        struct timespec ts;
+       int ret = 0;
 
        if (delay < td->time_offset) {
                td->time_offset = 0;
@@ -79,18 +97,20 @@ static void iolog_delay(struct thread_data *td, unsigned long delay)
        delay -= usec;
 
        fio_gettime(&ts, NULL);
-       while (delay && !td->terminate) {
-               this_delay = delay;
-               if (this_delay > 500000)
-                       this_delay = 500000;
 
-               usec_sleep(td, this_delay);
-               delay -= this_delay;
+       while (delay && !td->terminate) {
+               ret = io_u_queued_complete(td, 0);
+               if (ret < 0)
+                       td_verror(td, -ret, "io_u_queued_complete");
+               if (td->flags & TD_F_REGROW_LOGS)
+                       regrow_logs(td);
+               if (utime_since_now(&ts) > delay)
+                       break;
        }
 
        usec = utime_since_now(&ts);
-       if (usec > delay)
-               td->time_offset = usec - delay;
+       if (usec > orig_delay)
+               td->time_offset = usec - orig_delay;
        else
                td->time_offset = 0;
 }
@@ -108,6 +128,10 @@ static int ipo_special(struct thread_data *td, struct io_piece *ipo)
 
        f = td->files[ipo->fileno];
 
+       if (ipo->delay)
+               iolog_delay(td, ipo->delay);
+       if (fio_fill_issue_time(td))
+               fio_gettime(&td->last_issue, NULL);
        switch (ipo->file_action) {
        case FIO_LOG_OPEN_FILE:
                if (td->o.replay_redirect && fio_file_open(f)) {
@@ -126,6 +150,11 @@ static int ipo_special(struct thread_data *td, struct io_piece *ipo)
        case FIO_LOG_UNLINK_FILE:
                td_io_unlink_file(td, f);
                break;
+       case FIO_LOG_ADD_FILE:
+               /*
+                * Nothing to do
+                */
+               break;
        default:
                log_err("fio: bad file action %d\n", ipo->file_action);
                break;
@@ -134,6 +163,26 @@ static int ipo_special(struct thread_data *td, struct io_piece *ipo)
        return 1;
 }
 
+static bool read_iolog(struct thread_data *td);
+
+unsigned long long delay_since_ttime(const struct thread_data *td,
+              unsigned long long time)
+{
+       double tmp;
+       double scale;
+       const unsigned long long *last_ttime = &td->io_log_last_ttime;
+
+       if (!*last_ttime || td->o.no_stall || time < *last_ttime)
+               return 0;
+       else if (td->o.replay_time_scale == 100)
+               return time - *last_ttime;
+
+
+       scale = (double) 100.0 / (double) td->o.replay_time_scale;
+       tmp = time - *last_ttime;
+       return tmp * scale;
+}
+
 int read_iolog_get(struct thread_data *td, struct io_u *io_u)
 {
        struct io_piece *ipo;
@@ -142,6 +191,18 @@ int read_iolog_get(struct thread_data *td, struct io_u *io_u)
        while (!flist_empty(&td->io_log_list)) {
                int ret;
 
+               if (td->o.read_iolog_chunked) {
+                       if (td->io_log_checkmark == td->io_log_current) {
+                               if (td->io_log_blktrace) {
+                                       if (!read_blktrace(td))
+                                               return 1;
+                               } else {
+                                       if (!read_iolog(td))
+                                               return 1;
+                               }
+                       }
+                       td->io_log_current--;
+               }
                ipo = flist_first_entry(&td->io_log_list, struct io_piece, list);
                flist_del(&ipo->list);
                remove_trim_entry(td, ipo);
@@ -158,10 +219,11 @@ int read_iolog_get(struct thread_data *td, struct io_u *io_u)
                io_u->ddir = ipo->ddir;
                if (ipo->ddir != DDIR_WAIT) {
                        io_u->offset = ipo->offset;
+                       io_u->verify_offset = ipo->offset;
                        io_u->buflen = ipo->len;
                        io_u->file = td->files[ipo->fileno];
                        get_file(io_u->file);
-                       dprint(FD_IO, "iolog: get %llu/%lu/%s\n", io_u->offset,
+                       dprint(FD_IO, "iolog: get %llu/%llu/%s\n", io_u->offset,
                                                io_u->buflen, io_u->file->file_name);
                        if (ipo->delay)
                                iolog_delay(td, ipo->delay);
@@ -184,7 +246,7 @@ int read_iolog_get(struct thread_data *td, struct io_u *io_u)
 void prune_io_piece_log(struct thread_data *td)
 {
        struct io_piece *ipo;
-       struct rb_node *n;
+       struct fio_rb_node *n;
 
        while ((n = rb_first(&td->io_hist_tree)) != NULL) {
                ipo = rb_entry(n, struct io_piece, rb_node);
@@ -208,10 +270,10 @@ void prune_io_piece_log(struct thread_data *td)
  */
 void log_io_piece(struct thread_data *td, struct io_u *io_u)
 {
-       struct rb_node **p, *parent;
+       struct fio_rb_node **p, *parent;
        struct io_piece *ipo, *__ipo;
 
-       ipo = malloc(sizeof(struct io_piece));
+       ipo = calloc(1, sizeof(struct io_piece));
        init_ipo(ipo);
        ipo->file = io_u->file;
        ipo->offset = io_u->offset;
@@ -227,21 +289,11 @@ void log_io_piece(struct thread_data *td, struct io_u *io_u)
        }
 
        /*
-        * We don't need to sort the entries, if:
-        *
-        *      Sequential writes, or
-        *      Random writes that lay out the file as it goes along
-        *
-        * For both these cases, just reading back data in the order we
-        * wrote it out is the fastest.
-        *
-        * One exception is if we don't have a random map AND we are doing
-        * verifies, in that case we need to check for duplicate blocks and
-        * drop the old one, which we rely on the rb insert/lookup for
-        * handling.
+        * Only sort writes if we don't have a random map in which case we need
+        * to check for duplicate blocks and drop the old one, which we rely on
+        * the rb insert/lookup for handling.
         */
-       if (((!td->o.verifysort) || !td_random(td) || !td->o.overwrite) &&
-             (file_randommap(td, ipo->file) || td->o.verify == VERIFY_NONE)) {
+       if (file_randommap(td, ipo->file)) {
                INIT_FLIST_HEAD(&ipo->list);
                flist_add_tail(&ipo->list, &td->io_hist_list);
                ipo->flags |= IP_F_ONLIST;
@@ -325,7 +377,7 @@ void unlog_io_piece(struct thread_data *td, struct io_u *io_u)
        td->io_hist_len--;
 }
 
-void trim_io_piece(struct thread_data *td, const struct io_u *io_u)
+void trim_io_piece(const struct io_u *io_u)
 {
        struct io_piece *ipo = io_u->ipo;
 
@@ -337,6 +389,9 @@ void trim_io_piece(struct thread_data *td, const struct io_u *io_u)
 
 void write_iolog_close(struct thread_data *td)
 {
+       if (!td->iolog_f)
+               return;
+
        fflush(td->iolog_f);
        fclose(td->iolog_f);
        free(td->iolog_buf);
@@ -344,20 +399,61 @@ void write_iolog_close(struct thread_data *td)
        td->iolog_buf = NULL;
 }
 
+int64_t iolog_items_to_fetch(struct thread_data *td)
+{
+       struct timespec now;
+       uint64_t elapsed;
+       uint64_t for_1s;
+       int64_t items_to_fetch;
+
+       if (!td->io_log_highmark)
+               return 10;
+
+
+       fio_gettime(&now, NULL);
+       elapsed = ntime_since(&td->io_log_highmark_time, &now);
+       if (elapsed) {
+               for_1s = (td->io_log_highmark - td->io_log_current) * 1000000000 / elapsed;
+               items_to_fetch = for_1s - td->io_log_current;
+               if (items_to_fetch < 0)
+                       items_to_fetch = 0;
+       } else
+               items_to_fetch = 0;
+
+       td->io_log_highmark = td->io_log_current + items_to_fetch;
+       td->io_log_checkmark = (td->io_log_highmark + 1) / 2;
+       fio_gettime(&td->io_log_highmark_time, NULL);
+
+       return items_to_fetch;
+}
+
+#define io_act(_td, _r) (((_td)->io_log_version == 3 && (r) == 5) || \
+                                       ((_td)->io_log_version == 2 && (r) == 4))
+#define file_act(_td, _r) (((_td)->io_log_version == 3 && (r) == 3) || \
+                                       ((_td)->io_log_version == 2 && (r) == 2))
+
 /*
- * Read version 2 iolog data. It is enhanced to include per-file logging,
+ * Read version 2 and 3 iolog data. It is enhanced to include per-file logging,
  * syncs, etc.
  */
-static int read_iolog2(struct thread_data *td, FILE *f)
+static bool read_iolog(struct thread_data *td)
 {
        unsigned long long offset;
        unsigned int bytes;
-       int reads, writes, waits, fileno = 0, file_action = 0; /* stupid gcc */
+       unsigned long long delay = 0;
+       int reads, writes, trims, waits, fileno = 0, file_action = 0; /* stupid gcc */
        char *rfname, *fname, *act;
        char *str, *p;
        enum fio_ddir rw;
-
-       free_release_files(td);
+       bool realloc = false;
+       int64_t items_to_fetch = 0;
+       int syncs;
+
+       if (td->o.read_iolog_chunked) {
+               items_to_fetch = iolog_items_to_fetch(td);
+               if (!items_to_fetch)
+                       return true;
+       }
 
        /*
         * Read in the read iolog and store it, reuse the infrastructure
@@ -367,40 +463,62 @@ static int read_iolog2(struct thread_data *td, FILE *f)
        rfname = fname = malloc(256+16);
        act = malloc(256+16);
 
-       reads = writes = waits = 0;
-       while ((p = fgets(str, 4096, f)) != NULL) {
+       syncs = reads = writes = trims = waits = 0;
+       while ((p = fgets(str, 4096, td->io_log_rfile)) != NULL) {
                struct io_piece *ipo;
                int r;
+               unsigned long long ttime;
 
-               r = sscanf(p, "%256s %256s %llu %u", rfname, act, &offset,
-                                                                       &bytes);
+               if (td->io_log_version == 3) {
+                       r = sscanf(p, "%llu %256s %256s %llu %u", &ttime, rfname, act,
+                                                       &offset, &bytes);
+                       delay = delay_since_ttime(td, ttime);
+                       td->io_log_last_ttime = ttime;
+                       /*
+                        * "wait" is not allowed with version 3
+                        */
+                       if (!strcmp(act, "wait")) {
+                               log_err("iolog: ignoring wait command with"
+                                       " version 3 for file %s\n", fname);
+                               continue;
+                       }
+               } else /* version 2 */
+                       r = sscanf(p, "%256s %256s %llu %u", rfname, act, &offset, &bytes);
 
                if (td->o.replay_redirect)
                        fname = td->o.replay_redirect;
 
-               if (r == 4) {
+               if (io_act(td, r)) {
                        /*
                         * Check action first
                         */
                        if (!strcmp(act, "wait"))
                                rw = DDIR_WAIT;
-                       else if (!strcmp(act, "read"))
+                       else if (!strcmp(act, "read")) {
+                               if (td->o.replay_skip & (1u << DDIR_READ))
+                                       continue;
                                rw = DDIR_READ;
-                       else if (!strcmp(act, "write"))
+                       } else if (!strcmp(act, "write")) {
+                               if (td->o.replay_skip & (1u << DDIR_WRITE))
+                                       continue;
                                rw = DDIR_WRITE;
-                       else if (!strcmp(act, "sync"))
+                       } else if (!strcmp(act, "sync")) {
+                               if (td->o.replay_skip & (1u << DDIR_SYNC))
+                                       continue;
                                rw = DDIR_SYNC;
-                       else if (!strcmp(act, "datasync"))
+                       else if (!strcmp(act, "datasync"))
                                rw = DDIR_DATASYNC;
-                       else if (!strcmp(act, "trim"))
+                       else if (!strcmp(act, "trim")) {
+                               if (td->o.replay_skip & (1u << DDIR_TRIM))
+                                       continue;
                                rw = DDIR_TRIM;
-                       else {
+                       else {
                                log_err("fio: bad iolog file action: %s\n",
                                                                        act);
                                continue;
                        }
                        fileno = get_fileno(td, fname);
-               } else if (r == 2) {
+               } else if (file_act(td, r)) {
                        rw = DDIR_INVAL;
                        if (!strcmp(act, "add")) {
                                if (td->o.replay_redirect &&
@@ -408,10 +526,9 @@ static int read_iolog2(struct thread_data *td, FILE *f)
                                        dprint(FD_FILE, "iolog: ignoring"
                                                " re-add of file %s\n", fname);
                                } else {
-                                       fileno = add_file(td, fname, 0, 1);
+                                       fileno = add_file(td, fname, td->subjob_number, 1);
                                        file_action = FIO_LOG_ADD_FILE;
                                }
-                               continue;
                        } else if (!strcmp(act, "open")) {
                                fileno = get_fileno(td, fname);
                                file_action = FIO_LOG_OPEN_FILE;
@@ -424,7 +541,7 @@ static int read_iolog2(struct thread_data *td, FILE *f)
                                continue;
                        }
                } else {
-                       log_err("bad iolog2: %s\n", p);
+                       log_err("bad iolog%d: %s\n", td->io_log_version, p);
                        continue;
                }
 
@@ -437,12 +554,21 @@ static int read_iolog2(struct thread_data *td, FILE *f)
                        if (read_only)
                                continue;
                        writes++;
+               } else if (rw == DDIR_TRIM) {
+                       /*
+                        * Don't add a trim for ro mode
+                        */
+                       if (read_only)
+                               continue;
+                       trims++;
                } else if (rw == DDIR_WAIT) {
                        if (td->o.no_stall)
                                continue;
                        waits++;
                } else if (rw == DDIR_INVAL) {
-               } else if (!ddir_sync(rw)) {
+               } else if (ddir_sync(rw)) {
+                       syncs++;
+               } else {
                        log_err("bad ddir: %d\n", rw);
                        continue;
                }
@@ -450,9 +576,11 @@ static int read_iolog2(struct thread_data *td, FILE *f)
                /*
                 * Make note of file
                 */
-               ipo = malloc(sizeof(*ipo));
+               ipo = calloc(1, sizeof(*ipo));
                init_ipo(ipo);
                ipo->ddir = rw;
+               if (td->io_log_version == 3)
+                       ipo->delay = delay;
                if (rw == DDIR_WAIT) {
                        ipo->delay = offset;
                } else {
@@ -463,51 +591,132 @@ static int read_iolog2(struct thread_data *td, FILE *f)
                        ipo_bytes_align(td->o.replay_align, ipo);
 
                        ipo->len = bytes;
-                       if (rw != DDIR_INVAL && bytes > td->o.max_bs[rw])
+                       if (rw != DDIR_INVAL && bytes > td->o.max_bs[rw]) {
+                               realloc = true;
                                td->o.max_bs[rw] = bytes;
+                       }
                        ipo->fileno = fileno;
                        ipo->file_action = file_action;
                        td->o.size += bytes;
                }
 
                queue_io_piece(td, ipo);
+
+               if (td->o.read_iolog_chunked) {
+                       td->io_log_current++;
+                       items_to_fetch--;
+                       if (items_to_fetch == 0)
+                               break;
+               }
        }
 
        free(str);
        free(act);
        free(rfname);
 
+       if (td->o.read_iolog_chunked) {
+               td->io_log_highmark = td->io_log_current;
+               td->io_log_checkmark = (td->io_log_highmark + 1) / 2;
+               fio_gettime(&td->io_log_highmark_time, NULL);
+       }
+
        if (writes && read_only) {
                log_err("fio: <%s> skips replay of %d writes due to"
                        " read-only\n", td->o.name, writes);
                writes = 0;
        }
+       if (syncs)
+               td->flags |= TD_F_SYNCS;
 
-       if (!reads && !writes && !waits)
-               return 1;
-       else if (reads && !writes)
-               td->o.td_ddir = TD_DDIR_READ;
-       else if (!reads && writes)
-               td->o.td_ddir = TD_DDIR_WRITE;
-       else
+       if (td->o.read_iolog_chunked) {
+               if (td->io_log_current == 0) {
+                       return false;
+               }
                td->o.td_ddir = TD_DDIR_RW;
+               if (realloc && td->orig_buffer)
+               {
+                       io_u_quiesce(td);
+                       free_io_mem(td);
+                       if (init_io_u_buffers(td))
+                               return false;
+               }
+               return true;
+       }
 
-       return 0;
+       if (!reads && !writes && !waits && !trims)
+               return false;
+
+       td->o.td_ddir = 0;
+       if (reads)
+               td->o.td_ddir |= TD_DDIR_READ;
+       if (writes)
+               td->o.td_ddir |= TD_DDIR_WRITE;
+       if (trims)
+               td->o.td_ddir |= TD_DDIR_TRIM;
+
+       return true;
+}
+
+static bool is_socket(const char *path)
+{
+       struct stat buf;
+       int r;
+
+       r = stat(path, &buf);
+       if (r == -1)
+               return false;
+
+       return S_ISSOCK(buf.st_mode);
+}
+
+static int open_socket(const char *path)
+{
+       struct sockaddr_un addr;
+       int ret, fd;
+
+       fd = socket(AF_UNIX, SOCK_STREAM, 0);
+       if (fd < 0)
+               return fd;
+
+       addr.sun_family = AF_UNIX;
+       if (snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", path) >=
+           sizeof(addr.sun_path)) {
+               log_err("%s: path name %s is too long for a Unix socket\n",
+                       __func__, path);
+       }
+
+       ret = connect(fd, (const struct sockaddr *)&addr, strlen(path) + sizeof(addr.sun_family));
+       if (!ret)
+               return fd;
+
+       close(fd);
+       return -1;
 }
 
 /*
  * open iolog, check version, and call appropriate parser
  */
-static int init_iolog_read(struct thread_data *td)
+static bool init_iolog_read(struct thread_data *td, char *fname)
 {
        char buffer[256], *p;
-       FILE *f;
-       int ret;
+       FILE *f = NULL;
+
+       dprint(FD_IO, "iolog: name=%s\n", fname);
+
+       if (is_socket(fname)) {
+               int fd;
+
+               fd = open_socket(fname);
+               if (fd >= 0)
+                       f = fdopen(fd, "r");
+       } else if (!strcmp(fname, "-")) {
+               f = stdin;
+       } else
+               f = fopen(fname, "r");
 
-       f = fopen(td->o.read_iolog_file, "r");
        if (!f) {
                perror("fopen read iolog");
-               return 1;
+               return false;
        }
 
        p = fgets(buffer, sizeof(buffer), f);
@@ -515,28 +724,32 @@ static int init_iolog_read(struct thread_data *td)
                td_verror(td, errno, "iolog read");
                log_err("fio: unable to read iolog\n");
                fclose(f);
-               return 1;
+               return false;
        }
 
        /*
-        * version 2 of the iolog stores a specific string as the
+        * versions 2 and 3 of the iolog store a specific string as the
         * first line, check for that
         */
        if (!strncmp(iolog_ver2, buffer, strlen(iolog_ver2)))
-               ret = read_iolog2(td, f);
+               td->io_log_version = 2;
+       else if (!strncmp(iolog_ver3, buffer, strlen(iolog_ver3)))
+               td->io_log_version = 3;
        else {
                log_err("fio: iolog version 1 is no longer supported\n");
-               ret = 1;
+               fclose(f);
+               return false;
        }
 
-       fclose(f);
-       return ret;
+       free_release_files(td);
+       td->io_log_rfile = f;
+       return read_iolog(td);
 }
 
 /*
  * Set up a log for storing io patterns.
  */
-static int init_iolog_write(struct thread_data *td)
+static bool init_iolog_write(struct thread_data *td)
 {
        struct fio_file *ff;
        FILE *f;
@@ -545,7 +758,7 @@ static int init_iolog_write(struct thread_data *td)
        f = fopen(td->o.write_iolog_file, "a");
        if (!f) {
                perror("fopen write iolog");
-               return 1;
+               return false;
        }
 
        /*
@@ -554,13 +767,14 @@ static int init_iolog_write(struct thread_data *td)
        td->iolog_f = f;
        td->iolog_buf = malloc(8192);
        setvbuf(f, td->iolog_buf, _IOFBF, 8192);
+       fio_gettime(&td->io_log_start_time, NULL);
 
        /*
         * write our version line
         */
-       if (fprintf(f, "%s\n", iolog_ver2) < 0) {
+       if (fprintf(f, "%s\n", iolog_ver3) < 0) {
                perror("iolog init\n");
-               return 1;
+               return false;
        }
 
        /*
@@ -569,30 +783,39 @@ static int init_iolog_write(struct thread_data *td)
        for_each_file(td, ff, i)
                log_file(td, ff, FIO_LOG_ADD_FILE);
 
-       return 0;
+       return true;
 }
 
-int init_iolog(struct thread_data *td)
+bool init_iolog(struct thread_data *td)
 {
-       int ret = 0;
+       bool ret;
 
        if (td->o.read_iolog_file) {
                int need_swap;
+               char * fname = get_name_by_idx(td->o.read_iolog_file, td->subjob_number);
 
                /*
                 * Check if it's a blktrace file and load that if possible.
                 * Otherwise assume it's a normal log file and load that.
                 */
-               if (is_blktrace(td->o.read_iolog_file, &need_swap))
-                       ret = load_blktrace(td, td->o.read_iolog_file, need_swap);
-               else
-                       ret = init_iolog_read(td);
+               if (is_blktrace(fname, &need_swap)) {
+                       td->io_log_blktrace = 1;
+                       ret = init_blktrace_read(td, fname, need_swap);
+               } else {
+                       td->io_log_blktrace = 0;
+                       ret = init_iolog_read(td, fname);
+               }
+               free(fname);
        } else if (td->o.write_iolog_file)
                ret = init_iolog_write(td);
+       else
+               ret = true;
 
-       if (ret)
+       if (!ret)
                td_verror(td, EINVAL, "failed initializing iolog");
 
+       init_disk_util(td);
+
        return ret;
 }
 
@@ -608,6 +831,7 @@ void setup_log(struct io_log **log, struct log_params *p,
        INIT_FLIST_HEAD(&l->io_logs);
        l->log_type = p->log_type;
        l->log_offset = p->log_offset;
+       l->log_prio = p->log_prio;
        l->log_gz = p->log_gz;
        l->log_gz_store = p->log_gz_store;
        l->avg_msec = p->avg_msec;
@@ -627,16 +851,28 @@ void setup_log(struct io_log **log, struct log_params *p,
        }
 
        if (l->td && l->td->o.io_submit_mode != IO_MODE_OFFLOAD) {
-               struct io_logs *p;
-
-               p = calloc(1, sizeof(*l->pending));
-               p->max_samples = DEF_LOG_ENTRIES;
-               p->log = calloc(p->max_samples, log_entry_sz(l));
-               l->pending = p;
+               unsigned int def_samples = DEF_LOG_ENTRIES;
+               struct io_logs *__p;
+
+               __p = calloc(1, sizeof(*l->pending));
+               if (l->td->o.iodepth > DEF_LOG_ENTRIES)
+                       def_samples = roundup_pow2(l->td->o.iodepth);
+               __p->max_samples = def_samples;
+               __p->log = calloc(__p->max_samples, log_entry_sz(l));
+               l->pending = __p;
        }
 
        if (l->log_offset)
                l->log_ddir_mask = LOG_OFFSET_SAMPLE_BIT;
+       if (l->log_prio)
+               l->log_ddir_mask |= LOG_PRIO_SAMPLE_BIT;
+       /*
+        * The bandwidth-log option generates agg-read_bw.log,
+        * agg-write_bw.log and agg-trim_bw.log for which l->td is NULL.
+        * Check if l->td is valid before dereferencing it.
+        */
+       if (l->td && l->td->o.log_max == IO_LOG_SAMPLE_BOTH)
+               l->log_ddir_mask |= LOG_AVG_MAX_SAMPLE_BIT;
 
        INIT_FLIST_HEAD(&l->chunk_list);
 
@@ -699,10 +935,10 @@ void free_log(struct io_log *log)
        sfree(log);
 }
 
-unsigned long hist_sum(int j, int stride, unsigned int *io_u_plat,
-               unsigned int *io_u_plat_last)
+uint64_t hist_sum(int j, int stride, uint64_t *io_u_plat,
+               uint64_t *io_u_plat_last)
 {
-       unsigned long sum;
+       uint64_t sum;
        int k;
 
        if (io_u_plat_last) {
@@ -723,8 +959,8 @@ static void flush_hist_samples(FILE *f, int hist_coarseness, void *samples,
        int log_offset;
        uint64_t i, j, nr_samples;
        struct io_u_plat_entry *entry, *entry_before;
-       unsigned int *io_u_plat;
-       unsigned int *io_u_plat_before;
+       uint64_t *io_u_plat;
+       uint64_t *io_u_plat_before;
 
        int stride = 1 << hist_coarseness;
        
@@ -745,13 +981,13 @@ static void flush_hist_samples(FILE *f, int hist_coarseness, void *samples,
                entry_before = flist_first_entry(&entry->list, struct io_u_plat_entry, list);
                io_u_plat_before = entry_before->io_u_plat;
 
-               fprintf(f, "%lu, %u, %u, ", (unsigned long) s->time,
-                                               io_sample_ddir(s), s->bs);
+               fprintf(f, "%lu, %u, %llu, ", (unsigned long) s->time,
+                                               io_sample_ddir(s), (unsigned long long) s->bs);
                for (j = 0; j < FIO_IO_U_PLAT_NR - stride; j += stride) {
-                       fprintf(f, "%lu, ", hist_sum(j, stride, io_u_plat,
-                                               io_u_plat_before));
+                       fprintf(f, "%llu, ", (unsigned long long)
+                               hist_sum(j, stride, io_u_plat, io_u_plat_before));
                }
-               fprintf(f, "%lu\n", (unsigned long)
+               fprintf(f, "%llu\n", (unsigned long long)
                        hist_sum(FIO_IO_U_PLAT_NR - stride, stride, io_u_plat,
                                        io_u_plat_before));
 
@@ -763,33 +999,87 @@ static void flush_hist_samples(FILE *f, int hist_coarseness, void *samples,
 void flush_samples(FILE *f, void *samples, uint64_t sample_size)
 {
        struct io_sample *s;
-       int log_offset;
+       int log_offset, log_prio, log_avg_max;
        uint64_t i, nr_samples;
+       unsigned int prio_val;
+       const char *fmt;
 
        if (!sample_size)
                return;
 
        s = __get_sample(samples, 0, 0);
        log_offset = (s->__ddir & LOG_OFFSET_SAMPLE_BIT) != 0;
+       log_prio = (s->__ddir & LOG_PRIO_SAMPLE_BIT) != 0;
+       log_avg_max = (s->__ddir & LOG_AVG_MAX_SAMPLE_BIT) != 0;
+
+       if (log_offset) {
+               if (log_prio) {
+                       if (log_avg_max)
+                               fmt = "%" PRIu64 ", %" PRId64 ", %" PRId64 ", %u, %llu, %llu, 0x%04x\n";
+                       else
+                               fmt = "%" PRIu64 ", %" PRId64 ", %u, %llu, %llu, 0x%04x\n";
+               } else {
+                       if (log_avg_max)
+                               fmt = "%" PRIu64 ", %" PRId64 ", %" PRId64 ", %u, %llu, %llu, %u\n";
+                       else
+                               fmt = "%" PRIu64 ", %" PRId64 ", %u, %llu, %llu, %u\n";
+               }
+       } else {
+               if (log_prio) {
+                       if (log_avg_max)
+                               fmt = "%" PRIu64 ", %" PRId64 ", %" PRId64 ", %u, %llu, 0x%04x\n";
+                       else
+                               fmt = "%" PRIu64 ", %" PRId64 ", %u, %llu, 0x%04x\n";
+               } else {
+                       if (log_avg_max)
+                               fmt = "%" PRIu64 ", %" PRId64 ", %" PRId64 ", %u, %llu, %u\n";
+                       else
+                               fmt = "%" PRIu64 ", %" PRId64 ", %u, %llu, %u\n";
+               }
+       }
 
        nr_samples = sample_size / __log_entry_sz(log_offset);
 
        for (i = 0; i < nr_samples; i++) {
                s = __get_sample(samples, log_offset, i);
 
+               if (log_prio)
+                       prio_val = s->priority;
+               else
+                       prio_val = ioprio_value_is_class_rt(s->priority);
+
                if (!log_offset) {
-                       fprintf(f, "%lu, %" PRId64 ", %u, %u\n",
-                                       (unsigned long) s->time,
-                                       s->data.val,
-                                       io_sample_ddir(s), s->bs);
+                       if (log_avg_max)
+                               fprintf(f, fmt,
+                                       s->time,
+                                       s->data.val.val0,
+                                       s->data.val.val1,
+                                       io_sample_ddir(s), (unsigned long long) s->bs,
+                                       prio_val);
+                       else
+                               fprintf(f, fmt,
+                                       s->time,
+                                       s->data.val.val0,
+                                       io_sample_ddir(s), (unsigned long long) s->bs,
+                                       prio_val);
                } else {
                        struct io_sample_offset *so = (void *) s;
 
-                       fprintf(f, "%lu, %" PRId64 ", %u, %u, %llu\n",
-                                       (unsigned long) s->time,
-                                       s->data.val,
-                                       io_sample_ddir(s), s->bs,
-                                       (unsigned long long) so->offset);
+                       if (log_avg_max)
+                               fprintf(f, fmt,
+                                       s->time,
+                                       s->data.val.val0,
+                                       s->data.val.val1,
+                                       io_sample_ddir(s), (unsigned long long) s->bs,
+                                       (unsigned long long) so->offset,
+                                       prio_val);
+                       else
+                               fprintf(f, fmt,
+                                       s->time,
+                                       s->data.val.val0,
+                                       io_sample_ddir(s), (unsigned long long) s->bs,
+                                       (unsigned long long) so->offset,
+                                       prio_val);
                }
        }
 }
@@ -984,12 +1274,12 @@ int iolog_file_inflate(const char *file)
        struct iolog_compress ic;
        z_stream stream;
        struct stat sb;
-       ssize_t ret;
+       size_t ret;
        size_t total;
        void *buf;
        FILE *f;
 
-       f = fopen(file, "r");
+       f = fopen(file, "rb");
        if (!f) {
                perror("fopen");
                return 1;
@@ -1006,12 +1296,12 @@ int iolog_file_inflate(const char *file)
        ic.seq = 1;
 
        ret = fread(ic.buf, ic.len, 1, f);
-       if (ret < 0) {
+       if (ret == 0 && ferror(f)) {
                perror("fread");
                fclose(f);
                free(buf);
                return 1;
-       } else if (ret != 1) {
+       } else if (ferror(f) || (!feof(f) && ret != 1)) {
                log_err("fio: short read on reading log\n");
                fclose(f);
                free(buf);
@@ -1071,10 +1361,21 @@ void flush_log(struct io_log *log, bool do_append)
        void *buf;
        FILE *f;
 
+       /*
+        * If log_gz_store is true, we are writing a binary file.
+        * Set the mode appropriately (on all platforms) to avoid issues
+        * on windows (line-ending conversions, etc.)
+        */
        if (!do_append)
-               f = fopen(log->filename, "w");
+               if (log->log_gz_store)
+                       f = fopen(log->filename, "wb");
+               else
+                       f = fopen(log->filename, "w");
        else
-               f = fopen(log->filename, "a");
+               if (log->log_gz_store)
+                       f = fopen(log->filename, "ab");
+               else
+                       f = fopen(log->filename, "a");
        if (!f) {
                perror("fopen log");
                return;
@@ -1146,8 +1447,6 @@ size_t log_chunk_sizes(struct io_log *log)
 
 #ifdef CONFIG_ZLIB
 
-static bool warned_on_drop;
-
 static void iolog_put_deferred(struct io_log *log, void *ptr)
 {
        if (!ptr)
@@ -1157,10 +1456,8 @@ static void iolog_put_deferred(struct io_log *log, void *ptr)
        if (log->deferred < IOLOG_MAX_DEFER) {
                log->deferred_items[log->deferred] = ptr;
                log->deferred++;
-       } else if (!warned_on_drop) {
+       } else if (!fio_did_warn(FIO_WARN_IOLOG_DROP))
                log_err("fio: had to drop log entry free\n");
-               warned_on_drop = true;
-       }
        pthread_mutex_unlock(&log->deferred_free_lock);
 }
 
@@ -1349,14 +1646,14 @@ void iolog_compress_exit(struct thread_data *td)
  * Queue work item to compress the existing log entries. We reset the
  * current log to a small size, and reference the existing log in the
  * data that we queue for compression. Once compression has been done,
- * this old log is freed. If called with finish == true, will not return
- * until the log compression has completed, and will flush all previous
- * logs too
+ * this old log is freed. Will not return until the log compression
+ * has completed, and will flush all previous logs too
  */
 static int iolog_flush(struct io_log *log)
 {
        struct iolog_flush_data *data;
 
+       workqueue_flush(&log->td->log_compress_wq);
        data = malloc(sizeof(*data));
        if (!data)
                return 1;
@@ -1621,9 +1918,7 @@ void td_writeout_logs(struct thread_data *td, bool unit_logs)
 
 void fio_writeout_logs(bool unit_logs)
 {
-       struct thread_data *td;
-       int i;
-
-       for_each_td(td, i)
+       for_each_td(td) {
                td_writeout_logs(td, unit_logs);
+       } end_for_each();
 }