X-Git-Url: https://git.kernel.dk/?p=fio.git;a=blobdiff_plain;f=blktrace.c;h=efe9ce248a8c7f861a6aa5dbec9163f6d51ca90d;hp=8d288b080392063775c8ad477f57ef7c8bac5791;hb=7f125e7f3879d23e79bc2ef5eed678ddab3b5c70;hpb=a5f29b4aaa7fc75a36e10928f723186a3ef4368c

diff --git a/blktrace.c b/blktrace.c
index 8d288b08..efe9ce24 100644
--- a/blktrace.c
+++ b/blktrace.c
@@ -3,13 +3,13 @@
  */
 #include <stdio.h>
 #include <stdlib.h>
-#include <sys/stat.h>
-#include <dirent.h>
+#include <unistd.h>
 
 #include "flist.h"
 #include "fio.h"
+#include "blktrace.h"
 #include "blktrace_api.h"
-#include "lib/linux-dev-lookup.h"
+#include "oslib/linux-dev-lookup.h"
 
 #define TRACE_FIFO_SIZE	8192
 
@@ -72,29 +72,29 @@ static int discard_pdu(struct thread_data *td, struct fifo *fifo, int fd,
  * Check if this is a blktrace binary data file. We read a single trace
  * into memory and check for the magic signature.
  */
-int is_blktrace(const char *filename, int *need_swap)
+bool is_blktrace(const char *filename, int *need_swap)
 {
 	struct blk_io_trace t;
 	int fd, ret;
 
 	fd = open(filename, O_RDONLY);
 	if (fd < 0)
-		return 0;
+		return false;
 
 	ret = read(fd, &t, sizeof(t));
 	close(fd);
 
 	if (ret < 0) {
 		perror("read blktrace");
-		return 0;
+		return false;
 	} else if (ret != sizeof(t)) {
 		log_err("fio: short read on blktrace file\n");
-		return 0;
+		return false;
 	}
 
 	if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC) {
 		*need_swap = 0;
-		return 1;
+		return true;
 	}
 
 	/*
@@ -103,10 +103,10 @@ int is_blktrace(const char *filename, int *need_swap)
 	t.magic = fio_swap32(t.magic);
 	if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC) {
 		*need_swap = 1;
-		return 1;
+		return true;
 	}
 
-	return 0;
+	return false;
 }
 
 #define FMINORBITS	20
@@ -174,6 +174,14 @@ static int trace_add_file(struct thread_data *td, __u32 device)
 	return last_fileno;
 }
 
+static void t_bytes_align(struct thread_options *o, struct blk_io_trace *t)
+{
+	if (!o->replay_align)
+		return;
+
+	t->bytes = (t->bytes + o->replay_align - 1) & ~(o->replay_align - 1);
+}
+
 /*
  * Store blk_io_trace data in an ipo for later retrieval.
  */
@@ -181,14 +189,15 @@ static void store_ipo(struct thread_data *td, unsigned long long offset,
 		      unsigned int bytes, int rw, unsigned long long ttime,
 		      int fileno)
 {
-	struct io_piece *ipo = malloc(sizeof(*ipo));
+	struct io_piece *ipo;
 
+	ipo = calloc(1, sizeof(*ipo));
 	init_ipo(ipo);
 
-	/*
-	 * the 512 is wrong here, it should be the hardware sector size...
-	 */
 	ipo->offset = offset * 512;
+	if (td->o.replay_scale)
+		ipo->offset = ipo->offset / td->o.replay_scale;
+	ipo_bytes_align(td->o.replay_align, ipo);
 	ipo->len = bytes;
 	ipo->delay = ttime / 1000;
 	if (rw)
@@ -227,9 +236,13 @@ static void handle_trace_discard(struct thread_data *td,
 				 unsigned long long ttime,
 				 unsigned long *ios, unsigned int *bs)
 {
-	struct io_piece *ipo = malloc(sizeof(*ipo));
+	struct io_piece *ipo;
 	int fileno;
 
+	if (td->o.replay_skip & (1u << DDIR_TRIM))
+		return;
+
+	ipo = calloc(1, sizeof(*ipo));
 	init_ipo(ipo);
 	fileno = trace_add_file(td, t->device);
 
@@ -239,13 +252,12 @@ static void handle_trace_discard(struct thread_data *td,
 
 	td->o.size += t->bytes;
 
-	memset(ipo, 0, sizeof(*ipo));
 	INIT_FLIST_HEAD(&ipo->list);
 
-	/*
-	 * the 512 is wrong here, it should be the hardware sector size...
- */ ipo->offset = t->sector * 512; + if (td->o.replay_scale) + ipo->offset = ipo->offset / td->o.replay_scale; + ipo_bytes_align(td->o.replay_align, ipo); ipo->len = t->bytes; ipo->delay = ttime / 1000; ipo->ddir = DDIR_TRIM; @@ -257,6 +269,11 @@ static void handle_trace_discard(struct thread_data *td, queue_io_piece(td, ipo); } +static void dump_trace(struct blk_io_trace *t) +{ + log_err("blktrace: ignoring zero byte trace: action=%x\n", t->action); +} + static void handle_trace_fs(struct thread_data *td, struct blk_io_trace *t, unsigned long long ttime, unsigned long *ios, unsigned int *bs) @@ -268,6 +285,20 @@ static void handle_trace_fs(struct thread_data *td, struct blk_io_trace *t, rw = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0; + if (rw) { + if (td->o.replay_skip & (1u << DDIR_WRITE)) + return; + } else { + if (td->o.replay_skip & (1u << DDIR_READ)) + return; + } + + if (!t->bytes) { + if (!fio_did_warn(FIO_WARN_BTRACE_ZERO)) + dump_trace(t); + return; + } + if (t->bytes > bs[rw]) bs[rw] = t->bytes; @@ -276,6 +307,28 @@ static void handle_trace_fs(struct thread_data *td, struct blk_io_trace *t, store_ipo(td, t->sector, t->bytes, rw, ttime, fileno); } +static void handle_trace_flush(struct thread_data *td, struct blk_io_trace *t, + unsigned long long ttime, unsigned long *ios) +{ + struct io_piece *ipo; + int fileno; + + if (td->o.replay_skip & (1u << DDIR_SYNC)) + return; + + ipo = calloc(1, sizeof(*ipo)); + init_ipo(ipo); + fileno = trace_add_file(td, t->device); + + ipo->delay = ttime / 1000; + ipo->ddir = DDIR_SYNC; + ipo->fileno = fileno; + + ios[DDIR_SYNC]++; + dprint(FD_BLKTRACE, "store flush delay=%lu\n", ipo->delay); + queue_io_piece(td, ipo); +} + /* * We only care for queue traces, most of the others are side effects * due to internal workings of the block layer. 
@@ -284,25 +337,35 @@ static void handle_trace(struct thread_data *td, struct blk_io_trace *t, unsigned long *ios, unsigned int *bs) { static unsigned long long last_ttime; - unsigned long long delay; + unsigned long long delay = 0; if ((t->action & 0xffff) != __BLK_TA_QUEUE) return; if (!(t->action & BLK_TC_ACT(BLK_TC_NOTIFY))) { - if (!last_ttime || td->o.no_stall) { - last_ttime = t->time; + if (!last_ttime || td->o.no_stall) delay = 0; - } else { + else if (td->o.replay_time_scale == 100) delay = t->time - last_ttime; - last_ttime = t->time; + else { + double tmp = t->time - last_ttime; + double scale; + + scale = (double) 100.0 / (double) td->o.replay_time_scale; + tmp *= scale; + delay = tmp; } + last_ttime = t->time; } + t_bytes_align(&td->o, t); + if (t->action & BLK_TC_ACT(BLK_TC_NOTIFY)) handle_trace_notify(t); else if (t->action & BLK_TC_ACT(BLK_TC_DISCARD)) handle_trace_discard(td, t, delay, ios, bs); + else if (t->action & BLK_TC_ACT(BLK_TC_FLUSH)) + handle_trace_flush(td, t, delay, ios); else handle_trace_fs(td, t, delay, ios, bs); } @@ -322,29 +385,72 @@ static void byteswap_trace(struct blk_io_trace *t) t->pdu_len = fio_swap16(t->pdu_len); } -static int t_is_write(struct blk_io_trace *t) +static bool t_is_write(struct blk_io_trace *t) { return (t->action & BLK_TC_ACT(BLK_TC_WRITE | BLK_TC_DISCARD)) != 0; } +static enum fio_ddir t_get_ddir(struct blk_io_trace *t) +{ + if (t->action & BLK_TC_ACT(BLK_TC_READ)) + return DDIR_READ; + else if (t->action & BLK_TC_ACT(BLK_TC_WRITE)) + return DDIR_WRITE; + else if (t->action & BLK_TC_ACT(BLK_TC_DISCARD)) + return DDIR_TRIM; + + return DDIR_INVAL; +} + +static void depth_inc(struct blk_io_trace *t, int *depth) +{ + enum fio_ddir ddir; + + ddir = t_get_ddir(t); + if (ddir != DDIR_INVAL) + depth[ddir]++; +} + +static void depth_dec(struct blk_io_trace *t, int *depth) +{ + enum fio_ddir ddir; + + ddir = t_get_ddir(t); + if (ddir != DDIR_INVAL) + depth[ddir]--; +} + +static void depth_end(struct blk_io_trace *t, int *this_depth, int *depth) +{ + enum fio_ddir ddir = DDIR_INVAL; + + ddir = t_get_ddir(t); + if (ddir != DDIR_INVAL) { + depth[ddir] = max(depth[ddir], this_depth[ddir]); + this_depth[ddir] = 0; + } +} + /* * Load a blktrace file by reading all the blk_io_trace entries, and storing * them as io_pieces like the fio text version would do. 
*/ -int load_blktrace(struct thread_data *td, const char *filename, int need_swap) +bool load_blktrace(struct thread_data *td, const char *filename, int need_swap) { struct blk_io_trace t; - unsigned long ios[DDIR_RWDIR_CNT], skipped_writes; - unsigned int rw_bs[DDIR_RWDIR_CNT]; + unsigned long ios[DDIR_RWDIR_SYNC_CNT] = { }; + unsigned int rw_bs[DDIR_RWDIR_CNT] = { }; + unsigned long skipped_writes; struct fifo *fifo; - int fd, i, old_state; + int fd, i, old_state, max_depth; struct fio_file *f; - int this_depth, depth; + int this_depth[DDIR_RWDIR_CNT] = { }; + int depth[DDIR_RWDIR_CNT] = { }; fd = open(filename, O_RDONLY); if (fd < 0) { td_verror(td, errno, "open blktrace file"); - return 1; + return false; } fifo = fifo_alloc(TRACE_FIFO_SIZE); @@ -352,11 +458,7 @@ int load_blktrace(struct thread_data *td, const char *filename, int need_swap) old_state = td_bump_runstate(td, TD_SETTING_UP); td->o.size = 0; - - ios[0] = ios[1] = 0; - rw_bs[0] = rw_bs[1] = 0; skipped_writes = 0; - this_depth = depth = 0; do { int ret = trace_fifo_get(td, fifo, fd, &t, sizeof(t)); @@ -392,11 +494,12 @@ int load_blktrace(struct thread_data *td, const char *filename, int need_swap) } if ((t.action & BLK_TC_ACT(BLK_TC_NOTIFY)) == 0) { if ((t.action & 0xffff) == __BLK_TA_QUEUE) - this_depth++; - else if ((t.action & 0xffff) == __BLK_TA_COMPLETE) { - depth = max(depth, this_depth); - this_depth = 0; - } + depth_inc(&t, this_depth); + else if (((t.action & 0xffff) == __BLK_TA_BACKMERGE) || + ((t.action & 0xffff) == __BLK_TA_FRONTMERGE)) + depth_dec(&t, this_depth); + else if ((t.action & 0xffff) == __BLK_TA_COMPLETE) + depth_end(&t, this_depth, depth); if (t_is_write(&t) && read_only) { skipped_writes++; @@ -407,10 +510,8 @@ int load_blktrace(struct thread_data *td, const char *filename, int need_swap) handle_trace(td, &t, ios, rw_bs); } while (1); - for (i = 0; i < td->files_index; i++) { - f = td->files[i]; + for_each_file(td, f, i) trace_add_open_close_event(td, f->fileno, FIO_LOG_CLOSE_FILE); - } fifo_free(fifo); close(fd); @@ -419,23 +520,30 @@ int load_blktrace(struct thread_data *td, const char *filename, int need_swap) if (!td->files_index) { log_err("fio: did not find replay device(s)\n"); - return 1; + return false; } /* * For stacked devices, we don't always get a COMPLETE event so * the depth grows to insane values. Limit it to something sane(r). */ - if (!depth || depth > 1024) - depth = 1024; + max_depth = 0; + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + if (depth[i] > 1024) + depth[i] = 1024; + else if (!depth[i] && ios[i]) + depth[i] = 1; + max_depth = max(depth[i], max_depth); + } if (skipped_writes) log_err("fio: %s skips replay of %lu writes due to read-only\n", td->o.name, skipped_writes); - if (!ios[DDIR_READ] && !ios[DDIR_WRITE]) { + if (!ios[DDIR_READ] && !ios[DDIR_WRITE] && !ios[DDIR_TRIM] && + !ios[DDIR_SYNC]) { log_err("fio: found no ios in blktrace data\n"); - return 1; + return false; } else if (ios[DDIR_READ] && !ios[DDIR_WRITE]) { td->o.td_ddir = TD_DDIR_READ; td->o.max_bs[DDIR_READ] = rw_bs[DDIR_READ]; @@ -451,20 +559,231 @@ int load_blktrace(struct thread_data *td, const char *filename, int need_swap) /* * We need to do direct/raw ios to the device, to avoid getting - * read-ahead in our way. + * read-ahead in our way. But only do so if the minimum block size + * is a multiple of 4k, otherwise we don't know if it's safe to do so. 
 	 */
-	td->o.odirect = 1;
+	if (!fio_option_is_set(&td->o, odirect) && !(td_min_bs(td) & 4095))
+		td->o.odirect = 1;
 
 	/*
-	 * we don't know if this option was set or not. it defaults to 1,
-	 * so we'll just guess that we should override it if it's still 1
+	 * If depth wasn't manually set, use probed depth
 	 */
-	if (td->o.iodepth != 1)
-		td->o.iodepth = depth;
+	if (!fio_option_is_set(&td->o, iodepth))
+		td->o.iodepth = td->o.iodepth_low = max_depth;
 
-	return 0;
+	return true;
 err:
 	close(fd);
 	fifo_free(fifo);
-	return 1;
+	return false;
+}
+
+static int init_merge_param_list(fio_fp64_t *vals, struct blktrace_cursor *bcs,
+				 int nr_logs, int def, size_t off)
+{
+	int i = 0, len = 0;
+
+	while (len < FIO_IO_U_LIST_MAX_LEN && vals[len].u.f != 0.0)
+		len++;
+
+	if (len && len != nr_logs)
+		return len;
+
+	for (i = 0; i < nr_logs; i++) {
+		int *val = (int *)((char *)&bcs[i] + off);
+		*val = def;
+		if (len)
+			*val = (int)vals[i].u.f;
+	}
+
+	return 0;
+
+}
+
+static int find_earliest_io(struct blktrace_cursor *bcs, int nr_logs)
+{
+	__u64 time = ~(__u64)0;
+	int idx = 0, i;
+
+	for (i = 0; i < nr_logs; i++) {
+		if (bcs[i].t.time < time) {
+			time = bcs[i].t.time;
+			idx = i;
+		}
+	}
+
+	return idx;
+}
+
+static void merge_finish_file(struct blktrace_cursor *bcs, int i, int *nr_logs)
+{
+	bcs[i].iter++;
+	if (bcs[i].iter < bcs[i].nr_iter) {
+		lseek(bcs[i].fd, 0, SEEK_SET);
+		return;
+	}
+
+	*nr_logs -= 1;
+
+	/* close file */
+	fifo_free(bcs[i].fifo);
+	close(bcs[i].fd);
+
+	/* keep active files contiguous */
+	memmove(&bcs[i], &bcs[*nr_logs], sizeof(bcs[i]));
+}
+
+static int read_trace(struct thread_data *td, struct blktrace_cursor *bc)
+{
+	int ret = 0;
+	struct blk_io_trace *t = &bc->t;
+
+read_skip:
+	/* read an io trace */
+	ret = trace_fifo_get(td, bc->fifo, bc->fd, t, sizeof(*t));
+	if (ret < 0) {
+		return ret;
+	} else if (!ret) {
+		if (!bc->length)
+			bc->length = bc->t.time;
+		return ret;
+	} else if (ret < (int) sizeof(*t)) {
+		log_err("fio: short fifo get\n");
+		return -1;
+	}
+
+	if (bc->swap)
+		byteswap_trace(t);
+
+	/* skip over actions that fio does not care about */
+	if ((t->action & 0xffff) != __BLK_TA_QUEUE ||
+	    t_get_ddir(t) == DDIR_INVAL) {
+		ret = discard_pdu(td, bc->fifo, bc->fd, t);
+		if (ret < 0) {
+			td_verror(td, ret, "blktrace lseek");
+			return ret;
+		} else if (t->pdu_len != ret) {
+			log_err("fio: discarded %d of %d\n", ret,
+				t->pdu_len);
+			return -1;
+		}
+		goto read_skip;
+	}
+
+	t->time = (t->time + bc->iter * bc->length) * bc->scalar / 100;
+
+	return ret;
+}
+
+static int write_trace(FILE *fp, struct blk_io_trace *t)
+{
+	/* pdu is not used so just write out only the io trace */
+	t->pdu_len = 0;
+	return fwrite((void *)t, sizeof(*t), 1, fp);
+}
+
+int merge_blktrace_iologs(struct thread_data *td)
+{
+	int nr_logs = get_max_str_idx(td->o.read_iolog_file);
+	struct blktrace_cursor *bcs = malloc(sizeof(struct blktrace_cursor) *
+					     nr_logs);
+	struct blktrace_cursor *bc;
+	FILE *merge_fp;
+	char *str, *ptr, *name, *merge_buf;
+	int i, ret;
+
+	ret = init_merge_param_list(td->o.merge_blktrace_scalars, bcs, nr_logs,
+				    100, offsetof(struct blktrace_cursor,
+						  scalar));
+	if (ret) {
+		log_err("fio: merge_blktrace_scalars(%d) != nr_logs(%d)\n",
+			ret, nr_logs);
+		goto err_param;
+	}
+
+	ret = init_merge_param_list(td->o.merge_blktrace_iters, bcs, nr_logs,
+				    1, offsetof(struct blktrace_cursor,
+						nr_iter));
+	if (ret) {
+		log_err("fio: merge_blktrace_iters(%d) != nr_logs(%d)\n",
+			ret, nr_logs);
+		goto err_param;
+	}
+
+	/* setup output file */
+	merge_fp = fopen(td->o.merge_blktrace_file, "w");
+	merge_buf = malloc(128 * 1024);
+	ret = setvbuf(merge_fp, merge_buf, _IOFBF, 128 * 1024);
+	if (ret)
+		goto err_out_file;
+
+	/* setup input files */
+	str = ptr = strdup(td->o.read_iolog_file);
+	nr_logs = 0;
+	for (i = 0; (name = get_next_str(&ptr)) != NULL; i++) {
+		bcs[i].fd = open(name, O_RDONLY);
+		if (bcs[i].fd < 0) {
+			log_err("fio: could not open file: %s\n", name);
+			ret = bcs[i].fd;
+			goto err_file;
+		}
+		bcs[i].fifo = fifo_alloc(TRACE_FIFO_SIZE);
+		nr_logs++;
+
+		if (!is_blktrace(name, &bcs[i].swap)) {
+			log_err("fio: file is not a blktrace: %s\n", name);
+			goto err_file;
+		}
+
+		ret = read_trace(td, &bcs[i]);
+		if (ret < 0) {
+			goto err_file;
+		} else if (!ret) {
+			merge_finish_file(bcs, i, &nr_logs);
+			i--;
+		}
+	}
+	free(str);
+
+	/* merge files */
+	while (nr_logs) {
+		i = find_earliest_io(bcs, nr_logs);
+		bc = &bcs[i];
+		/* skip over the pdu */
+		ret = discard_pdu(td, bc->fifo, bc->fd, &bc->t);
+		if (ret < 0) {
+			td_verror(td, ret, "blktrace lseek");
+			goto err_file;
+		} else if (bc->t.pdu_len != ret) {
+			log_err("fio: discarded %d of %d\n", ret,
+				bc->t.pdu_len);
+			goto err_file;
+		}
+
+		ret = write_trace(merge_fp, &bc->t);
+		ret = read_trace(td, bc);
+		if (ret < 0)
+			goto err_file;
+		else if (!ret)
+			merge_finish_file(bcs, i, &nr_logs);
+	}
+
+	/* set iolog file to read from the newly merged file */
+	td->o.read_iolog_file = td->o.merge_blktrace_file;
+	ret = 0;
+
+err_file:
+	/* cleanup */
+	for (i = 0; i < nr_logs; i++) {
+		fifo_free(bcs[i].fifo);
+		close(bcs[i].fd);
+	}
+err_out_file:
+	fflush(merge_fp);
+	fclose(merge_fp);
+	free(merge_buf);
+err_param:
+	free(bcs);
+
+	return ret;
 }
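
Two pieces of arithmetic in this patch are easy to misread: the request-size
rounding in t_bytes_align()/ipo_bytes_align(), and the timestamp rewrite in
read_trace() that lets several traces (or several iterations of one trace) be
merged on a single timeline. The standalone sketch below is illustrative
only: align_up() and merge_time() are made-up names, not fio functions, and
it assumes replay_align is a power of two, which the mask trick requires.

	#include <assert.h>
	#include <stdint.h>

	/*
	 * Mirror of the rounding in t_bytes_align(): round bytes up to the
	 * next multiple of align. Only correct for power-of-two align,
	 * since ~(align - 1) is a clean mask only in that case.
	 */
	static uint32_t align_up(uint32_t bytes, uint32_t align)
	{
		if (!align)
			return bytes;
		return (bytes + align - 1) & ~(align - 1);
	}

	/*
	 * Mirror of the timestamp rewrite in read_trace(): shift by one
	 * whole log length per completed iteration so repeats do not
	 * overlap in time, then apply the per-log percentage scalar
	 * (100 leaves the timestamp unchanged, 50 compresses it 2x).
	 */
	static uint64_t merge_time(uint64_t time, uint64_t iter,
				   uint64_t length, uint64_t scalar)
	{
		return (time + iter * length) * scalar / 100;
	}

	int main(void)
	{
		assert(align_up(4100, 4096) == 8192);	/* rounds up */
		assert(align_up(8192, 4096) == 8192);	/* already aligned */
		/* 2nd iteration of a 5000ns log, replayed at 2x speed */
		assert(merge_time(1000, 2, 5000, 50) == 5500);
		return 0;
	}

Note that read_trace() learns a log's length lazily: the first time a cursor
hits end-of-file, bc->length is captured from the last trace's timestamp, and
subsequent iterations of that log are offset by that amount.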