2 * Code related to writing an iolog of what a thread is doing, and to
3 * later read that back and replay
/* Magic first line written to, and expected at the top of, a v2 iolog file. */
20 static const char iolog_ver2[] = "fio version 2 iolog";
/*
 * One compressed chunk of log samples; chunks are chained on a flist and
 * inflated back in sequence order at flush time. (Remaining members not
 * visible in this view.)
 */
24 struct iolog_compress {
25 struct flist_head list;

/* Size in bytes of each compression chunk buffer (128 KiB). */
31 #define GZ_CHUNK 131072
/*
 * Allocate and initialise a fresh iolog_compress chunk with a GZ_CHUNK-sized
 * buffer. NOTE(review): malloc results are used unchecked here — consistent
 * with fio's abort-on-OOM convention, but worth confirming.
 */
33 static struct iolog_compress *get_new_chunk(unsigned int seq)
35 struct iolog_compress *c;
37 c = malloc(sizeof(*c));
38 INIT_FLIST_HEAD(&c->list);
39 c->buf = malloc(GZ_CHUNK);
/* Release a chunk allocated by get_new_chunk() (buffer plus container). */
45 static void free_chunk(struct iolog_compress *ic)
/*
 * Append a replay entry to the thread's io_log_list and account its length
 * toward the total amount of I/O the replay will issue.
 */
53 void queue_io_piece(struct thread_data *td, struct io_piece *ipo)
55 flist_add_tail(&ipo->list, &td->io_log_list);
56 td->total_io_size += ipo->len;
/*
 * Write one "<file> <action> <offset> <len>" line to the iolog for this io_u.
 * No-op unless write_iolog_file was requested.
 */
59 void log_io_u(struct thread_data *td, struct io_u *io_u)
61 const char *act[] = { "read", "write", "sync", "datasync",
62 "sync_file_range", "wait", "trim" };

/* ddir indexes act[]; 6 is the last valid entry ("trim"). */
64 assert(io_u->ddir <= 6);
66 if (!td->o.write_iolog_file)
69 fprintf(td->iolog_f, "%s %s %llu %lu\n", io_u->file->file_name,
70 act[io_u->ddir], io_u->offset,
/*
 * Record a file-level event (add/open/close) in the iolog as
 * "<file> <action>". Skipped when no iolog file was requested.
 */
74 void log_file(struct thread_data *td, struct fio_file *f,
75 enum file_log_act what)
77 const char *act[] = { "add", "open", "close" };
81 if (!td->o.write_iolog_file)
86 * this happens on the pre-open/close done before the job starts
91 fprintf(td->iolog_f, "%s %s\n", f->file_name, act[what]);
/*
 * Sleep for the delay recorded in the iolog entry, compensating for time
 * already spent since the last issue. Sleeps in bounded slices so the loop
 * can bail out promptly when td->terminate is set.
 */
94 static void iolog_delay(struct thread_data *td, unsigned long delay)
96 unsigned long usec = utime_since_now(&td->last_issue);
97 unsigned long this_delay;
105 * less than 100 usec delay, just regard it as noise
110 while (delay && !td->terminate) {
/* cap each individual sleep at 500ms so terminate is noticed quickly */
112 if (this_delay > 500000)
115 usec_sleep(td, this_delay);
/*
 * Handle iolog entries that are file-management actions rather than I/O
 * (open/close/unlink). Entries with a real data direction are not special
 * and are returned to the caller for normal processing.
 */
120 static int ipo_special(struct thread_data *td, struct io_piece *ipo)
/* only DDIR_INVAL entries carry a file_action */
128 if (ipo->ddir != DDIR_INVAL)
131 f = td->files[ipo->fileno];
133 switch (ipo->file_action) {
134 case FIO_LOG_OPEN_FILE:
135 ret = td_io_open_file(td, f);
138 td_verror(td, ret, "iolog open file");
140 case FIO_LOG_CLOSE_FILE:
141 td_io_close_file(td, f);
143 case FIO_LOG_UNLINK_FILE:
144 unlink(f->file_name);
147 log_err("fio: bad file action %d\n", ipo->file_action);
/*
 * Pop the next entry from the replay log and populate io_u from it.
 * File-action entries are consumed in-place via ipo_special(); WAIT entries
 * sleep until the recorded timestamp; data entries set up offset/len/file
 * and apply any recorded inter-I/O delay.
 */
154 int read_iolog_get(struct thread_data *td, struct io_u *io_u)
156 struct io_piece *ipo;
157 unsigned long elapsed;
159 while (!flist_empty(&td->io_log_list)) {
162 ipo = flist_entry(td->io_log_list.next, struct io_piece, list);
163 flist_del(&ipo->list);
164 remove_trim_entry(td, ipo);
166 ret = ipo_special(td, ipo);
170 } else if (ret > 0) {
175 io_u->ddir = ipo->ddir;
176 if (ipo->ddir != DDIR_WAIT) {
177 io_u->offset = ipo->offset;
178 io_u->buflen = ipo->len;
179 io_u->file = td->files[ipo->fileno];
180 get_file(io_u->file);
181 dprint(FD_IO, "iolog: get %llu/%lu/%s\n", io_u->offset,
182 io_u->buflen, io_u->file->file_name);
184 iolog_delay(td, ipo->delay);
/* DDIR_WAIT: delay is an absolute ms-since-genesis target */
186 elapsed = mtime_since_genesis();
187 if (ipo->delay > elapsed)
188 usec_sleep(td, (ipo->delay - elapsed) * 1000);
193 if (io_u->ddir != DDIR_WAIT)
/*
 * Drop every outstanding io_piece: first drain the rb-tree used for sorted
 * verification entries, then the plain history list. Each entry is also
 * removed from the trim list before being discarded.
 */
201 void prune_io_piece_log(struct thread_data *td)
203 struct io_piece *ipo;
206 while ((n = rb_first(&td->io_hist_tree)) != NULL) {
207 ipo = rb_entry(n, struct io_piece, rb_node);
208 rb_erase(n, &td->io_hist_tree);
209 remove_trim_entry(td, ipo);
214 while (!flist_empty(&td->io_hist_list)) {
215 ipo = flist_entry(td->io_hist_list.next, struct io_piece, list);
216 flist_del(&ipo->list);
217 remove_trim_entry(td, ipo);
224 * log a successful write, so we can unwind the log for verify
226 void log_io_piece(struct thread_data *td, struct io_u *io_u)
228 struct rb_node **p, *parent;
229 struct io_piece *ipo, *__ipo;
231 ipo = malloc(sizeof(struct io_piece));
233 ipo->file = io_u->file;
234 ipo->offset = io_u->offset;
235 ipo->len = io_u->buflen;
236 ipo->numberio = io_u->numberio;
237 ipo->flags = IP_F_IN_FLIGHT;
241 if (io_u_should_trim(td, io_u)) {
242 flist_add_tail(&ipo->trim_list, &td->trim_list);
247 * We don't need to sort the entries, if:
249 * Sequential writes, or
250 * Random writes that lay out the file as it goes along
252 * For both these cases, just reading back data in the order we
253 * wrote it out is the fastest.
255 * One exception is if we don't have a random map AND we are doing
256 * verifies, in that case we need to check for duplicate blocks and
257 * drop the old one, which we rely on the rb insert/lookup for
260 if (((!td->o.verifysort) || !td_random(td) || !td->o.overwrite) &&
261 (file_randommap(td, ipo->file) || td->o.verify == VERIFY_NONE)) {
262 INIT_FLIST_HEAD(&ipo->list);
263 flist_add_tail(&ipo->list, &td->io_hist_list);
264 ipo->flags |= IP_F_ONLIST;
269 RB_CLEAR_NODE(&ipo->rb_node);
272 * Sort the entry into the verification list
275 p = &td->io_hist_tree.rb_node;
/* standard rb-tree descent, keyed on (file, offset) */
280 __ipo = rb_entry(parent, struct io_piece, rb_node);
281 if (ipo->file < __ipo->file)
283 else if (ipo->file > __ipo->file)
285 else if (ipo->offset < __ipo->offset)
287 else if (ipo->offset > __ipo->offset)
/* exact (file, offset) match: evict the older entry, keep the new one */
290 dprint(FD_IO, "iolog: overlap %llu/%lu, %llu/%lu",
291 __ipo->offset, __ipo->len,
292 ipo->offset, ipo->len);
294 rb_erase(parent, &td->io_hist_tree);
295 remove_trim_entry(td, __ipo);
301 rb_link_node(&ipo->rb_node, parent, p);
302 rb_insert_color(&ipo->rb_node, &td->io_hist_tree);
303 ipo->flags |= IP_F_ONRB;
/*
 * Remove the io_piece attached to this io_u from whichever history
 * structure it is on (rb-tree or list), as recorded in ipo->flags.
 */
307 void unlog_io_piece(struct thread_data *td, struct io_u *io_u)
309 struct io_piece *ipo = io_u->ipo;
314 if (ipo->flags & IP_F_ONRB)
315 rb_erase(&ipo->rb_node, &td->io_hist_tree);
316 else if (ipo->flags & IP_F_ONLIST)
317 flist_del(&ipo->list);
/*
 * Shrink the logged length of a partially-completed io_u to the bytes that
 * actually transferred (xfer_buflen minus the residual).
 */
324 void trim_io_piece(struct thread_data *td, struct io_u *io_u)
326 struct io_piece *ipo = io_u->ipo;
331 ipo->len = io_u->xfer_buflen - io_u->resid;
/* Tear down the write-side iolog; the stdio buffer pointer is cleared. */
334 void write_iolog_close(struct thread_data *td)
340 td->iolog_buf = NULL;
344 * Read version 2 iolog data. It is enhanced to include per-file logging,
347 static int read_iolog2(struct thread_data *td, FILE *f)
349 unsigned long long offset;
351 int reads, writes, waits, fileno = 0, file_action = 0; /* stupid gcc */
356 free_release_files(td);
359 * Read in the read iolog and store it, reuse the infrastructure
360 * for doing verifications.
/* +16 slack over the %256s scan width (which can store up to 257 bytes) */
363 fname = malloc(256+16);
364 act = malloc(256+16);
366 reads = writes = waits = 0;
367 while ((p = fgets(str, 4096, f)) != NULL) {
368 struct io_piece *ipo;
/* 4 fields = data op line; fewer = file action line (handled below) */
371 r = sscanf(p, "%256s %256s %llu %u", fname, act, &offset,
377 if (!strcmp(act, "wait"))
379 else if (!strcmp(act, "read"))
381 else if (!strcmp(act, "write"))
383 else if (!strcmp(act, "sync"))
385 else if (!strcmp(act, "datasync"))
387 else if (!strcmp(act, "trim"))
390 log_err("fio: bad iolog file action: %s\n",
394 fileno = get_fileno(td, fname)
397 if (!strcmp(act, "add")) {
398 fileno = add_file(td, fname, 0, 1);
399 file_action = FIO_LOG_ADD_FILE;
401 } else if (!strcmp(act, "open")) {
402 fileno = get_fileno(td, fname);
403 file_action = FIO_LOG_OPEN_FILE;
404 } else if (!strcmp(act, "close")) {
405 fileno = get_fileno(td, fname);
406 file_action = FIO_LOG_CLOSE_FILE;
408 log_err("fio: bad iolog file action: %s\n",
413 log_err("bad iolog2: %s", p);
419 else if (rw == DDIR_WRITE) {
421 * Don't add a write for ro mode
426 } else if (rw == DDIR_WAIT) {
428 } else if (rw == DDIR_INVAL) {
429 } else if (!ddir_sync(rw)) {
430 log_err("bad ddir: %d\n", rw);
437 ipo = malloc(sizeof(*ipo));
440 if (rw == DDIR_WAIT) {
443 ipo->offset = offset;
/* track the largest replayed block size so buffers are sized right */
445 if (rw != DDIR_INVAL && bytes > td->o.max_bs[rw])
446 td->o.max_bs[rw] = bytes;
447 ipo->fileno = fileno;
448 ipo->file_action = file_action;
452 queue_io_piece(td, ipo);
459 if (writes && read_only) {
460 log_err("fio: <%s> skips replay of %d writes due to"
461 " read-only\n", td->o.name, writes);
/* derive the job's data direction from what the log actually contains */
465 if (!reads && !writes && !waits)
467 else if (reads && !writes)
468 td->o.td_ddir = TD_DDIR_READ;
469 else if (!reads && writes)
470 td->o.td_ddir = TD_DDIR_WRITE;
472 td->o.td_ddir = TD_DDIR_RW;
478 * open iolog, check version, and call appropriate parser
480 static int init_iolog_read(struct thread_data *td)
482 char buffer[256], *p;
486 f = fopen(td->o.read_iolog_file, "r");
488 perror("fopen read iolog");
492 p = fgets(buffer, sizeof(buffer), f);
494 td_verror(td, errno, "iolog read");
495 log_err("fio: unable to read iolog\n");
501 * version 2 of the iolog stores a specific string as the
502 * first line, check for that
504 if (!strncmp(iolog_ver2, buffer, strlen(iolog_ver2)))
505 ret = read_iolog2(td, f);
507 log_err("fio: iolog version 1 is no longer supported\n");
516 * Set up a log for storing io patterns.
518 static int init_iolog_write(struct thread_data *td)
524 f = fopen(td->o.write_iolog_file, "a");
526 perror("fopen write iolog");
531 * That's it for writing, setup a log buffer and we're done.
534 td->iolog_buf = malloc(8192);
535 setvbuf(f, td->iolog_buf, _IOFBF, 8192);
538 * write our version line
540 if (fprintf(f, "%s\n", iolog_ver2) < 0) {
/* NOTE(review): perror appends its own formatting; the embedded \n here
 * produces awkward output — consider dropping it. */
541 perror("iolog init\n");
546 * add all known files
548 for_each_file(td, ff, i)
549 log_file(td, ff, FIO_LOG_ADD_FILE);
/*
 * Entry point for iolog setup: replay path loads a blktrace or a v2 iolog,
 * record path opens the write log. Errors surface as EINVAL via td_verror.
 */
554 int init_iolog(struct thread_data *td)
558 if (td->o.read_iolog_file) {
562 * Check if it's a blktrace file and load that if possible.
563 * Otherwise assume it's a normal log file and load that.
565 if (is_blktrace(td->o.read_iolog_file, &need_swap))
566 ret = load_blktrace(td, td->o.read_iolog_file, need_swap);
568 ret = init_iolog_read(td);
569 } else if (td->o.write_iolog_file)
570 ret = init_iolog_write(td);
573 td_verror(td, EINVAL, "failed initializing iolog");
/*
 * Allocate and initialise an io_log from the supplied parameters. The
 * sample array starts at 1024 entries; entry size depends on whether
 * offsets are logged. Compression (log_gz) requires a thread context to
 * arm TD_F_COMPRESS_LOG and the chunk lock.
 */
578 void setup_log(struct io_log **log, struct log_params *p,
579 const char *filename)
581 struct io_log *l = malloc(sizeof(*l));
583 memset(l, 0, sizeof(*l));
585 l->max_samples = 1024;
586 l->log_type = p->log_type;
587 l->log_offset = p->log_offset;
588 l->log_gz = p->log_gz;
589 l->log = malloc(l->max_samples * log_entry_sz(l));
590 l->avg_msec = p->avg_msec;
591 l->filename = strdup(filename);
594 INIT_FLIST_HEAD(&l->chunk_list);
596 if (l->log_gz && !p->td)
598 else if (l->log_gz) {
599 pthread_mutex_init(&l->chunk_lock, NULL);
600 p->td->flags |= TD_F_COMPRESS_LOG;
/*
 * Optional large (1 MiB) stdio buffer for log flushing; compiled to no-ops
 * when the platform lacks setvbuf support.
 */
606 #ifdef CONFIG_SETVBUF
607 static void *set_file_buffer(FILE *f)
609 size_t size = 1048576;
613 setvbuf(f, buf, _IOFBF, size);
617 static void clear_file_buffer(void *buf)
/* no-CONFIG_SETVBUF fallbacks */
622 static void *set_file_buffer(FILE *f)
627 static void clear_file_buffer(void *buf)
/* Free an io_log and its owned allocations. */
632 void free_log(struct io_log *log)
/*
 * Print nr_samples log entries to f as CSV. With log_offset set, each entry
 * is the wider io_sample_offset layout and gets a fifth (offset) column.
 */
639 static void flush_samples(FILE *f, void *samples, uint64_t nr_samples,
644 for (i = 0; i < nr_samples; i++) {
645 struct io_sample *s = __get_sample(samples, log_offset, i);
648 fprintf(f, "%lu, %lu, %u, %u\n",
649 (unsigned long) s->time,
650 (unsigned long) s->val,
653 struct io_sample_offset *so = (void *) s;
655 fprintf(f, "%lu, %lu, %u, %u, %llu\n",
656 (unsigned long) s->time,
657 (unsigned long) s->val,
659 (unsigned long long) so->offset);
/*
 * Prepare a zlib stream for decompression. Returns non-zero on
 * inflateInit() failure.
 */
665 static int z_stream_init(z_stream *stream)
667 stream->zalloc = Z_NULL;
668 stream->zfree = Z_NULL;
669 stream->opaque = Z_NULL;
670 stream->next_in = Z_NULL;
672 if (inflateInit(stream) != Z_OK)
/* Running state carried across flush_chunk() calls while inflating a log. */
678 struct flush_chunk_iter {
/*
 * Close out the current inflate stream: end inflation, write the decoded
 * samples out, and reset the iterator buffer accounting.
 */
686 static void finish_chunk(z_stream *stream, int log_offset, FILE *f,
687 struct flush_chunk_iter *iter)
692 ret = inflateEnd(stream);
694 log_err("fio: failed to end log inflation (%d)\n", ret);
696 nr_samples = iter->buf_used / __log_entry_sz(log_offset);
697 flush_samples(f, iter->buf, nr_samples, log_offset);
700 iter->buf_size = iter->buf_used = 0;
/*
 * Inflate one compressed chunk into the iterator's buffer. A change in
 * chunk sequence number means a new deflate stream was started on the
 * write side, so the previous stream is finished and a new one begun.
 */
703 static int flush_chunk(struct iolog_compress *ic, int log_offset, FILE *f,
704 z_stream *stream, struct flush_chunk_iter *iter)
706 if (ic->seq != iter->seq) {
708 finish_chunk(stream, log_offset, f, iter);
710 z_stream_init(stream);
714 stream->avail_in = ic->len;
715 stream->next_in = ic->buf;
717 if (!iter->buf_size) {
718 iter->buf_size = iter->chunk_sz;
719 iter->buf = malloc(iter->buf_size);
722 while (stream->avail_in) {
725 stream->avail_out = iter->buf_size - iter->buf_used;
726 stream->next_out = iter->buf + iter->buf_used;
728 err = inflate(stream, Z_NO_FLUSH);
730 log_err("fio: failed inflating log: %d\n", err);
/* advance by the bytes inflate actually produced this pass */
734 iter->buf_used += iter->buf_size - iter->buf_used - stream->avail_out;
/*
 * Drain the log's compressed chunk list, inflating each chunk and writing
 * the recovered samples to f, then finish the final stream.
 */
741 static void flush_gz_chunks(struct io_log *log, FILE *f)
743 struct flush_chunk_iter iter = { .chunk_sz = log->log_gz, };
744 struct flist_head *node;
747 while (!flist_empty(&log->chunk_list)) {
748 struct iolog_compress *ic;
750 node = log->chunk_list.next;
751 ic = flist_entry(node, struct iolog_compress, list);
752 flist_del(&ic->list);
753 flush_chunk(ic, log->log_offset, f, &stream, &iter);
757 finish_chunk(&stream, log->log_offset, f, &iter);
/* Stub used when zlib support is compiled out. */
764 static void flush_gz_chunks(struct io_log *log, FILE *f)
/*
 * Write the whole log (compressed chunks first, then live samples) to the
 * log's filename, using a large stdio buffer when available.
 */
770 void flush_log(struct io_log *log)
775 f = fopen(log->filename, "w");
781 buf = set_file_buffer(f);
783 flush_gz_chunks(log, f);
785 flush_samples(f, log->log, log->nr_samples, log->log_offset);
788 clear_file_buffer(buf);
/*
 * Flush one log under the per-file lock. With trylock set, a held lock
 * means "try again later"; GUI clients ship the log over the wire instead
 * of writing it locally.
 */
791 static int finish_log(struct thread_data *td, struct io_log *log, int trylock)
797 if (fio_trylock_file(log->filename))
800 fio_lock_file(log->filename);
802 if (td->client_type == FIO_CLIENT_TYPE_GUI)
803 fio_send_iolog(td, log, log->filename);
807 fio_unlock_file(log->filename);
/* Work item handed to the compression thread pool by iolog_flush(). */
814 struct iolog_flush_data {
/*
 * Thread-pool worker: deflate the captured samples into a chain of
 * GZ_CHUNK-sized iolog_compress chunks, then splice the chain onto the
 * log's chunk list under chunk_lock and signal the waiter.
 */
821 static int gz_work(struct tp_work *work)
823 struct iolog_flush_data *data;
824 struct iolog_compress *c;
825 struct flist_head list;
831 INIT_FLIST_HEAD(&list);
833 data = container_of(work, struct iolog_flush_data, work);
835 stream.zalloc = Z_NULL;
836 stream.zfree = Z_NULL;
837 stream.opaque = Z_NULL;
839 if (deflateInit(&stream, Z_DEFAULT_COMPRESSION) != Z_OK) {
840 log_err("fio: failed to init gz stream\n");
/* all chunks of one flush share a sequence number, for reassembly order */
844 seq = ++data->log->chunk_seq;
845 stream.next_in = (void *) data->samples;
846 stream.avail_in = data->nr_samples * log_entry_sz(data->log);
849 c = get_new_chunk(seq);
850 stream.avail_out = GZ_CHUNK;
851 stream.next_out = c->buf;
852 ret = deflate(&stream, Z_NO_FLUSH);
854 log_err("fio: deflate log (%d)\n", ret);
858 c->len = GZ_CHUNK - stream.avail_out;
859 flist_add_tail(&c->list, &list);
861 } while (stream.avail_in);
/* finish the stream, first into the tail of the last chunk... */
863 stream.next_out = c->buf + c->len;
864 stream.avail_out = GZ_CHUNK - c->len;
866 ret = deflate(&stream, Z_FINISH);
867 if (ret == Z_STREAM_END)
868 c->len = GZ_CHUNK - stream.avail_out;
/* ...and into fresh chunks until zlib reports stream end */
871 c = get_new_chunk(seq);
872 stream.avail_out = GZ_CHUNK;
873 stream.next_out = c->buf;
874 ret = deflate(&stream, Z_FINISH);
875 c->len = GZ_CHUNK - stream.avail_out;
876 flist_add_tail(&c->list, &list);
877 } while (ret != Z_STREAM_END);
880 ret = deflateEnd(&stream);
882 log_err("fio: deflateEnd %d\n", ret);
886 if (!flist_empty(&list)) {
887 pthread_mutex_lock(&data->log->chunk_lock);
888 flist_splice_tail(&list, &data->log->chunk_list);
889 pthread_mutex_unlock(&data->log->chunk_lock);
894 pthread_cond_signal(&work->cv);
/*
 * Queue a compression job for the log's current samples. The samples are
 * copied so logging can continue; with wait set, block on the work item's
 * condvar until gz_work marks it done.
 */
901 int iolog_flush(struct io_log *log, int wait)
903 struct thread_data *td = log->td;
904 struct iolog_flush_data *data;
907 data = malloc(sizeof(*data));
913 sample_size = log->nr_samples * log_entry_sz(log);
914 data->samples = malloc(sample_size);
915 if (!data->samples) {
920 memcpy(data->samples, log->log, sample_size);
921 data->nr_samples = log->nr_samples;
922 data->work.fn = gz_work;
926 pthread_mutex_init(&data->work.lock, NULL);
927 pthread_cond_init(&data->work.cv, NULL);
932 tp_queue_work(td->tp_data, &data->work);
935 pthread_mutex_lock(&data->work.lock);
936 while (!data->work.done)
937 pthread_cond_wait(&data->work.cv, &data->work.lock);
938 pthread_mutex_unlock(&data->work.lock);
/* Stub used when zlib support is compiled out. */
947 int iolog_flush(struct io_log *log, int wait)
/* Flush the per-thread IOPS log, honoring the trylock semantics of finish_log(). */
954 static int write_iops_log(struct thread_data *td, int try)
956 struct io_log *log = td->iops_log;
961 return finish_log(td, log, try);
/* Flush the submission-latency log via finish_log(). */
964 static int write_slat_log(struct thread_data *td, int try)
966 struct io_log *log = td->slat_log;
971 return finish_log(td, log, try);
/* Flush the completion-latency log via finish_log(). */
974 static int write_clat_log(struct thread_data *td, int try)
976 struct io_log *log = td->clat_log;
981 return finish_log(td, log, try);
/* Flush the total-latency log via finish_log(). */
984 static int write_lat_log(struct thread_data *td, int try)
986 struct io_log *log = td->lat_log;
991 return finish_log(td, log, try);
/* Flush the bandwidth log via finish_log(). */
994 static int write_bandw_log(struct thread_data *td, int try)
996 struct io_log *log = td->bw_log;
1001 return finish_log(td, log, try);
/* Per-log-type dispatch: mask bit paired with its writer function. */
1016 int (*fn)(struct thread_data *, int);
1019 static struct log_type log_types[] = {
1021 .mask = BW_LOG_MASK,
1022 .fn = write_bandw_log,
1025 .mask = LAT_LOG_MASK,
1026 .fn = write_lat_log,
1029 .mask = SLAT_LOG_MASK,
1030 .fn = write_slat_log,
1033 .mask = CLAT_LOG_MASK,
1034 .fn = write_clat_log,
1037 .mask = IOPS_LOG_MASK,
1038 .fn = write_iops_log,
/*
 * Flush every enabled log type. Each type is retried with trylock until it
 * succeeds (its mask bit is then set); the loop bails if a full pass makes
 * no progress. Runstate is bumped to TD_FINISHING for the duration.
 */
1042 void fio_writeout_logs(struct thread_data *td)
1044 unsigned int log_mask = 0;
1045 unsigned int log_left = ALL_LOG_NR;
1048 old_state = td_bump_runstate(td, TD_FINISHING);
1053 int prev_log_left = log_left;
1055 for (i = 0; i < ALL_LOG_NR && log_left; i++) {
1056 struct log_type *lt = &log_types[i];
1059 if (!(log_mask & lt->mask)) {
/* last remaining log blocks instead of trying */
1060 ret = lt->fn(td, log_left != 1);
1063 log_mask |= lt->mask;
/* no log flushed this pass: stop spinning */
1068 if (prev_log_left == log_left)
1072 td_restore_runstate(td, old_state);