From: Radha Ramachandran Date: Mon, 15 Jun 2009 06:40:16 +0000 (+0200) Subject: Add a 'continue_on_error' option to fio X-Git-Tag: fio-1.29-rc1~15 X-Git-Url: https://git.kernel.dk/?p=fio.git;a=commitdiff_plain;h=f2bba1820a567ac00b09916239ac8feb125cead2 Add a 'continue_on_error' option to fio Add option to make fio continue on non-fatal errors. Signed-off-by: Jens Axboe --- diff --git a/HOWTO b/HOWTO index 536e370b..0eab6e11 100644 --- a/HOWTO +++ b/HOWTO @@ -928,6 +928,14 @@ gtod_cpu=int Sometimes it's cheaper to dedicate a single thread of for doing these time calls will be excluded from other uses. Fio will manually clear it from the CPU mask of other jobs. +continue_on_error=bool Normally fio will exit the job on the first observed + failure. If this option is set, fio will continue the job when + there is a 'non-fatal error' (EIO or EILSEQ) until the runtime + is exceeded or the I/O size specified is completed. If this + option is used, there are two more stats that are appended, + the total error count and the first error. The error field + given in the stats is the first error that was hit during the + run. 6.0 Interpreting the output diff --git a/fio.1 b/fio.1 index b984a8cb..637304e5 100644 --- a/fio.1 +++ b/fio.1 @@ -680,6 +680,14 @@ threads/processes that run IO workloads need only copy that segment, instead of entering the kernel with a gettimeofday() call. The CPU set aside for doing these time calls will be excluded from other uses. Fio will manually clear it from the CPU mask of other jobs. +.TP +.BI continue_on_error \fR=\fPbool +Normally fio will exit the job on the first observed failure. If this option is +set, fio will continue the job when there is a 'non-fatal error' +(\fBEIO\fR or \fBEILSEQ\fR) until the runtime is exceeded or the I/O size +specified is completed. If this option is used, there are two more stats that +are appended, the total error count and the first error. 
The error field given +in the stats is the first error that was hit during the run. .SH OUTPUT While running, \fBfio\fR will display the status of the created jobs. For example: diff --git a/fio.c b/fio.c index 632b0025..e1da2c9e 100644 --- a/fio.c +++ b/fio.c @@ -372,6 +372,43 @@ static inline void update_tv_cache(struct thread_data *td) fio_gettime(&td->tv_cache, NULL); } +static int break_on_this_error(struct thread_data *td, int *retptr) +{ + int ret = *retptr; + + if (ret < 0 || td->error) { + int err; + + if (!td->o.continue_on_error) + return 1; /* option off: any error ends the I/O loop */ + + if (ret < 0) + err = -ret; + else + err = td->error; + + update_error_count(td, err); + + if (td_non_fatal_error(err)) { + /* + * Continue with the I/Os in case of + * a non fatal error. + */ + td_clear_error(td); + *retptr = 0; + return 0; + } else { + /* + * Stop the I/O in case of a fatal + * error. + */ + return 1; + } + } + + return 0; /* no error pending: keep going */ +} + /* * The main verify engine. Runs over the writes we previously submitted, * reads the blocks back in, and checks the crc/md5 of the data. 
@@ -432,9 +469,10 @@ static void do_verify(struct thread_data *td) ret = td_io_queue(td, io_u); switch (ret) { case FIO_Q_COMPLETED: - if (io_u->error) + if (io_u->error) { ret = -io_u->error; - else if (io_u->resid) { + clear_io_u(td, io_u); + } else if (io_u->resid) { int bytes = io_u->xfer_buflen - io_u->resid; struct fio_file *f = io_u->file; @@ -478,7 +516,7 @@ sync_done: break; } - if (ret < 0 || td->error) + if (break_on_this_error(td, &ret)) break; /* @@ -569,9 +607,10 @@ static void do_io(struct thread_data *td) ret = td_io_queue(td, io_u); switch (ret) { case FIO_Q_COMPLETED: - if (io_u->error) + if (io_u->error) { ret = -io_u->error; - else if (io_u->resid) { + clear_io_u(td, io_u); + } else if (io_u->resid) { int bytes = io_u->xfer_buflen - io_u->resid; struct fio_file *f = io_u->file; @@ -626,7 +665,7 @@ sync_done: break; } - if (ret < 0 || td->error) + if (break_on_this_error(td, &ret)) break; /* diff --git a/fio.h b/fio.h index 21d49a6a..477b19a4 100644 --- a/fio.h +++ b/fio.h @@ -111,6 +111,13 @@ struct thread_stat { unsigned long long io_bytes[2]; unsigned long runtime[2]; unsigned long total_run_time; + + /* + * IO Error related stats + */ + unsigned continue_on_error; + unsigned long total_err_count; + int first_error; }; struct bssplit { @@ -241,6 +248,11 @@ struct thread_options { */ unsigned int cpuload; unsigned int cpucycle; + + /* + * I/O Error handling + */ + unsigned int continue_on_error; }; #define FIO_VERROR_SIZE 128 @@ -369,6 +381,12 @@ struct thread_data { * For generating file sizes */ os_random_state_t file_size_state; + + /* + * Error counts + */ + unsigned int total_err_count; + int first_error; }; /* @@ -386,10 +404,13 @@ enum { break; \ int e = (err); \ (td)->error = e; \ - snprintf(td->verror, sizeof(td->verror) - 1, "file:%s:%d, func=%s, error=%s", __FILE__, __LINE__, (func), (msg)); \ + if (!(td)->first_error) \ + snprintf(td->verror, sizeof(td->verror) - 1, "file:%s:%d, func=%s, error=%s", __FILE__, __LINE__, (func), 
(msg)); \ } while (0) +#define td_clear_error(td) \ + ((td)->error = 0) #define td_verror(td, err, func) \ __td_verror((td), (err), strerror((err)), (func)) #define td_vmsg(td, err, msg, func) \ @@ -425,6 +446,15 @@ static inline void fio_ro_check(struct thread_data *td, struct io_u *io_u) #define MAX_JOBS (1024) +#define td_non_fatal_error(e) ((e) == EIO || (e) == EILSEQ) /* e is a positive errno value */ + +static inline void update_error_count(struct thread_data *td, int err) +{ + td->total_err_count++; + if (td->total_err_count == 1) + td->first_error = err; +} + static inline int should_fsync(struct thread_data *td) { if (td->last_was_sync) diff --git a/io_u.c b/io_u.c index 34ab58a1..276f3b0c 100644 --- a/io_u.c +++ b/io_u.c @@ -412,6 +412,12 @@ void put_io_u(struct thread_data *td, struct io_u *io_u) td->cur_depth--; } +void clear_io_u(struct thread_data *td, struct io_u *io_u) +{ + io_u->flags &= ~IO_U_F_FLIGHT; + put_io_u(td, io_u); +} + void requeue_io_u(struct thread_data *td, struct io_u **io_u) { struct io_u *__io_u = *io_u; @@ -994,6 +1000,17 @@ static void io_completed(struct thread_data *td, struct io_u *io_u, icd->error = io_u->error; io_u_log_error(td, io_u); } + if (td->o.continue_on_error && icd->error && + td_non_fatal_error(icd->error)) { + /* + * If there is a non_fatal error, then add to the error count + * and clear all the errors. 
+ */ + update_error_count(td, icd->error); + td_clear_error(td); + icd->error = 0; + io_u->error = 0; + } } static void init_icd(struct thread_data *td, struct io_completion_data *icd, diff --git a/ioengine.h b/ioengine.h index 9c0ed9a9..6190977d 100644 --- a/ioengine.h +++ b/ioengine.h @@ -139,6 +139,7 @@ extern void close_ioengine(struct thread_data *); extern struct io_u *__get_io_u(struct thread_data *); extern struct io_u *get_io_u(struct thread_data *); extern void put_io_u(struct thread_data *, struct io_u *); +extern void clear_io_u(struct thread_data *, struct io_u *); extern void requeue_io_u(struct thread_data *, struct io_u **); extern int __must_check io_u_sync_complete(struct thread_data *, struct io_u *, unsigned long *); extern int __must_check io_u_queued_complete(struct thread_data *, int, unsigned long *); diff --git a/options.c b/options.c index b65def9a..9606ab20 100644 --- a/options.c +++ b/options.c @@ -1504,6 +1504,13 @@ static struct fio_option options[] = { .cb = str_gtod_cpu_cb, .help = "Setup dedicated gettimeofday() thread on this CPU", }, + { + .name = "continue_on_error", + .type = FIO_OPT_BOOL, + .off1 = td_var_offset(continue_on_error), + .help = "Continue on non-fatal errors during I/O", + .def = "0", + }, { .name = NULL, }, diff --git a/stat.c b/stat.c index 977796cd..ec87deba 100644 --- a/stat.c +++ b/stat.c @@ -335,6 +335,10 @@ static void show_thread_status(struct thread_stat *ts, stat_calc_lat_u(ts, io_u_lat_u); stat_calc_lat_m(ts, io_u_lat_m); show_latencies(io_u_lat_u, io_u_lat_m); + if (ts->continue_on_error) { + log_info(" errors: total=%lu, first_error=%d\n", + ts->total_err_count, ts->first_error); + } } static void show_ddir_status_terse(struct thread_stat *ts, @@ -410,6 +414,8 @@ static void show_thread_status_terse(struct thread_stat *ts, log_info(";%3.2f%%", io_u_lat_u[i]); for (i = 0; i < FIO_IO_U_LAT_M_NR; i++) log_info(";%3.2f%%", io_u_lat_m[i]); + if (ts->continue_on_error) + log_info(";%lu;%d", 
ts->total_err_count, ts->first_error); log_info("\n"); if (ts->description) @@ -523,9 +529,18 @@ void show_run_stats(void) ts->pid = td->pid; } - if (td->error && !ts->error) { - ts->error = td->error; - ts->verror = td->verror; + ts->continue_on_error = td->o.continue_on_error; + ts->total_err_count += td->total_err_count; + ts->first_error = td->first_error; + if (!ts->error) { + if (!td->error && td->o.continue_on_error && + td->first_error) { + ts->error = td->first_error; + ts->verror = td->verror; + } else if (td->error) { + ts->error = td->error; + ts->verror = td->verror; + } } for (l = 0; l <= DDIR_WRITE; l++) {