summary |
shortlog |
log |
commit | commitdiff |
tree
raw |
patch |
inline | side by side (from parent 1:
9896178)
Add option to make fio continue on non-fatal errors.
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
for doing these time calls will be excluded from other
uses. Fio will manually clear it from the CPU mask of other
jobs.
for doing these time calls will be excluded from other
uses. Fio will manually clear it from the CPU mask of other
jobs.
+continue_on_error=bool Normally fio will exit the job on the first observed
+ failure. If this option is set, fio will continue the job when
+ there is a 'non-fatal error' (EIO or EILSEQ) until the runtime
+ is exceeded or the I/O size specified is completed. If this
+ option is used, there are two more stats that are appended,
+ the total error count and the first error. The error field
+ given in the stats is the first error that was hit during the
+ run.
6.0 Interpreting the output
6.0 Interpreting the output
entering the kernel with a gettimeofday() call. The CPU set aside for doing
these time calls will be excluded from other uses. Fio will manually clear it
from the CPU mask of other jobs.
entering the kernel with a gettimeofday() call. The CPU set aside for doing
these time calls will be excluded from other uses. Fio will manually clear it
from the CPU mask of other jobs.
+.TP
+.BI continue_on_error \fR=\fPbool
+Normally fio will exit the job on the first observed failure. If this option is
+set, fio will continue the job when there is a 'non-fatal error'
+(\fBEIO\fR or \fBEILSEQ\fR) until the runtime is exceeded or the I/O size
+specified is completed. If this option is used, there are two more stats that
+are appended, the total error count and the first error. The error field given
+in the stats is the first error that was hit during the run.
.SH OUTPUT
While running, \fBfio\fR will display the status of the created jobs. For
example:
.SH OUTPUT
While running, \fBfio\fR will display the status of the created jobs. For
example:
fio_gettime(&td->tv_cache, NULL);
}
fio_gettime(&td->tv_cache, NULL);
}
+/*
+ * Decide whether the caller should stop issuing I/O after an error.
+ *
+ * An error is present when *retptr is negative or td->error is set.
+ * Returns 1 if the caller should break out of its loop, 0 if it may
+ * keep going. When continue_on_error is set the error is counted, and
+ * for a non-fatal error td->error is cleared and *retptr is reset to 0
+ * so the caller sees a clean state.
+ */
+static int break_on_this_error(struct thread_data *td, int *retptr)
+{
+	int ret = *retptr;
+
+	if (ret < 0 || td->error) {
+		int err;
+
+		/*
+		 * Without continue_on_error every error stops the
+		 * I/O, matching the plain (ret < 0 || td->error)
+		 * check this helper replaces.
+		 */
+		if (!td->o.continue_on_error)
+			return 1;
+
+		/* Normalise to a positive error number. */
+		if (ret < 0)
+			err = -ret;
+		else
+			err = td->error;
+
+		update_error_count(td, err);
+
+		if (td_non_fatal_error(err)) {
+			/*
+			 * Continue with the I/Os in case of
+			 * a non fatal error.
+			 */
+			td_clear_error(td);
+			*retptr = 0;
+			return 0;
+		} else {
+			/*
+			 * Stop the I/O in case of a fatal
+			 * error.
+			 */
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
/*
* The main verify engine. Runs over the writes we previously submitted,
* reads the blocks back in, and checks the crc/md5 of the data.
/*
* The main verify engine. Runs over the writes we previously submitted,
* reads the blocks back in, and checks the crc/md5 of the data.
ret = td_io_queue(td, io_u);
switch (ret) {
case FIO_Q_COMPLETED:
ret = td_io_queue(td, io_u);
switch (ret) {
case FIO_Q_COMPLETED:
- else if (io_u->resid) {
+ clear_io_u(td, io_u);
+ } else if (io_u->resid) {
int bytes = io_u->xfer_buflen - io_u->resid;
struct fio_file *f = io_u->file;
int bytes = io_u->xfer_buflen - io_u->resid;
struct fio_file *f = io_u->file;
- if (ret < 0 || td->error)
+ if (break_on_this_error(td, &ret))
ret = td_io_queue(td, io_u);
switch (ret) {
case FIO_Q_COMPLETED:
ret = td_io_queue(td, io_u);
switch (ret) {
case FIO_Q_COMPLETED:
- else if (io_u->resid) {
+ clear_io_u(td, io_u);
+ } else if (io_u->resid) {
int bytes = io_u->xfer_buflen - io_u->resid;
struct fio_file *f = io_u->file;
int bytes = io_u->xfer_buflen - io_u->resid;
struct fio_file *f = io_u->file;
- if (ret < 0 || td->error)
+ if (break_on_this_error(td, &ret))
unsigned long long io_bytes[2];
unsigned long runtime[2];
unsigned long total_run_time;
unsigned long long io_bytes[2];
unsigned long runtime[2];
unsigned long total_run_time;
+
+ /*
+ * IO Error related stats
+ */
+ unsigned continue_on_error;
+ unsigned long total_err_count;
+ int first_error;
*/
unsigned int cpuload;
unsigned int cpucycle;
*/
unsigned int cpuload;
unsigned int cpucycle;
+
+ /*
+ * I/O Error handling
+ */
+ unsigned int continue_on_error;
};
#define FIO_VERROR_SIZE 128
};
#define FIO_VERROR_SIZE 128
* For generating file sizes
*/
os_random_state_t file_size_state;
* For generating file sizes
*/
os_random_state_t file_size_state;
+
+ /*
+ * Error counts
+ */
+ unsigned int total_err_count;
+ int first_error;
break; \
int e = (err); \
(td)->error = e; \
break; \
int e = (err); \
(td)->error = e; \
- snprintf(td->verror, sizeof(td->verror) - 1, "file:%s:%d, func=%s, error=%s", __FILE__, __LINE__, (func), (msg)); \
+ if (!(td)->first_error) \
+ snprintf(td->verror, sizeof(td->verror) - 1, "file:%s:%d, func=%s, error=%s", __FILE__, __LINE__, (func), (msg)); \
+#define td_clear_error(td) \
+ (td)->error = 0;
#define td_verror(td, err, func) \
__td_verror((td), (err), strerror((err)), (func))
#define td_vmsg(td, err, msg, func) \
#define td_verror(td, err, func) \
__td_verror((td), (err), strerror((err)), (func))
#define td_vmsg(td, err, msg, func) \
+/*
+ * True when error number e is one fio treats as non-fatal: EIO or
+ * EILSEQ. Expects a positive errno value.
+ */
+#define td_non_fatal_error(e)	((e) == EIO || (e) == EILSEQ)
+
+/*
+ * Account for one more error on this thread. The error number of the
+ * very first error seen is kept in td->first_error.
+ */
+static inline void update_error_count(struct thread_data *td, int err)
+{
+	const int is_first = (td->total_err_count == 0);
+
+	td->total_err_count++;
+	if (is_first)
+		td->first_error = err;
+}
+
static inline int should_fsync(struct thread_data *td)
{
if (td->last_was_sync)
static inline int should_fsync(struct thread_data *td)
{
if (td->last_was_sync)
+/*
+ * Clear the IO_U_F_FLIGHT flag on io_u and hand it to put_io_u().
+ */
+void clear_io_u(struct thread_data *td, struct io_u *io_u)
+{
+ io_u->flags &= ~IO_U_F_FLIGHT;
+ put_io_u(td, io_u);
+}
+
void requeue_io_u(struct thread_data *td, struct io_u **io_u)
{
struct io_u *__io_u = *io_u;
void requeue_io_u(struct thread_data *td, struct io_u **io_u)
{
struct io_u *__io_u = *io_u;
icd->error = io_u->error;
io_u_log_error(td, io_u);
}
icd->error = io_u->error;
io_u_log_error(td, io_u);
}
+ if (td->o.continue_on_error && icd->error &&
+ td_non_fatal_error(icd->error)) {
+ /*
+ * If there is a non_fatal error, then add to the error count
+ * and clear all the errors.
+ */
+ update_error_count(td, icd->error);
+ td_clear_error(td);
+ icd->error = 0;
+ io_u->error = 0;
+ }
}
static void init_icd(struct thread_data *td, struct io_completion_data *icd,
}
static void init_icd(struct thread_data *td, struct io_completion_data *icd,
extern struct io_u *__get_io_u(struct thread_data *);
extern struct io_u *get_io_u(struct thread_data *);
extern void put_io_u(struct thread_data *, struct io_u *);
extern struct io_u *__get_io_u(struct thread_data *);
extern struct io_u *get_io_u(struct thread_data *);
extern void put_io_u(struct thread_data *, struct io_u *);
+extern void clear_io_u(struct thread_data *, struct io_u *);
extern void requeue_io_u(struct thread_data *, struct io_u **);
extern int __must_check io_u_sync_complete(struct thread_data *, struct io_u *, unsigned long *);
extern int __must_check io_u_queued_complete(struct thread_data *, int, unsigned long *);
extern void requeue_io_u(struct thread_data *, struct io_u **);
extern int __must_check io_u_sync_complete(struct thread_data *, struct io_u *, unsigned long *);
extern int __must_check io_u_queued_complete(struct thread_data *, int, unsigned long *);
.cb = str_gtod_cpu_cb,
.help = "Setup dedicated gettimeofday() thread on this CPU",
},
.cb = str_gtod_cpu_cb,
.help = "Setup dedicated gettimeofday() thread on this CPU",
},
+ {
+ .name = "continue_on_error",
+ .type = FIO_OPT_BOOL,
+ .off1 = td_var_offset(continue_on_error),
+ .help = "Continue on non-fatal errors during I/O",
+ .def = "0",
+ },
stat_calc_lat_u(ts, io_u_lat_u);
stat_calc_lat_m(ts, io_u_lat_m);
show_latencies(io_u_lat_u, io_u_lat_m);
stat_calc_lat_u(ts, io_u_lat_u);
stat_calc_lat_m(ts, io_u_lat_m);
show_latencies(io_u_lat_u, io_u_lat_m);
+ if (ts->continue_on_error) {
+ log_info(" errors: total=%lu, first_error=%d\n",
+ ts->total_err_count, ts->first_error);
+ }
}
static void show_ddir_status_terse(struct thread_stat *ts,
}
static void show_ddir_status_terse(struct thread_stat *ts,
log_info(";%3.2f%%", io_u_lat_u[i]);
for (i = 0; i < FIO_IO_U_LAT_M_NR; i++)
log_info(";%3.2f%%", io_u_lat_m[i]);
log_info(";%3.2f%%", io_u_lat_u[i]);
for (i = 0; i < FIO_IO_U_LAT_M_NR; i++)
log_info(";%3.2f%%", io_u_lat_m[i]);
+ if (ts->continue_on_error)
+ log_info(";%lu;%d", ts->total_err_count, ts->first_error);
log_info("\n");
if (ts->description)
log_info("\n");
if (ts->description)
- if (td->error && !ts->error) {
- ts->error = td->error;
- ts->verror = td->verror;
+ ts->continue_on_error = td->o.continue_on_error;
+ ts->total_err_count += td->total_err_count;
+ ts->first_error = td->first_error;
+ if (!ts->error) {
+ if (!td->error && td->o.continue_on_error &&
+ td->first_error) {
+ ts->error = td->first_error;
+ ts->verror = td->verror;
+ } else if (td->error) {
+ ts->error = td->error;
+ ts->verror = td->verror;
+ }
}
for (l = 0; l <= DDIR_WRITE; l++) {
}
for (l = 0; l <= DDIR_WRITE; l++) {