list_add(&io_u->list, &td->io_u_freelist);
}
+ io_u_init_timeout();
+
return 0;
}
}
fio_gettime(&td->epoch, NULL);
+ memcpy(&td->timeout_end, &td->epoch, sizeof(td->epoch));
getrusage(RUSAGE_SELF, &td->ts.ru_start);
runtime[0] = runtime[1] = 0;
* check if someone quit or got killed in an unusual way
*/
ret = waitpid(td->pid, &status, WNOHANG);
- if (ret < 0)
+ if (ret < 0) {
+ if (errno == ECHILD) {
+ log_err("fio: pid=%d disappeared\n", td->pid);
+ td_set_runstate(td, TD_REAPED);
+ goto reaped;
+ }
perror("waitpid");
- else if ((ret == td->pid) && WIFSIGNALED(status)) {
+ } else if ((ret == td->pid) && WIFSIGNALED(status)) {
int sig = WTERMSIG(status);
log_err("fio: pid=%d, got signal=%d\n", td->pid, sig);
int status;
ret = waitpid(td->pid, &status, 0);
- if (ret < 0)
+ if (ret < 0) {
+ if (errno == ECHILD) {
+ log_err("fio: pid=%d disappeared\n", td->pid);
+ td_set_runstate(td, TD_REAPED);
+ goto reaped;
+ }
perror("waitpid");
- else if (WIFEXITED(status) && WEXITSTATUS(status)) {
+ } else if (WIFEXITED(status) && WEXITSTATUS(status)) {
if (!exit_value)
exit_value++;
}
*/
struct list_head io_hist_list;
struct list_head io_log_list;
+
+ /*
+ * timeout handling
+ */
+ struct timeval timeout_end;
+ struct itimerval timer;
};
+/*
+ * 30 second per-io_u timeout, with 5 second intervals to avoid resetting
+ * the timer on each queue operation.
+ */
+/* Values are in seconds; IO_U_TIMEOUT_INC is slack added on top of the
+ * base timeout when arming the one-shot ITIMER_REAL timer. */
+#define IO_U_TIMEOUT_INC 5
+#define IO_U_TIMEOUT 30
+
#define __td_verror(td, err, msg) \
do { \
if ((td)->error) \
extern long __must_check io_u_sync_complete(struct thread_data *, struct io_u *, endio_handler *);
extern long __must_check io_u_queued_complete(struct thread_data *, int, endio_handler *);
extern void io_u_queued(struct thread_data *, struct io_u *);
+extern void io_u_init_timeout(void);
+extern void io_u_set_timeout(struct thread_data *);
/*
* io engine entry points
slat_time = mtime_since(&io_u->start_time, &io_u->issue_time);
add_slat_sample(td, io_u->ddir, slat_time);
}
+
+/*
+ * Arm the per-io_u watchdog timer for this job. Must only be called with
+ * at least one io_u in flight (asserted below). The timer is one-shot
+ * (zero interval) and expires after IO_U_TIMEOUT + IO_U_TIMEOUT_INC
+ * seconds; td->timeout_end records when it was last armed so callers can
+ * avoid re-arming on every queue operation.
+ *
+ * NOTE(review): the setitimer() return value is ignored — presumably it
+ * cannot fail with these arguments, but worth confirming.
+ */
+void io_u_set_timeout(struct thread_data *td)
+{
+ assert(td->cur_depth);
+
+ /* one-shot timer: no repeat interval, single expiry */
+ td->timer.it_interval.tv_sec = 0;
+ td->timer.it_interval.tv_usec = 0;
+ td->timer.it_value.tv_sec = IO_U_TIMEOUT + IO_U_TIMEOUT_INC;
+ td->timer.it_value.tv_usec = 0;
+ setitimer(ITIMER_REAL, &td->timer, NULL);
+ fio_gettime(&td->timeout_end, NULL);
+}
+
+/*
+ * SIGALRM handler fired when the io_u watchdog armed by io_u_set_timeout()
+ * expires, i.e. queued I/O has not completed within the timeout window.
+ * Locates the thread_data belonging to the current process by matching
+ * getpid() against each job's pid (each job apparently runs in its own
+ * process — confirm against the job-spawn code), flags the job with
+ * ETIMEDOUT and terminates it.
+ *
+ * NOTE(review): log_err() and exit() are not on the POSIX
+ * async-signal-safe function list; calling them from a signal handler is
+ * technically undefined. Acceptable here since we are aborting anyway,
+ * but worth noting.
+ */
+static void io_u_timeout_handler(int fio_unused sig)
+{
+ struct thread_data *td, *__td;
+ pid_t pid = getpid();
+ int i;
+
+ log_err("fio: io_u timeout\n");
+
+ /*
+ * TLS would be nice...
+ */
+ td = NULL;
+ for_each_td(__td, i) {
+ if (__td->pid == pid) {
+ td = __td;
+ break;
+ }
+ }
+
+ /* No job owns this pid — should not happen; bail hard. */
+ if (!td) {
+ log_err("fio: io_u timeout, can't find job\n");
+ exit(1);
+ }
+
+ /*
+ * Timer fired with nothing in flight — stale alarm; warn and resume
+ * rather than killing a healthy job.
+ */
+ if (!td->cur_depth) {
+ log_err("fio: timeout without pending work?\n");
+ return;
+ }
+
+ log_err("fio: io_u timeout: job=%s, pid=%d\n", td->name, td->pid);
+ td->error = ETIMEDOUT;
+ exit(1);
+}
+
+/*
+ * Install the SIGALRM handler backing the per-io_u timeout machinery.
+ * Called once per job before I/O starts (see the io_u_init_timeout()
+ * call site added earlier in this patch).
+ *
+ * NOTE(review): uses signal() rather than sigaction(); signal() semantics
+ * (handler reset, syscall restart) vary across platforms — sigaction()
+ * with an explicit sa_flags would be more portable.
+ */
+void io_u_init_timeout(void)
+{
+ signal(SIGALRM, io_u_timeout_handler);
+}
assert((io_u->flags & IO_U_F_FLIGHT) == 0);
io_u->flags |= IO_U_F_FLIGHT;
- if (td->io_ops->flags & FIO_SYNCIO)
+ if (td->io_ops->flags & FIO_SYNCIO) {
fio_gettime(&io_u->issue_time, NULL);
+ /*
+ * for a sync engine, set the timeout upfront
+ */
+ if (mtime_since(&td->timeout_end, &io_u->issue_time) < IO_U_TIMEOUT)
+ io_u_set_timeout(td);
+ }
+
if (io_u->ddir != DDIR_SYNC)
td->io_issues[io_u->ddir]++;
ret = td->io_ops->queue(td, io_u);
- if ((td->io_ops->flags & FIO_SYNCIO) == 0)
+ if ((td->io_ops->flags & FIO_SYNCIO) == 0) {
fio_gettime(&io_u->issue_time, NULL);
+ /*
+ * async engine, set the timeout here
+ */
+ if (ret == FIO_Q_QUEUED &&
+ mtime_since(&td->timeout_end, &io_u->issue_time) < IO_U_TIMEOUT)
+ io_u_set_timeout(td);
+ }
+
return ret;
}