Autodetect cgroup blkio mount point
[fio.git] / fio.c
diff --git a/fio.c b/fio.c
index 61aabb301d35e4abacbd070f9b528122d8fd3de8..7d4c96a2b8a5045cda742f4be7842deb66529637 100644 (file)
--- a/fio.c
+++ b/fio.c
 #include "smalloc.h"
 #include "verify.h"
 #include "diskutil.h"
+#include "cgroup.h"
 
 unsigned long page_mask;
 unsigned long page_size;
-#define ALIGN(buf)     \
+
+#define PAGE_ALIGN(buf)        \
        (char *) (((unsigned long) (buf) + page_mask) & ~page_mask)
 
 int groupid = 0;
@@ -59,6 +61,8 @@ static volatile int fio_abort;
 static int exit_value;
 static struct itimerval itimer;
 static pthread_t gtod_thread;
+static struct flist_head *cgroup_list;
+static char *cgroup_mnt;
 
 struct io_log *agg_io_log[2];
 
@@ -222,7 +226,7 @@ static int __check_min_rate(struct thread_data *td, struct timeval *now,
                                if (rate < ratemin ||
                                    bytes < td->rate_bytes[ddir]) {
                                        log_err("%s: min rate %u not met, got"
-                                               " %luKiB/sec\n", td->o.name,
+                                               " %luKB/sec\n", td->o.name,
                                                        ratemin, rate);
                                        return 1;
                                }
@@ -372,6 +376,51 @@ static inline void update_tv_cache(struct thread_data *td)
                fio_gettime(&td->tv_cache, NULL);
 }
 
+static int break_on_this_error(struct thread_data *td, int *retptr)
+{
+       int ret = *retptr;
+
+       if (ret < 0 || td->error) {
+               int err;
+
+               if (!td->o.continue_on_error)
+                       return 1;
+
+               if (ret < 0)
+                       err = -ret;
+               else
+                       err = td->error;
+
+               if (td_non_fatal_error(err)) {
+                       /*
+                        * Continue with the I/Os in case of
+                        * a non fatal error.
+                        */
+                       update_error_count(td, err);
+                       td_clear_error(td);
+                       *retptr = 0;
+                       return 0;
+               } else if (td->o.fill_device && err == ENOSPC) {
+                       /*
+                        * We expect to hit this error if
+                        * fill_device option is set.
+                        */
+                       td_clear_error(td);
+                       td->terminate = 1;
+                       return 1;
+               } else {
+                       /*
+                        * Stop the I/O in case of a fatal
+                        * error.
+                        */
+                       update_error_count(td, err);
+                       return 1;
+               }
+       }
+
+       return 0;
+}
+
 /*
  * The main verify engine. Runs over the writes we previously submitted,
  * reads the blocks back in, and checks the crc/md5 of the data.
@@ -405,18 +454,17 @@ static void do_verify(struct thread_data *td)
        while (!td->terminate) {
                int ret2, full;
 
-               io_u = __get_io_u(td);
-               if (!io_u)
-                       break;
-
                update_tv_cache(td);
 
                if (runtime_exceeded(td, &td->tv_cache)) {
-                       put_io_u(td, io_u);
                        td->terminate = 1;
                        break;
                }
 
+               io_u = __get_io_u(td);
+               if (!io_u)
+                       break;
+
                if (get_next_verify(td, io_u)) {
                        put_io_u(td, io_u);
                        break;
@@ -427,14 +475,18 @@ static void do_verify(struct thread_data *td)
                        break;
                }
 
-               io_u->end_io = verify_io_u;
+               if (td->o.verify_async)
+                       io_u->end_io = verify_io_u_async;
+               else
+                       io_u->end_io = verify_io_u;
 
                ret = td_io_queue(td, io_u);
                switch (ret) {
                case FIO_Q_COMPLETED:
-                       if (io_u->error)
+                       if (io_u->error) {
                                ret = -io_u->error;
-                       else if (io_u->resid) {
+                               clear_io_u(td, io_u);
+                       } else if (io_u->resid) {
                                int bytes = io_u->xfer_buflen - io_u->resid;
                                struct fio_file *f = io_u->file;
 
@@ -478,7 +530,7 @@ sync_done:
                        break;
                }
 
-               if (ret < 0 || td->error)
+               if (break_on_this_error(td, &ret))
                        break;
 
                /*
@@ -487,7 +539,8 @@ sync_done:
                 */
                full = queue_full(td) || ret == FIO_Q_BUSY;
                if (full || !td->o.iodepth_batch_complete) {
-                       min_events = td->o.iodepth_batch_complete;
+                       min_events = min(td->o.iodepth_batch_complete,
+                                        td->cur_depth);
                        if (full && !min_events)
                                min_events = 1;
 
@@ -532,7 +585,8 @@ static void do_io(struct thread_data *td)
        else
                td_set_runstate(td, TD_RUNNING);
 
-       while ((td->this_io_bytes[0] + td->this_io_bytes[1]) < td->o.size) {
+       while ( (td->o.read_iolog_file && !flist_empty(&td->io_log_list)) ||
+               ((td->this_io_bytes[0] + td->this_io_bytes[1]) < td->o.size) ) {
                struct timeval comp_time;
                unsigned long bytes_done[2] = { 0, 0 };
                int min_evts = 0;
@@ -542,24 +596,27 @@ static void do_io(struct thread_data *td)
                if (td->terminate)
                        break;
 
-               io_u = get_io_u(td);
-               if (!io_u)
-                       break;
-
                update_tv_cache(td);
 
                if (runtime_exceeded(td, &td->tv_cache)) {
-                       put_io_u(td, io_u);
                        td->terminate = 1;
                        break;
                }
 
+               io_u = get_io_u(td);
+               if (!io_u)
+                       break;
+
                /*
                 * Add verification end_io handler, if asked to verify
                 * a previously written file.
                 */
-               if (td->o.verify != VERIFY_NONE && io_u->ddir == DDIR_READ) {
-                       io_u->end_io = verify_io_u;
+               if (td->o.verify != VERIFY_NONE && io_u->ddir == DDIR_READ &&
+                   !td_rw(td)) {
+                       if (td->o.verify_async)
+                               io_u->end_io = verify_io_u_async;
+                       else
+                               io_u->end_io = verify_io_u;
                        td_set_runstate(td, TD_VERIFYING);
                } else if (in_ramp_time(td))
                        td_set_runstate(td, TD_RAMP);
@@ -569,9 +626,10 @@ static void do_io(struct thread_data *td)
                ret = td_io_queue(td, io_u);
                switch (ret) {
                case FIO_Q_COMPLETED:
-                       if (io_u->error)
+                       if (io_u->error) {
                                ret = -io_u->error;
-                       else if (io_u->resid) {
+                               clear_io_u(td, io_u);
+                       } else if (io_u->resid) {
                                int bytes = io_u->xfer_buflen - io_u->resid;
                                struct fio_file *f = io_u->file;
 
@@ -626,7 +684,7 @@ sync_done:
                        break;
                }
 
-               if (ret < 0 || td->error)
+               if (break_on_this_error(td, &ret))
                        break;
 
                /*
@@ -634,7 +692,8 @@ sync_done:
                 */
                full = queue_full(td) || ret == FIO_Q_BUSY;
                if (full || !td->o.iodepth_batch_complete) {
-                       min_evts = td->o.iodepth_batch_complete;
+                       min_evts = min(td->o.iodepth_batch_complete,
+                                       td->cur_depth);
                        if (full && !min_evts)
                                min_evts = 1;
 
@@ -753,8 +812,8 @@ static int init_io_u(struct thread_data *td)
        if (allocate_io_mem(td))
                return 1;
 
-       if (td->o.odirect)
-               p = ALIGN(td->orig_buffer);
+       if (td->o.odirect || td->o.mem_align)
+               p = PAGE_ALIGN(td->orig_buffer) + td->o.mem_align;
        else
                p = td->orig_buffer;
 
@@ -774,9 +833,11 @@ static int init_io_u(struct thread_data *td)
                io_u = ptr;
                memset(io_u, 0, sizeof(*io_u));
                INIT_FLIST_HEAD(&io_u->list);
+               dprint(FD_MEM, "io_u alloc %p, index %u\n", io_u, i);
 
                if (!(td->io_ops->flags & FIO_NOIO)) {
                        io_u->buf = p + max_bs * i;
+                       dprint(FD_MEM, "io_u %p, mem %p\n", io_u, io_u->buf);
 
                        if (td_write(td) && !td->o.refill_buffers)
                                io_u_fill_buffer(td, io_u, max_bs);
@@ -944,6 +1005,7 @@ static void *thread_main(void *data)
 {
        unsigned long long runtime[2], elapsed;
        struct thread_data *td = data;
+       pthread_condattr_t attr;
        int clear_state;
 
        if (!td->o.use_thread)
@@ -958,8 +1020,14 @@ static void *thread_main(void *data)
        INIT_FLIST_HEAD(&td->io_u_requeues);
        INIT_FLIST_HEAD(&td->io_log_list);
        INIT_FLIST_HEAD(&td->io_hist_list);
+       INIT_FLIST_HEAD(&td->verify_list);
+       pthread_mutex_init(&td->io_u_lock, NULL);
        td->io_hist_tree = RB_ROOT;
 
+       pthread_condattr_init(&attr);
+       pthread_cond_init(&td->verify_cond, &attr);
+       pthread_cond_init(&td->free_cond, &attr);
+
        td_set_runstate(td, TD_INITIALIZED);
        dprint(FD_MUTEX, "up startup_mutex\n");
        fio_mutex_up(startup_mutex);
@@ -983,7 +1051,10 @@ static void *thread_main(void *data)
        if (init_io_u(td))
                goto err;
 
-       if (td->o.cpumask_set && fio_setaffinity(td) == -1) {
+       if (td->o.verify_async && verify_async_init(td))
+               goto err;
+
+       if (td->o.cpumask_set && fio_setaffinity(td->pid, td->o.cpumask) == -1) {
                td_verror(td, errno, "cpu_set_affinity");
                goto err;
        }
@@ -994,7 +1065,7 @@ static void *thread_main(void *data)
         */
        if (td->o.gtod_cpu) {
                fio_cpu_clear(&td->o.cpumask, td->o.gtod_cpu);
-               if (fio_setaffinity(td) == -1) {
+               if (fio_setaffinity(td->pid, td->o.cpumask) == -1) {
                        td_verror(td, errno, "cpu_set_affinity");
                        goto err;
                }
@@ -1007,6 +1078,9 @@ static void *thread_main(void *data)
                }
        }
 
+       if (td->o.cgroup_weight && cgroup_setup(td, cgroup_list, &cgroup_mnt))
+               goto err;
+
        if (nice(td->o.nice) == -1) {
                td_verror(td, errno, "nice");
                goto err;
@@ -1041,7 +1115,10 @@ static void *thread_main(void *data)
        clear_state = 0;
        while (keep_running(td)) {
                fio_gettime(&td->start, NULL);
-               memcpy(&td->ts.stat_sample_time, &td->start, sizeof(td->start));
+               memcpy(&td->ts.stat_sample_time[0], &td->start,
+                               sizeof(td->start));
+               memcpy(&td->ts.stat_sample_time[1], &td->start,
+                               sizeof(td->start));
                memcpy(&td->tv_cache, &td->start, sizeof(td->start));
 
                if (td->o.ratemin[0] || td->o.ratemin[1])
@@ -1126,9 +1203,14 @@ err:
        if (td->error)
                printf("fio: pid=%d, err=%d/%s\n", (int) td->pid, td->error,
                                                        td->verror);
+
+       if (td->o.verify_async)
+               verify_async_exit(td);
+
        close_and_free_files(td);
        close_ioengine(td);
        cleanup_io_u(td);
+       cgroup_shutdown(td, &cgroup_mnt);
 
        if (td->o.cpumask_set) {
                int ret = fio_cpuset_exit(&td->o.cpumask);
@@ -1374,8 +1456,6 @@ static void run_threads(void)
                        for_each_file(td, f, i) {
                                if (fio_file_open(f))
                                        td_io_close_file(td, f);
-                               else
-                                       assert(f->fd == -1);
                        }
                }
 
@@ -1454,7 +1534,13 @@ static void run_threads(void)
                                        *fio_debug_jobp = pid;
                        }
                        dprint(FD_MUTEX, "wait on startup_mutex\n");
-                       fio_mutex_down(startup_mutex);
+                       if (fio_mutex_down_timeout(startup_mutex, 10)) {
+                               log_err("fio: job startup hung? exiting.\n");
+                               terminate_threads(TERMINATE_ALL);
+                               fio_abort = 1;
+                               nr_started--;
+                               break;
+                       }
                        dprint(FD_MUTEX, "done waiting on startup_mutex\n");
                }
 
@@ -1544,12 +1630,6 @@ int main(int argc, char *argv[])
        if (!getenv("LC_NUMERIC"))
                setlocale(LC_NUMERIC, "en_US");
 
-       if (parse_options(argc, argv))
-               return 1;
-
-       if (!thread_number)
-               return 0;
-
        ps = sysconf(_SC_PAGESIZE);
        if (ps < 0) {
                log_err("Failed to get page size\n");
@@ -1559,6 +1639,14 @@ int main(int argc, char *argv[])
        page_size = ps;
        page_mask = ps - 1;
 
+       fio_keywords_init();
+
+       if (parse_options(argc, argv))
+               return 1;
+
+       if (!thread_number)
+               return 0;
+
        if (write_bw_log) {
                setup_log(&agg_io_log[DDIR_READ]);
                setup_log(&agg_io_log[DDIR_WRITE]);
@@ -1571,6 +1659,9 @@ int main(int argc, char *argv[])
 
        status_timer_arm();
 
+       cgroup_list = smalloc(sizeof(*cgroup_list));
+       INIT_FLIST_HEAD(cgroup_list);
+
        run_threads();
 
        if (!fio_abort) {
@@ -1582,6 +1673,11 @@ int main(int argc, char *argv[])
                }
        }
 
+       cgroup_kill(cgroup_list);
+       sfree(cgroup_list);
+       if (cgroup_mnt)
+               sfree(cgroup_mnt);
+
        fio_mutex_remove(startup_mutex);
        fio_mutex_remove(writeout_mutex);
        return exit_value;