Implement new Rate Control
author Dave Glen <dglen@micron.com>
Mon, 10 Aug 2015 18:47:53 +0000 (12:47 -0600)
committer Dave Glen <dglen@micron.com>
Mon, 10 Aug 2015 18:47:53 +0000 (12:47 -0600)
The current implementation of rate control has the potential for bursts
and stalls when iodepth > 1, due to the feedback delay while IO is
pending. More description here:
https://docs.google.com/drawings/d/1EG-9eGlNvw-m9m0wMSb_C_lyJcaPlhEFbkHRsVpt4wo/edit?usp=sharing

This commit changes the rate control mechanism to use feed-forward IO
issue accounting for rate feedback. Moving rate control to submissions
instead of completions eliminates the feedback delay. More details on
the change here:
https://docs.google.com/drawings/d/1NphdZGjYGuOLWJzvXHv44zuy0nnSunvFrROCVfA67Y8/edit?usp=sharing
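
In essence, the pacer derives the next allowed issue time directly from
the bytes issued so far, with no dependency on completions. A minimal
standalone sketch of the idea (hypothetical names, not fio code):

#include <stdint.h>

/*
 * Feed-forward pacing sketch: given the bytes already issued and a
 * target rate in bytes/sec, return the usec offset (from job start)
 * at which the next IO may be issued. No completion feedback needed.
 */
static uint64_t next_issue_usec(uint64_t issued_bytes, uint64_t bps)
{
        if (!bps)
                return 0;
        /* split into whole seconds and remainder to avoid overflow */
        return (issued_bytes / bps) * 1000000ULL +
               (issued_bytes % bps) * 1000000ULL / bps;
}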

backend.c
fio.h
init.c
io_u.c
ioengines.c
libfio.c
workqueue.c

diff --git a/backend.c b/backend.c
index 3eafff6e6bbbcca9bc8ef3594ff3f74e0e222013..d5b260a81c896fbcbe8c2ddf581d584225f69067 100644 (file)
--- a/backend.c
+++ b/backend.c
@@ -762,6 +762,25 @@ static int io_complete_bytes_exceeded(struct thread_data *td)
        return bytes >= limit || exceeds_number_ios(td);
 }
 
+/*
+ * Calculate the usec offset (from job start) at which the next IO
+ * in this direction should issue to stay on the target rate.
+ */
+static long long usec_for_io(struct thread_data *td, enum fio_ddir ddir)
+{
+       uint64_t secs, remainder, bps, bytes;
+
+       assert(!(td->flags & TD_F_CHILD));
+       bytes = td->rate_io_issue_bytes[ddir];
+       bps = td->rate_bps[ddir];
+       if (bps) {
+               secs = bytes / bps;
+               remainder = bytes % bps;
+               return remainder * 1000000 / bps + secs * 1000000;
+       } else
+               return 0;
+}
+
 /*
  * Main IO worker function. It retrieves io_u's to process and queues
  * and reaps them, checking for rate and errors along the way.
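
The secs/remainder split in usec_for_io() above is deliberate:
computing bytes * 1000000 directly would overflow 64 bits once roughly
18 TB (2^64 / 10^6 bytes) had been issued. A worked example of the
arithmetic, assuming a 1 MB/s target rate:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* 2.5 MB issued so far against a 1 MB/s target rate */
        uint64_t bytes = 2500000, bps = 1000000;
        uint64_t secs = bytes / bps;    /* 2 whole seconds   */
        uint64_t rem  = bytes % bps;    /* 500000 bytes over */
        /* next IO is due 2500000 usec after the job started */
        printf("%llu usec\n", (unsigned long long)
               (rem * 1000000 / bps + secs * 1000000));
        return 0;
}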
@@ -891,9 +910,16 @@ static uint64_t do_io(struct thread_data *td)
                        if (td->error)
                                break;
                        ret = workqueue_enqueue(&td->io_wq, io_u);
+
+                       if (should_check_rate(td))
+                               td->rate_next_io_time[ddir] = usec_for_io(td, ddir);
+
                } else {
                        ret = td_io_queue(td, io_u);
 
+                       if (should_check_rate(td))
+                               td->rate_next_io_time[ddir] = usec_for_io(td, ddir);
+
                        if (io_queue_event(td, io_u, &ret, ddir, &bytes_issued, 0, &comp_time))
                                break;
 
diff --git a/fio.h b/fio.h
index 81d58e8b3509b5f3610f2cbc62ef8bc8085e051a..17bc02bc59f0eba2a39aa40b5b5142407b1bc02b 100644 (file)
--- a/fio.h
+++ b/fio.h
@@ -238,9 +238,10 @@ struct thread_data {
         * Rate state
         */
        uint64_t rate_bps[DDIR_RWDIR_CNT];
-       long rate_pending_usleep[DDIR_RWDIR_CNT];
+       unsigned long rate_next_io_time[DDIR_RWDIR_CNT];
        unsigned long rate_bytes[DDIR_RWDIR_CNT];
        unsigned long rate_blocks[DDIR_RWDIR_CNT];
+       unsigned long rate_io_issue_bytes[DDIR_RWDIR_CNT];
        struct timeval lastrate[DDIR_RWDIR_CNT];
 
        /*
diff --git a/init.c b/init.c
index 5edd53e06cca9e03b063fc5aea5f94e9e65575ad..7adee9647192b3f0feb26932dd661f7946e5f71a 100644 (file)
--- a/init.c
+++ b/init.c
@@ -465,7 +465,7 @@ static int __setup_rate(struct thread_data *td, enum fio_ddir ddir)
                return -1;
        }
 
-       td->rate_pending_usleep[ddir] = 0;
+       td->rate_next_io_time[ddir] = 0;
        return 0;
 }
 
diff --git a/io_u.c b/io_u.c
index d80ef983c15972e4dbafc00fb68319826f16b3f2..9f10206b95e8b1e80d14504c35159284cf238802 100644 (file)
--- a/io_u.c
+++ b/io_u.c
@@ -568,49 +568,47 @@ void io_u_quiesce(struct thread_data *td)
 static enum fio_ddir rate_ddir(struct thread_data *td, enum fio_ddir ddir)
 {
        enum fio_ddir odir = ddir ^ 1;
-       long usec;
+       long usec, now;
 
        assert(ddir_rw(ddir));
+       now = utime_since_now(&td->start);
 
-       if (td->rate_pending_usleep[ddir] <= 0)
+       /*
+        * If rate_next_io_time is in the past, issue now to catch up.
+        */
+       if (td->rate_next_io_time[ddir] <= now)
                return ddir;
 
        /*
-        * We have too much pending sleep in this direction. See if we
+        * We are ahead of rate in this direction. See if we
         * should switch.
         */
        if (td_rw(td) && td->o.rwmix[odir]) {
                /*
-                * Other direction does not have too much pending, switch
+                * Other direction is behind rate, switch
                 */
-               if (td->rate_pending_usleep[odir] < 100000)
+               if (td->rate_next_io_time[odir] <= now)
                        return odir;
 
                /*
-                * Both directions have pending sleep. Sleep the minimum time
-                * and deduct from both.
+                * Both directions are ahead of rate. Sleep the min
+                * time and switch direction if necessary.
                 */
-               if (td->rate_pending_usleep[ddir] <=
-                       td->rate_pending_usleep[odir]) {
-                       usec = td->rate_pending_usleep[ddir];
+               if (td->rate_next_io_time[ddir] <=
+                       td->rate_next_io_time[odir]) {
+                       usec = td->rate_next_io_time[ddir] - now;
                } else {
-                       usec = td->rate_pending_usleep[odir];
+                       usec = td->rate_next_io_time[odir] - now;
                        ddir = odir;
                }
        } else
-               usec = td->rate_pending_usleep[ddir];
+               usec = td->rate_next_io_time[ddir] - now;
 
        if (td->o.io_submit_mode == IO_MODE_INLINE)
                io_u_quiesce(td);
 
        usec = usec_sleep(td, usec);
 
-       td->rate_pending_usleep[ddir] -= usec;
-
-       odir = ddir ^ 1;
-       if (td_rw(td) && __should_check_rate(td, odir))
-               td->rate_pending_usleep[odir] -= usec;
-
        return ddir;
 }
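
The reworked rate_ddir() thus reduces to "which direction is due, and
how long must we wait": issue immediately if the requested direction is
at or past its deadline, switch if only the other direction is due,
otherwise sleep until the earlier deadline. A standalone toy model of
that decision (hypothetical helper, usec timestamps; 0 = read,
1 = write):

#include <stdio.h>

static int pick_ddir(const long next[2], long now, long *sleep_us)
{
        *sleep_us = 0;
        if (next[0] <= now)             /* read already due  */
                return 0;
        if (next[1] <= now)             /* write due: switch */
                return 1;
        /* both ahead of rate: sleep until the earlier deadline */
        if (next[0] <= next[1]) {
                *sleep_us = next[0] - now;
                return 0;
        }
        *sleep_us = next[1] - now;
        return 1;
}

int main(void)
{
        const long next[2] = { 1500, 1200 };    /* read, write deadlines */
        long sleep_us;
        int ddir = pick_ddir(next, 1000, &sleep_us);
        /* prints "ddir=1 sleep=200": wait 200 usec, issue a write */
        printf("ddir=%d sleep=%ld\n", ddir, sleep_us);
        return 0;
}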
 
@@ -1656,18 +1654,6 @@ static void account_io_completion(struct thread_data *td, struct io_u *io_u,
        }
 }
 
-static long long usec_for_io(struct thread_data *td, enum fio_ddir ddir)
-{
-       uint64_t secs, remainder, bps, bytes;
-
-       assert(!(td->flags & TD_F_CHILD));
-       bytes = td->this_io_bytes[ddir];
-       bps = td->rate_bps[ddir];
-       secs = bytes / bps;
-       remainder = bytes % bps;
-       return remainder * 1000000 / bps + secs * 1000000;
-}
-
 static void io_completed(struct thread_data *td, struct io_u **io_u_ptr,
                         struct io_completion_data *icd)
 {
@@ -1709,7 +1695,6 @@ static void io_completed(struct thread_data *td, struct io_u **io_u_ptr,
 
        if (!io_u->error && ddir_rw(ddir)) {
                unsigned int bytes = io_u->buflen - io_u->resid;
-               const enum fio_ddir oddir = ddir ^ 1;
                int ret;
 
                td->io_blocks[ddir]++;
@@ -1738,27 +1723,9 @@ static void io_completed(struct thread_data *td, struct io_u **io_u_ptr,
                }
 
                if (ramp_time_over(td) && (td->runstate == TD_RUNNING ||
-                                          td->runstate == TD_VERIFYING)) {
-                       struct thread_data *__td = td;
-
+                                          td->runstate == TD_VERIFYING))
                        account_io_completion(td, io_u, icd, ddir, bytes);
 
-                       if (td->parent)
-                               __td = td->parent;
-
-                       if (__should_check_rate(__td, ddir)) {
-                               __td->rate_pending_usleep[ddir] =
-                                       (usec_for_io(__td, ddir) -
-                                        utime_since_now(&__td->start));
-                       }
-                       if (ddir != DDIR_TRIM &&
-                           __should_check_rate(__td, oddir)) {
-                               __td->rate_pending_usleep[oddir] =
-                                       (usec_for_io(__td, oddir) -
-                                        utime_since_now(&__td->start));
-                       }
-               }
-
                icd->bytes_done[ddir] += bytes;
 
                if (io_u->end_io) {
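
With pacing state now maintained at submission time, the completion
path is left with pure accounting. The shape of what remains after
this hunk (a sketch, not a literal excerpt):

if (ramp_time_over(td) && (td->runstate == TD_RUNNING ||
                           td->runstate == TD_VERIFYING))
        account_io_completion(td, io_u, icd, ddir, bytes);

icd->bytes_done[ddir] += bytes;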
diff --git a/ioengines.c b/ioengines.c
index 958731dc3bfab9bda2d7765e61d3f31613b02946..9c5ac603d3291c04c7d754fdc8ddea56389b0d5d 100644 (file)
--- a/ioengines.c
+++ b/ioengines.c
@@ -299,6 +299,7 @@ int td_io_queue(struct thread_data *td, struct io_u *io_u)
        if (ddir_rw(ddir)) {
                td->io_issues[ddir]++;
                td->io_issue_bytes[ddir] += buflen;
+               td->rate_io_issue_bytes[ddir] += buflen;
        }
 
        ret = td->io_ops->queue(td, io_u);
@@ -308,6 +309,7 @@ int td_io_queue(struct thread_data *td, struct io_u *io_u)
        if (ret == FIO_Q_BUSY && ddir_rw(ddir)) {
                td->io_issues[ddir]--;
                td->io_issue_bytes[ddir] -= buflen;
+               td->rate_io_issue_bytes[ddir] -= buflen;
        }
 
        /*
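
The FIO_Q_BUSY rollback keeps the pacing counter honest: an IO the
engine refused to accept must not advance the rate clock. The
invariant around ->queue(), condensed from the two hunks above:

/* charge the issue up front ... */
td->io_issues[ddir]++;
td->io_issue_bytes[ddir] += buflen;
td->rate_io_issue_bytes[ddir] += buflen;

ret = td->io_ops->queue(td, io_u);

/* ... and undo it all if the engine did not accept the IO */
if (ret == FIO_Q_BUSY && ddir_rw(ddir)) {
        td->io_issues[ddir]--;
        td->io_issue_bytes[ddir] -= buflen;
        td->rate_io_issue_bytes[ddir] -= buflen;
}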
diff --git a/libfio.c b/libfio.c
index b0141a7524e2c3d5c18d7f8e14c1503710c73363..d4cad3ec5aba0a5b4d9d926bc3675e27acc5471a 100644 (file)
--- a/libfio.c
+++ b/libfio.c
@@ -89,6 +89,8 @@ static void reset_io_counters(struct thread_data *td)
                td->rate_bytes[ddir] = 0;
                td->rate_blocks[ddir] = 0;
                td->bytes_done[ddir] = 0;
+               td->rate_io_issue_bytes[ddir] = 0;
+               td->rate_next_io_time[ddir] = 0;
        }
        td->zone_bytes = 0;
 
diff --git a/workqueue.c b/workqueue.c
index 0a6cd2027443f795e4d99a960b92c7d557745ea0..e2365167a9b52d3f429da56c8cd26d377122d362 100644 (file)
--- a/workqueue.c
+++ b/workqueue.c
@@ -124,6 +124,7 @@ int workqueue_enqueue(struct workqueue *wq, struct io_u *io_u)
                if (ddir_rw(ddir)) {
                        parent->io_issues[ddir]++;
                        parent->io_issue_bytes[ddir] += io_u->xfer_buflen;
+                       parent->rate_io_issue_bytes[ddir] += io_u->xfer_buflen;
                }
 
                pthread_mutex_lock(&sw->lock);
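
In offload mode the submitting workers are TD_F_CHILD threads, so
issue bytes are charged to the parent here, and usec_for_io() (which
asserts !TD_F_CHILD) paces against job-wide totals. As a
back-of-the-envelope check of what the pacing works out to
(hypothetical job parameters):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* hypothetical job: rate of 1 MB/s with 4 KiB blocks */
        uint64_t bps = 1000000, buflen = 4096;
        /* each issued IO pushes the next allowed issue time forward */
        uint64_t step = buflen * 1000000 / bps;         /* 4096 usec */
        printf("one IO every %llu usec (~%llu IOPS)\n",
               (unsigned long long)step,
               (unsigned long long)(1000000 / step));
        return 0;
}

Regardless of when completions arrive, submissions end up spaced about
4096 usec apart, which is exactly the burst-free behavior this commit
is after.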