The current implementation of rate control has the potential for bursts and
stalls when iodepth > 1 due to the feedback delay while IO is pending. More
description here:
https://docs.google.com/drawings/d/1EG-9eGlNvw-m9m0wMSb_C_lyJcaPlhEFbkHRsVpt4wo/edit?usp=sharing
This commit changes the rate control mechanism to use feed-forward IO issue
accounting for rate feedback. Moving rate control to submissions instead of
completions eliminates the feedback delay. More details on the change here:
https://docs.google.com/drawings/d/1NphdZGjYGuOLWJzvXHv44zuy0nnSunvFrROCVfA67Y8/edit?usp=sharing
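
A minimal standalone sketch of the feed-forward idea follows (illustration only,
not the fio code; struct pacer and next_issue_time_us() are invented names): the
issuer tracks how many bytes it has submitted and derives, from the target rate
alone, the earliest time the next submission is allowed, so pacing never waits on
completion feedback.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical pacer state: bytes issued so far and the target rate. */
struct pacer {
	uint64_t issued_bytes;	/* bytes handed to the IO engine so far */
	uint64_t bps;		/* target rate in bytes per second */
};

/*
 * Feed-forward deadline: the time (in usec since start) by which the
 * bytes issued so far are allowed to have been issued at the target
 * rate. The issuer sleeps until this deadline before submitting more.
 */
static uint64_t next_issue_time_us(const struct pacer *p)
{
	uint64_t secs, rem;

	if (!p->bps)
		return 0;
	secs = p->issued_bytes / p->bps;
	rem = p->issued_bytes % p->bps;
	return secs * 1000000ULL + rem * 1000000ULL / p->bps;
}

int main(void)
{
	struct pacer p = { .issued_bytes = 0, .bps = 4 * 1024 * 1024 };
	uint64_t blocksize = 64 * 1024;
	int i;

	/* Issue a few blocks and print the deadline for the next issue. */
	for (i = 0; i < 4; i++) {
		p.issued_bytes += blocksize;
		printf("after %llu bytes, next issue at %llu usec\n",
		       (unsigned long long)p.issued_bytes,
		       (unsigned long long)next_issue_time_us(&p));
	}
	return 0;
}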
return bytes >= limit || exceeds_number_ios(td);
}
+/*
+ * Used to calculate the next io time for rate control.
+ */
+static long long usec_for_io(struct thread_data *td, enum fio_ddir ddir)
+{
+ uint64_t secs, remainder, bps, bytes;
+
+ assert(!(td->flags & TD_F_CHILD));
+ bytes = td->rate_io_issue_bytes[ddir];
+ bps = td->rate_bps[ddir];
+ if (bps) {
+ secs = bytes / bps;
+ remainder = bytes % bps;
+ return remainder * 1000000 / bps + secs * 1000000;
+ } else
+ return 0;
+}
+
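As a concrete example (values picked for illustration): with rate_bps[ddir] = 1,000,000 bytes/sec and rate_io_issue_bytes[ddir] = 2,500,000, usec_for_io() returns 2 * 1000000 + 500000 * 1000000 / 1000000 = 2,500,000 usec, i.e. the next IO in this direction is not due until 2.5 seconds after the start of the job.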
/*
* Main IO worker function. It retrieves io_u's to process and queues
* and reaps them, checking for rate and errors along the way.
if (td->error)
break;
ret = workqueue_enqueue(&td->io_wq, io_u);
+
+ if (should_check_rate(td))
+ td->rate_next_io_time[ddir] = usec_for_io(td, ddir);
+
} else {
ret = td_io_queue(td, io_u);
+ if (should_check_rate(td))
+ td->rate_next_io_time[ddir] = usec_for_io(td, ddir);
+
if (io_queue_event(td, io_u, &ret, ddir, &bytes_issued, 0, &comp_time))
break;
}
if (!in_ramp_time(td) && td->o.latency_target)
lat_target_check(td);
if (td->o.thinktime) {
unsigned long long b;
* Rate state
*/
uint64_t rate_bps[DDIR_RWDIR_CNT];
- long rate_pending_usleep[DDIR_RWDIR_CNT];
+ unsigned long rate_next_io_time[DDIR_RWDIR_CNT];
unsigned long rate_bytes[DDIR_RWDIR_CNT];
unsigned long rate_blocks[DDIR_RWDIR_CNT];
+ unsigned long rate_io_issue_bytes[DDIR_RWDIR_CNT];
struct timeval lastrate[DDIR_RWDIR_CNT];
/*
- td->rate_pending_usleep[ddir] = 0;
+ td->rate_next_io_time[ddir] = 0;
static enum fio_ddir rate_ddir(struct thread_data *td, enum fio_ddir ddir)
{
enum fio_ddir odir = ddir ^ 1;
+ now = utime_since_now(&td->start);
- if (td->rate_pending_usleep[ddir] <= 0)
+ /*
+ * If rate_next_io_time is in the past, we need to catch up to the rate.
+ */
+ if (td->rate_next_io_time[ddir] <= now)
- * We have too much pending sleep in this direction. See if we
+ * We are ahead of rate in this direction. See if we
* should switch.
*/
if (td_rw(td) && td->o.rwmix[odir]) {
/*
- * Other direction does not have too much pending, switch
+ * Other direction is behind rate, switch
- if (td->rate_pending_usleep[odir] < 100000)
+ if (td->rate_next_io_time[odir] <= now)
- * Both directions have pending sleep. Sleep the minimum time
- * and deduct from both.
+ * Both directions are ahead of rate. Sleep the min
+ * and switch if necessary.
- if (td->rate_pending_usleep[ddir] <=
- td->rate_pending_usleep[odir]) {
- usec = td->rate_pending_usleep[ddir];
+ if (td->rate_next_io_time[ddir] <=
+ td->rate_next_io_time[odir]) {
+ usec = td->rate_next_io_time[ddir] - now;
- usec = td->rate_pending_usleep[odir];
+ usec = td->rate_next_io_time[odir] - now;
- usec = td->rate_pending_usleep[ddir];
+ usec = td->rate_next_io_time[ddir] - now;
if (td->o.io_submit_mode == IO_MODE_INLINE)
io_u_quiesce(td);
usec = usec_sleep(td, usec);
- td->rate_pending_usleep[ddir] -= usec;
-
- odir = ddir ^ 1;
- if (td_rw(td) && __should_check_rate(td, odir))
- td->rate_pending_usleep[odir] -= usec;
-
-static long long usec_for_io(struct thread_data *td, enum fio_ddir ddir)
-{
- uint64_t secs, remainder, bps, bytes;
-
- assert(!(td->flags & TD_F_CHILD));
- bytes = td->this_io_bytes[ddir];
- bps = td->rate_bps[ddir];
- secs = bytes / bps;
- remainder = bytes % bps;
- return remainder * 1000000 / bps + secs * 1000000;
-}
-
static void io_completed(struct thread_data *td, struct io_u **io_u_ptr,
struct io_completion_data *icd)
{
if (!io_u->error && ddir_rw(ddir)) {
unsigned int bytes = io_u->buflen - io_u->resid;
- const enum fio_ddir oddir = ddir ^ 1;
int ret;
td->io_blocks[ddir]++;
}
if (ramp_time_over(td) && (td->runstate == TD_RUNNING ||
- td->runstate == TD_VERIFYING)) {
- struct thread_data *__td = td;
-
+ td->runstate == TD_VERIFYING))
account_io_completion(td, io_u, icd, ddir, bytes);
- if (td->parent)
- __td = td->parent;
-
- if (__should_check_rate(__td, ddir)) {
- __td->rate_pending_usleep[ddir] =
- (usec_for_io(__td, ddir) -
- utime_since_now(&__td->start));
- }
- if (ddir != DDIR_TRIM &&
- __should_check_rate(__td, oddir)) {
- __td->rate_pending_usleep[oddir] =
- (usec_for_io(__td, oddir) -
- utime_since_now(&__td->start));
- }
- }
-
icd->bytes_done[ddir] += bytes;
if (io_u->end_io) {
if (ddir_rw(ddir)) {
td->io_issues[ddir]++;
td->io_issue_bytes[ddir] += buflen;
+ td->rate_io_issue_bytes[ddir] += buflen;
}
ret = td->io_ops->queue(td, io_u);
if (ret == FIO_Q_BUSY && ddir_rw(ddir)) {
td->io_issues[ddir]--;
td->io_issue_bytes[ddir] -= buflen;
+ td->rate_io_issue_bytes[ddir] -= buflen;
td->rate_bytes[ddir] = 0;
td->rate_blocks[ddir] = 0;
td->bytes_done[ddir] = 0;
+ td->rate_io_issue_bytes[ddir] = 0;
+ td->rate_next_io_time[ddir] = 0;
if (ddir_rw(ddir)) {
parent->io_issues[ddir]++;
parent->io_issue_bytes[ddir] += io_u->xfer_buflen;
+ parent->rate_io_issue_bytes[ddir] += io_u->xfer_buflen;
}
pthread_mutex_lock(&sw->lock);