From 1a7081c7c3884b3eac9cfe6b3ae1d6dc341e7ed2 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Wed, 24 Oct 2018 05:01:43 -0600
Subject: [PATCH] Add support for latency probing over an interval of load

Provide a way to easily run a latency probe on the device. You define a
job with peak parameters, and then probe settings for generating
IOPS/latency numbers based on that workload. The latter looks something
like this:

iodepth_mode=stepped:10-130/10,5/10

which has the format of:

low_percentage-high_percentage/step,ramp_time/run_time

The above would probe from 10% of peak performance to 130%, in steps of
10%. For each step, it would run a 5 second ramp, then 10 seconds of
testing. For percentages <= 100%, fio will limit the IOPS. For
percentages above that, it'll ramp up the queue depth. For each step
run, it'll log the average completion latency associated with that
queue depth / IOPS setting.

Has normal output (which is still rough), and JSON output. Still
experimenting, not final form yet.

Signed-off-by: Jens Axboe
---
 Makefile                          |   2 +-
 backend.c                         |   8 +-
 cconv.c                           |  12 +
 client.c                          |   9 +
 examples/iodepth_mode_stepped.fio |  18 ++
 fio.h                             |  21 +-
 init.c                            |   2 +-
 io_u.c                            | 159 +------------
 libfio.c                          |   1 +
 options.c                         |  70 ++++++
 server.c                          |   9 +
 server.h                          |   2 +-
 stat.c                            |  64 ++++-
 stat.h                            |  14 +-
 target.c                          | 375 ++++++++++++++++++++++++++++++
 target.h                          |  58 +++++
 thread_options.h                  |  27 ++-
 17 files changed, 680 insertions(+), 171 deletions(-)
 create mode 100644 examples/iodepth_mode_stepped.fio
 create mode 100644 target.c
 create mode 100644 target.h

diff --git a/Makefile b/Makefile
index 4721b789..62c6ddfd 100644
--- a/Makefile
+++ b/Makefile
@@ -50,7 +50,7 @@ SOURCE :=	$(sort $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/crc/*.c)) \
 		gettime-thread.c helpers.c json.c idletime.c td_error.c \
 		profiles/tiobench.c profiles/act.c io_u_queue.c filelock.c \
 		workqueue.c rate-submit.c optgroup.c helper_thread.c \
-		steadystate.c zone-dist.c
+		steadystate.c zone-dist.c target.c
 
 ifdef CONFIG_LIBHDFS
 HDFSFLAGS= -I $(JAVA_HOME)/include -I $(JAVA_HOME)/include/linux -I $(FIO_LIBHDFS_INCLUDE)
diff --git a/backend.c b/backend.c
index d6450baf..3c734f0d 100644
--- a/backend.c
+++ b/backend.c
@@ -49,6 +49,7 @@
 #include "helper_thread.h"
 #include "pshared.h"
 #include "zone-dist.h"
+#include "target.h"
 
 static struct fio_sem *startup_sem;
 static struct flist_head *cgroup_list;
@@ -1090,8 +1091,8 @@ reap:
 				break;
 			}
 		}
-		if (!in_ramp_time(td) && td->o.latency_target)
-			lat_target_check(td);
+		if (!in_ramp_time(td) && lat_target_check(td))
+			break;
 
 		if (ddir_rw(ddir) && td->o.thinktime)
 			handle_thinktime(td, ddir);
@@ -1867,7 +1868,8 @@ static void *thread_main(void *data)
 	 * (Are we not missing other flags that can be ignored ?)
	 */
 	if ((td->o.size || td->o.io_size) && !ddir_rw_sum(bytes_done) &&
-	    !did_some_io && !td->o.create_only &&
+	    !did_some_io && (td->o.iodepth_mode != IOD_STEPPED) &&
+	    !td->o.create_only &&
 	    !(td_ioengine_flagged(td, FIO_NOIO) ||
 	      td_ioengine_flagged(td, FIO_DISKLESSIO)))
 		log_err("%s: No I/O performed by %s, "
diff --git a/cconv.c b/cconv.c
index 50e45c63..4040be28 100644
--- a/cconv.c
+++ b/cconv.c
@@ -100,6 +100,12 @@ void convert_thread_options_to_cpu(struct thread_options *o,
 	o->iodepth_batch_complete_min = le32_to_cpu(top->iodepth_batch_complete_min);
 	o->iodepth_batch_complete_max = le32_to_cpu(top->iodepth_batch_complete_max);
 	o->serialize_overlap = le32_to_cpu(top->serialize_overlap);
+	o->iodepth_mode = le32_to_cpu(top->iodepth_mode);
+	o->lat_step_low = le32_to_cpu(top->lat_step_low);
+	o->lat_step_high = le32_to_cpu(top->lat_step_high);
+	o->lat_step_inc = le32_to_cpu(top->lat_step_inc);
+	o->lat_step_ramp = le32_to_cpu(top->lat_step_ramp);
+	o->lat_step_run = le32_to_cpu(top->lat_step_run);
 	o->size = le64_to_cpu(top->size);
 	o->io_size = le64_to_cpu(top->io_size);
 	o->size_percent = le32_to_cpu(top->size_percent);
@@ -363,6 +369,12 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
 	top->iodepth_batch_complete_min = cpu_to_le32(o->iodepth_batch_complete_min);
 	top->iodepth_batch_complete_max = cpu_to_le32(o->iodepth_batch_complete_max);
 	top->serialize_overlap = cpu_to_le32(o->serialize_overlap);
+	top->iodepth_mode = cpu_to_le32(o->iodepth_mode);
+	top->lat_step_low = cpu_to_le32(o->lat_step_low);
+	top->lat_step_high = cpu_to_le32(o->lat_step_high);
+	top->lat_step_inc = cpu_to_le32(o->lat_step_inc);
+	top->lat_step_ramp = cpu_to_le32(o->lat_step_ramp);
+	top->lat_step_run = cpu_to_le32(o->lat_step_run);
 	top->size_percent = cpu_to_le32(o->size_percent);
 	top->fill_device = cpu_to_le32(o->fill_device);
 	top->file_append = cpu_to_le32(o->file_append);
diff --git a/client.c b/client.c
index 32489067..0c87eb54 100644
--- a/client.c
+++ b/client.c
@@ -1024,6 +1024,15 @@ static void convert_ts(struct thread_stat *dst, struct thread_stat *src)
 	for (i = 0; i < dst->nr_block_infos; i++)
 		dst->block_infos[i] = le32_to_cpu(src->block_infos[i]);
 
+	for (i = 0; i < ARRAY_SIZE(dst->step_stats); i++) {
+		struct lat_step_stats *ls = &src->step_stats[i];
+
+		for (j = 0; j < DDIR_RWDIR_CNT; j++) {
+			dst->step_stats[i].iops[j] = le64_to_cpu(ls->iops[j]);
+			dst->step_stats[i].avg[j].u.f = fio_uint64_to_double(le64_to_cpu(ls->avg[j].u.i));
+		}
+	}
+
 	dst->ss_dur = le64_to_cpu(src->ss_dur);
 	dst->ss_state = le32_to_cpu(src->ss_state);
 	dst->ss_head = le32_to_cpu(src->ss_head);
diff --git a/examples/iodepth_mode_stepped.fio b/examples/iodepth_mode_stepped.fio
new file mode 100644
index 00000000..fc2b9f46
--- /dev/null
+++ b/examples/iodepth_mode_stepped.fio
@@ -0,0 +1,18 @@
+# Job demonstrating how to use the iodepth_mode=stepped feature
+#
+[step]
+ioengine=libaio
+# iodepth / step_high (130% here) must be high enough to saturate performance
+iodepth=64
+direct=1
+# Step from 10% to 130%, in 5% intervals. For each step, use a ramp time
+# of 5s, then 30 seconds of runtime
+iodepth_mode=stepped:10-130/5,5/30
+rw=randread
+norandommap
+filename=/dev/nvme0n1p9
+runtime=1h
+time_based=1
+numjobs=2
+group_reporting=1
+cpus_allowed=0,2
diff --git a/fio.h b/fio.h
index b3ba5db2..081998bf 100644
--- a/fio.h
+++ b/fio.h
@@ -155,6 +155,11 @@ enum {
 	F_ADV_SEQUENTIAL,
 };
 
+enum {
+	IOD_NONE = 0,
+	IOD_STEPPED,
+};
+
 /*
  * Per-thread/process specific data.
Only used for the network client * for now. @@ -374,9 +379,14 @@ struct thread_data { unsigned int latency_qd; unsigned int latency_qd_high; unsigned int latency_qd_low; + unsigned int latency_qd_step; unsigned int latency_failed; - uint64_t latency_ios; + unsigned int latency_state; + unsigned int latency_iops[DDIR_RWDIR_CNT]; + unsigned int latency_step; + uint64_t latency_ios[DDIR_RWDIR_CNT]; int latency_end_run; + unsigned int nr_lat_stats; /* * read/write mixed workload state @@ -687,13 +697,6 @@ extern int io_queue_event(struct thread_data *td, struct io_u *io_u, int *ret, enum fio_ddir ddir, uint64_t *bytes_issued, int from_verify, struct timespec *comp_time); -/* - * Latency target helpers - */ -extern void lat_target_check(struct thread_data *); -extern void lat_target_init(struct thread_data *); -extern void lat_target_reset(struct thread_data *); - /* * Iterates all threads/processes within all the defined jobs */ @@ -751,6 +754,8 @@ static inline bool should_check_rate(struct thread_data *td) return ddir_rw_sum(td->bytes_done) != 0; } +int setup_rate(struct thread_data *td); + static inline unsigned long long td_max_bs(struct thread_data *td) { unsigned long long max_bs; diff --git a/init.c b/init.c index a2b70c4a..69124803 100644 --- a/init.c +++ b/init.c @@ -559,7 +559,7 @@ static int __setup_rate(struct thread_data *td, enum fio_ddir ddir) return 0; } -static int setup_rate(struct thread_data *td) +int setup_rate(struct thread_data *td) { int ret = 0; diff --git a/io_u.c b/io_u.c index 56abe6fd..e1ac2097 100644 --- a/io_u.c +++ b/io_u.c @@ -11,6 +11,7 @@ #include "lib/pow2.h" #include "minmax.h" #include "zbd.h" +#include "target.h" struct io_completion_data { int nr; /* input */ @@ -1356,146 +1357,6 @@ static long set_io_u_file(struct thread_data *td, struct io_u *io_u) return 0; } -static void lat_fatal(struct thread_data *td, struct io_completion_data *icd, - unsigned long long tnsec, unsigned long long max_nsec) -{ - if (!td->error) - log_err("fio: latency of %llu nsec exceeds specified max (%llu nsec)\n", tnsec, max_nsec); - td_verror(td, ETIMEDOUT, "max latency exceeded"); - icd->error = ETIMEDOUT; -} - -static void lat_new_cycle(struct thread_data *td) -{ - fio_gettime(&td->latency_ts, NULL); - td->latency_ios = ddir_rw_sum(td->io_blocks); - td->latency_failed = 0; -} - -/* - * We had an IO outside the latency target. Reduce the queue depth. If we - * are at QD=1, then it's time to give up. - */ -static bool __lat_target_failed(struct thread_data *td) -{ - if (td->latency_qd == 1) - return true; - - td->latency_qd_high = td->latency_qd; - - if (td->latency_qd == td->latency_qd_low) - td->latency_qd_low--; - - td->latency_qd = (td->latency_qd + td->latency_qd_low) / 2; - - dprint(FD_RATE, "Ramped down: %d %d %d\n", td->latency_qd_low, td->latency_qd, td->latency_qd_high); - - /* - * When we ramp QD down, quiesce existing IO to prevent - * a storm of ramp downs due to pending higher depth. 
- */ - io_u_quiesce(td); - lat_new_cycle(td); - return false; -} - -static bool lat_target_failed(struct thread_data *td) -{ - if (td->o.latency_percentile.u.f == 100.0) - return __lat_target_failed(td); - - td->latency_failed++; - return false; -} - -void lat_target_init(struct thread_data *td) -{ - td->latency_end_run = 0; - - if (td->o.latency_target) { - dprint(FD_RATE, "Latency target=%llu\n", td->o.latency_target); - fio_gettime(&td->latency_ts, NULL); - td->latency_qd = 1; - td->latency_qd_high = td->o.iodepth; - td->latency_qd_low = 1; - td->latency_ios = ddir_rw_sum(td->io_blocks); - } else - td->latency_qd = td->o.iodepth; -} - -void lat_target_reset(struct thread_data *td) -{ - if (!td->latency_end_run) - lat_target_init(td); -} - -static void lat_target_success(struct thread_data *td) -{ - const unsigned int qd = td->latency_qd; - struct thread_options *o = &td->o; - - td->latency_qd_low = td->latency_qd; - - /* - * If we haven't failed yet, we double up to a failing value instead - * of bisecting from highest possible queue depth. If we have set - * a limit other than td->o.iodepth, bisect between that. - */ - if (td->latency_qd_high != o->iodepth) - td->latency_qd = (td->latency_qd + td->latency_qd_high) / 2; - else - td->latency_qd *= 2; - - if (td->latency_qd > o->iodepth) - td->latency_qd = o->iodepth; - - dprint(FD_RATE, "Ramped up: %d %d %d\n", td->latency_qd_low, td->latency_qd, td->latency_qd_high); - - /* - * Same as last one, we are done. Let it run a latency cycle, so - * we get only the results from the targeted depth. - */ - if (td->latency_qd == qd) { - if (td->latency_end_run) { - dprint(FD_RATE, "We are done\n"); - td->done = 1; - } else { - dprint(FD_RATE, "Quiesce and final run\n"); - io_u_quiesce(td); - td->latency_end_run = 1; - reset_all_stats(td); - reset_io_stats(td); - } - } - - lat_new_cycle(td); -} - -/* - * Check if we can bump the queue depth - */ -void lat_target_check(struct thread_data *td) -{ - uint64_t usec_window; - uint64_t ios; - double success_ios; - - usec_window = utime_since_now(&td->latency_ts); - if (usec_window < td->o.latency_window) - return; - - ios = ddir_rw_sum(td->io_blocks) - td->latency_ios; - success_ios = (double) (ios - td->latency_failed) / (double) ios; - success_ios *= 100.0; - - dprint(FD_RATE, "Success rate: %.2f%% (target %.2f%%)\n", success_ios, td->o.latency_percentile.u.f); - - if (success_ios >= td->o.latency_percentile.u.f) - lat_target_success(td); - else - __lat_target_failed(td); -} - /* * If latency target is enabled, we might be ramping up or down and not * using the full queue depth available. 
@@ -1506,7 +1367,7 @@ bool queue_full(const struct thread_data *td)
 {
 	const int qempty = io_u_qempty(&td->io_u_freelist);
 
 	if (qempty)
 		return true;
 
-	if (!td->o.latency_target)
+	if (!td->o.latency_target && td->o.iodepth_mode != IOD_STEPPED)
 		return false;
 
 	return td->cur_depth >= td->latency_qd;
@@ -1837,11 +1698,15 @@ static void account_io_completion(struct thread_data *td, struct io_u *io_u,
 			icd->error = ops->io_u_lat(td, tnsec);
 	}
 
-	if (td->o.max_latency && tnsec > td->o.max_latency)
-		lat_fatal(td, icd, tnsec, td->o.max_latency);
+	if (td->o.max_latency && tnsec > td->o.max_latency) {
+		icd->error = ETIMEDOUT;
+		lat_fatal(td, tnsec, td->o.max_latency);
+	}
 	if (td->o.latency_target && tnsec > td->o.latency_target) {
-		if (lat_target_failed(td))
-			lat_fatal(td, icd, tnsec, td->o.latency_target);
+		if (lat_target_failed(td)) {
+			icd->error = ETIMEDOUT;
+			lat_fatal(td, tnsec, td->o.latency_target);
+		}
 	}
 }
 
@@ -1887,8 +1752,8 @@ static void file_log_write_comp(const struct thread_data *td, struct fio_file *f,
 
 static bool should_account(struct thread_data *td)
 {
-	return ramp_time_over(td) && (td->runstate == TD_RUNNING ||
-		td->runstate == TD_VERIFYING);
+	return lat_step_account(td) && ramp_time_over(td) &&
+		(td->runstate == TD_RUNNING || td->runstate == TD_VERIFYING);
 }
 
 static void io_completed(struct thread_data *td, struct io_u **io_u_ptr,
diff --git a/libfio.c b/libfio.c
index 674bc1dc..a672dd55 100644
--- a/libfio.c
+++ b/libfio.c
@@ -34,6 +34,7 @@
 #include "filelock.h"
 #include "helper_thread.h"
 #include "filehash.h"
+#include "target.h"
 
 FLIST_HEAD(disk_list);
diff --git a/options.c b/options.c
index 98187def..52acf978 100644
--- a/options.c
+++ b/options.c
@@ -13,6 +13,7 @@
 #include "lib/pattern.h"
 #include "options.h"
 #include "optgroup.h"
+#include "target.h"
 
 char client_sockaddr_str[INET6_ADDRSTRLEN] = { 0 };
 
@@ -480,6 +481,51 @@ static int str_rwmix_write_cb(void *data, unsigned long long *val)
 	return 0;
 }
 
+static int str_iodepth_mode_cb(void *data, const char *input)
+{
+	struct thread_data *td = cb_data_to_td(data);
+	struct thread_options *o = &td->o;
+	char *str, *p, *n;
+	int ret = 1;
+
+	if (o->iodepth_mode == IOD_NONE)
+		return 0;
+
+	if (parse_dryrun())
+		return 0;
+
+	p = str = strdup(input);
+
+	strip_blank_front(&str);
+	strip_blank_end(str);
+
+	n = strchr(p, ':');
+	if (!n)
+		goto err;
+
+	*n++ = '\0';
+
+	/* format is 'low-high/inc[,ramp/run]' */
+	ret = sscanf(n, "%u-%u/%u,%u/%u", &o->lat_step_low, &o->lat_step_high,
+			&o->lat_step_inc, &o->lat_step_ramp,
+			&o->lat_step_run);
+	if (ret == 5) {
+		ret = 0;
+		o->lat_step_ramp *= 1000;
+		o->lat_step_run *= 1000;
+	} else if (ret == 3) {
+		o->lat_step_ramp = IOD_STEPPED_DEF_RAMP;
+		o->lat_step_run = IOD_STEPPED_DEF_RUN;
+		ret = 0;
+	} else
+		ret = 1;
+err:
+	if (ret)
+		log_err("fio: failed parsing <%s>\n", input);
+	free(str);
+	return ret;
+}
+
 static int str_exitall_cb(void)
 {
 	exitall_on_terminate = true;
@@ -1959,6 +2005,30 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_IO_BASIC,
 	},
+	{
+		.name	= "iodepth_mode",
+		.lname	= "IO Depth Mode",
+		.type	= FIO_OPT_STR,
+		.off1	= offsetof(struct thread_options, iodepth_mode),
+		.cb	= str_iodepth_mode_cb,
+		.help	= "How to vary the queue depth",
+		.parent	= "iodepth",
+		.hide	= 1,
+		.interval = 1,
+		.category = FIO_OPT_C_IO,
+		.group	= FIO_OPT_G_IO_BASIC,
+		.posval	= {
+			  { .ival = "none",
+			    .oval = IOD_NONE,
+			    .help = "No depth modification",
+			  },
+			  { .ival = "stepped",
+			    .oval = IOD_STEPPED,
+			    .help = "Stepped IO depth: low-high/inc[,ramp/run]",
+			  },
+		},
+	},
+
 	{
 		.name	=
"serialize_overlap", .lname = "Serialize overlap", diff --git a/server.c b/server.c index 90d3396b..a636f272 100644 --- a/server.c +++ b/server.c @@ -1550,6 +1550,15 @@ void fio_server_send_ts(struct thread_stat *ts, struct group_run_stats *rs) p.ts.sig_figs = cpu_to_le32(ts->sig_figs); + for (i = 0; i < ARRAY_SIZE(ts->step_stats); i++) { + struct lat_step_stats *ls = &ts->step_stats[i]; + + for (j = 0; j < DDIR_RWDIR_CNT; j++) { + p.ts.step_stats[i].iops[j] = cpu_to_le64(ls->iops[j]); + p.ts.step_stats[i].avg[j].u.i = cpu_to_le64(fio_double_to_uint64(ls->avg[j].u.f)); + } + } + p.ts.nr_block_infos = cpu_to_le64(ts->nr_block_infos); for (i = 0; i < p.ts.nr_block_infos; i++) p.ts.block_infos[i] = cpu_to_le32(ts->block_infos[i]); diff --git a/server.h b/server.h index 371e51ea..abb23bad 100644 --- a/server.h +++ b/server.h @@ -48,7 +48,7 @@ struct fio_net_cmd_reply { }; enum { - FIO_SERVER_VER = 77, + FIO_SERVER_VER = 78, FIO_SERVER_MAX_FRAGMENT_PDU = 1024, FIO_SERVER_MAX_CMD_MB = 2048, diff --git a/stat.c b/stat.c index 331abf67..26125fad 100644 --- a/stat.c +++ b/stat.c @@ -15,6 +15,7 @@ #include "helper_thread.h" #include "smalloc.h" #include "zbd.h" +#include "target.h" #define LOG_MSEC_SLACK 1 @@ -391,7 +392,7 @@ void stat_calc_lat_m(struct thread_stat *ts, double *io_u_lat) stat_calc_lat(ts, io_u_lat, ts->io_u_lat_m, FIO_IO_U_LAT_M_NR); } -static void display_lat(const char *name, unsigned long long min, +void display_lat(const char *name, unsigned long long min, unsigned long long max, double mean, double dev, struct buf_output *out) { @@ -887,6 +888,11 @@ static void show_thread_status_normal(struct thread_stat *ts, if (ts->ss_dur) show_ss_normal(ts, out); + + if (lat_ts_has_stats(ts)) { + log_buf(out, " Stepped latency report\n"); + lat_step_report(ts, out); + } } static void show_ddir_status_terse(struct thread_stat *ts, @@ -1264,7 +1270,7 @@ static struct json_object *show_thread_status_json(struct thread_stat *ts, double io_u_lat_u[FIO_IO_U_LAT_U_NR]; double io_u_lat_m[FIO_IO_U_LAT_M_NR]; double usr_cpu, sys_cpu; - int i; + int i, j; size_t size; root = json_create_object(); @@ -1488,6 +1494,32 @@ static struct json_object *show_thread_status_json(struct thread_stat *ts, json_object_add_value_array(data, "bw", bw); } + if (lat_ts_has_stats(ts)) { + tmp = json_create_object(); + json_object_add_value_object(root, "lat_step", tmp); + } + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + struct json_object *val; + + if (!__lat_ts_has_stats(ts, i)) + continue; + + val = json_create_object(); + json_object_add_value_object(tmp, io_ddir_name(i), val); + + for (j = 0; j < ARRAY_SIZE(ts->step_stats); j++) { + struct lat_step_stats *ls = &ts->step_stats[j]; + char name[32]; + + if (!ls->iops[i]) + continue; + + sprintf(name, "%llu", (unsigned long long) ls->iops[i]); + json_object_add_value_float(val, name, ls->avg[i].u.f); + } + } + return root; } @@ -1553,6 +1585,25 @@ static void sum_stat(struct io_stat *dst, struct io_stat *src, bool first) dst->S.u.f = S; } +static void sum_lat_step_stats(struct lat_step_stats *dst, + struct lat_step_stats *src, bool first) +{ + int i; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + if (!dst->iops[i] && !src->iops[i]) + continue; + if (first) + dst->avg[i].u.f = src->avg[i].u.f; + else { + dst->avg[i].u.f = ((src->avg[i].u.f * src->iops[i]) + + (dst->avg[i].u.f * dst->iops[i])) / + (dst->iops[i] + src->iops[i]); + } + dst->iops[i] += src->iops[i]; + } +} + void sum_group_stats(struct group_run_stats *dst, struct group_run_stats *src) { int i; @@ -1665,6 
+1716,9 @@ void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src, dst->total_submit += src->total_submit; dst->total_complete += src->total_complete; dst->nr_zone_resets += src->nr_zone_resets; + + for (l = 0; l < ARRAY_SIZE(dst->step_stats); l++) + sum_lat_step_stats(&dst->step_stats[l], &src->step_stats[l], first); } void init_group_run_stat(struct group_run_stats *gs) @@ -1711,6 +1765,9 @@ void __show_run_stats(void) for (i = 0; i < groupid + 1; i++) init_group_run_stat(&runstats[i]); + for (i = 0; i < FIO_OUTPUT_NR; i++) + buf_output_init(&output[i]); + /* * find out how many threads stats we need. if group reporting isn't * enabled, it's one-per-td. @@ -1887,9 +1944,6 @@ void __show_run_stats(void) } } - for (i = 0; i < FIO_OUTPUT_NR; i++) - buf_output_init(&output[i]); - /* * don't overwrite last signal output */ diff --git a/stat.h b/stat.h index b4ba71e3..8a165b78 100644 --- a/stat.h +++ b/stat.h @@ -4,6 +4,11 @@ #include "iolog.h" #include "lib/output_buffer.h" +struct lat_step_stats { + uint64_t iops[DDIR_RWDIR_CNT]; + fio_fp64_t avg[DDIR_RWDIR_CNT]; +}; + struct group_run_stats { uint64_t max_run[DDIR_RWDIR_CNT], min_run[DDIR_RWDIR_CNT]; uint64_t max_bw[DDIR_RWDIR_CNT], min_bw[DDIR_RWDIR_CNT]; @@ -145,6 +150,8 @@ enum block_info_state { #define FIO_JOBDESC_SIZE 256 #define FIO_VERROR_SIZE 128 +#define MAX_STEP_STATS 64 + struct thread_stat { char name[FIO_JOBNAME_SIZE]; char verror[FIO_VERROR_SIZE]; @@ -227,6 +234,9 @@ struct thread_stat { uint64_t latency_window; uint32_t sig_figs; + uint32_t pad4; + + struct lat_step_stats step_stats[MAX_STEP_STATS]; uint64_t ss_dur; uint32_t ss_state; @@ -239,12 +249,12 @@ struct thread_stat { union { uint64_t *ss_iops_data; - uint64_t pad4; + uint64_t pad5; }; union { uint64_t *ss_bw_data; - uint64_t pad5; + uint64_t pad6; }; } __attribute__((packed)); diff --git a/target.c b/target.c new file mode 100644 index 00000000..d372ff1b --- /dev/null +++ b/target.c @@ -0,0 +1,375 @@ +#include + +#include "fio.h" +#include "target.h" +#include "smalloc.h" +#include "stat.h" + +void lat_fatal(struct thread_data *td, unsigned long long tnsec, + unsigned long long max_nsec) +{ + if (!td->error) + log_err("fio: latency of %llu nsec exceeds specified max (%llu nsec)\n", tnsec, max_nsec); + td_verror(td, ETIMEDOUT, "max latency exceeded"); +} + +static void lat_ios_note(struct thread_data *td) +{ + int i; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) + td->latency_ios[i] = td->io_blocks[i]; +} + +static void lat_new_cycle(struct thread_data *td) +{ + fio_gettime(&td->latency_ts, NULL); + lat_ios_note(td); + td->latency_failed = 0; +} + +/* + * We had an IO outside the latency target. Reduce the queue depth. If we + * are at QD=1, then it's time to give up. + */ +static bool __lat_target_failed(struct thread_data *td) +{ + if (td->latency_qd == 1) + return true; + + td->latency_qd_high = td->latency_qd; + + if (td->latency_qd == td->latency_qd_low) + td->latency_qd_low--; + + td->latency_qd = (td->latency_qd + td->latency_qd_low) / 2; + + dprint(FD_RATE, "Ramped down: %d %d %d\n", td->latency_qd_low, td->latency_qd, td->latency_qd_high); + + /* + * When we ramp QD down, quiesce existing IO to prevent + * a storm of ramp downs due to pending higher depth. 
+ */ + io_u_quiesce(td); + lat_new_cycle(td); + return false; +} + +bool lat_target_failed(struct thread_data *td) +{ + if (td->o.latency_percentile.u.f == 100.0) + return __lat_target_failed(td); + + td->latency_failed++; + return false; +} + +static void lat_step_init(struct thread_data *td) +{ + struct thread_options *o = &td->o; + + fio_gettime(&td->latency_ts, NULL); + td->latency_state = IOD_STATE_PROBE_RAMP; + td->latency_step = 0; + td->latency_qd = td->o.iodepth; + dprint(FD_RATE, "Stepped: %d-%d/%d,%d/%d\n", o->lat_step_low, + o->lat_step_high, o->lat_step_inc, + o->lat_step_ramp, o->lat_step_run); +} + +void lat_target_init(struct thread_data *td) +{ + td->latency_end_run = 0; + + if (td->o.latency_target) { + dprint(FD_RATE, "Latency target=%llu\n", td->o.latency_target); + fio_gettime(&td->latency_ts, NULL); + td->latency_qd = 1; + td->latency_qd_high = td->o.iodepth; + td->latency_qd_low = 1; + lat_ios_note(td); + } else if (td->o.iodepth_mode == IOD_STEPPED) + lat_step_init(td); + else + td->latency_qd = td->o.iodepth; +} + +void lat_target_reset(struct thread_data *td) +{ + if (td->o.latency_target && !td->latency_end_run) + lat_target_init(td); +} + +static void lat_target_success(struct thread_data *td) +{ + const unsigned int qd = td->latency_qd; + struct thread_options *o = &td->o; + + td->latency_qd_low = td->latency_qd; + + /* + * If we haven't failed yet, we double up to a failing value instead + * of bisecting from highest possible queue depth. If we have set + * a limit other than td->o.iodepth, bisect between that. + */ + if (td->latency_qd_high != o->iodepth) + td->latency_qd = (td->latency_qd + td->latency_qd_high) / 2; + else + td->latency_qd *= 2; + + if (td->latency_qd > o->iodepth) + td->latency_qd = o->iodepth; + + dprint(FD_RATE, "Ramped up: %d %d %d\n", td->latency_qd_low, td->latency_qd, td->latency_qd_high); + + /* + * Same as last one, we are done. Let it run a latency cycle, so + * we get only the results from the targeted depth. 
+ */ + if (td->latency_qd == qd) { + if (td->latency_end_run) { + dprint(FD_RATE, "We are done\n"); + td->done = 1; + } else { + dprint(FD_RATE, "Quiesce and final run\n"); + io_u_quiesce(td); + td->latency_end_run = 1; + reset_all_stats(td); + reset_io_stats(td); + } + } + + lat_new_cycle(td); +} + +void __lat_target_check(struct thread_data *td) +{ + uint64_t usec_window; + uint64_t ios; + double success_ios; + + usec_window = utime_since_now(&td->latency_ts); + if (usec_window < td->o.latency_window) + return; + + ios = ddir_rw_sum(td->io_blocks) - ddir_rw_sum(td->latency_ios); + success_ios = (double) (ios - td->latency_failed) / (double) ios; + success_ios *= 100.0; + + dprint(FD_RATE, "Success rate: %.2f%% (target %.2f%%)\n", success_ios, td->o.latency_percentile.u.f); + + if (success_ios >= td->o.latency_percentile.u.f) + lat_target_success(td); + else + __lat_target_failed(td); +} + +static void lat_clear_rate(struct thread_data *td) +{ + int i; + + td->flags &= ~TD_F_CHECK_RATE; + for (i = 0; i < DDIR_RWDIR_CNT; i++) + td->o.rate_iops[i] = 0; +} + +/* + * Returns true if we're done stepping + */ +static bool lat_step_recalc(struct thread_data *td) +{ + struct thread_options *o = &td->o; + unsigned int cur, perc; + + cur = td->latency_step * o->lat_step_inc; + if (cur >= o->lat_step_high) + return true; + + perc = (td->latency_step + 1) * o->lat_step_inc; + if (perc < 100) { + int i; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + unsigned int this_iops; + + this_iops = (perc * td->latency_iops[i]) / 100; + td->o.rate_iops[i] = this_iops; + } + setup_rate(td); + td->flags |= TD_F_CHECK_RATE; + td->latency_qd = td->o.iodepth * 100 / o->lat_step_high; + } else { + td->latency_qd = td->o.iodepth * perc / o->lat_step_high; + lat_clear_rate(td); + } + + dprint(FD_RATE, "Stepped: step=%d, perc=%d, qd=%d\n", td->latency_step, + perc, td->latency_qd); + return false; +} + +static void lat_step_reset(struct thread_data *td) +{ + struct thread_stat *ts = &td->ts; + struct io_stat *ios = &ts->clat_stat[DDIR_RWDIR_CNT]; + + ios->max_val = ios->min_val = ios->samples = 0; + ios->mean.u.f = ios->S.u.f = 0; + + lat_clear_rate(td); + reset_all_stats(td); + reset_io_stats(td); +} + +static uint64_t lat_iops_since(struct thread_data *td, uint64_t msec, + enum fio_ddir ddir) +{ + if (msec) { + uint64_t ios; + + ios = td->io_blocks[ddir] - td->latency_ios[ddir]; + return (ios * 1000) / msec; + } + + return 0; +} + +static void lat_step_add_sample(struct thread_data *td, uint64_t msec) +{ + struct thread_stat *ts = &td->ts; + unsigned long long min, max; + struct lat_step_stats *ls; + double mean[DDIR_RWDIR_CNT], dev; + int i; + + if (td->nr_lat_stats == ARRAY_SIZE(td->ts.step_stats)) { + log_err("fio: ts->step_stats too small, dropping entries\n"); + return; + } + + for (i = 0; i < DDIR_RWDIR_CNT; i++) + calc_lat(&ts->clat_stat[i], &min, &max, &mean[i], &dev); + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + ls = &td->ts.step_stats[td->nr_lat_stats]; + + ls->iops[i] = lat_iops_since(td, msec, i); + ls->avg[i].u.f = mean[i]; + } + + td->nr_lat_stats++; +} + +bool __lat_ts_has_stats(struct thread_stat *ts, enum fio_ddir ddir) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ts->step_stats); i++) { + struct lat_step_stats *ls = &ts->step_stats[i]; + + if (ls->iops[ddir]) + return true; + } + + return false; +} + +bool lat_ts_has_stats(struct thread_stat *ts) +{ + int i; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) + if (__lat_ts_has_stats(ts, i)) + return true; + + return false; +} + +void lat_step_report(struct 
thread_stat *ts, struct buf_output *out) +{ + int i, j; + + for (i = 0; i < ARRAY_SIZE(ts->step_stats); i++) { + struct lat_step_stats *ls = &ts->step_stats[i]; + + for (j = 0; j < DDIR_RWDIR_CNT; j++) { + if (!ls->iops[j]) + continue; + + __log_buf(out, " %s: iops=%llu, lat=%.1f nsec\n", + io_ddir_name(j), + (unsigned long long) ls->iops[j], + ls->avg[j].u.f); + } + } +} + +static void lat_next_state(struct thread_data *td, int new_state) +{ + td->latency_state = new_state; + fio_gettime(&td->latency_ts, NULL); +} + +bool lat_step_check(struct thread_data *td) +{ + struct thread_options *o = &td->o; + uint64_t msec; + + msec = mtime_since_now(&td->latency_ts); + + switch (td->latency_state) { + case IOD_STATE_PROBE_RAMP: + if (msec < o->lat_step_ramp) + break; + + lat_step_reset(td); + lat_ios_note(td); + + lat_next_state(td, IOD_STATE_PROBE_RUN); + break; + case IOD_STATE_PROBE_RUN: { + int i; + + if (msec < o->lat_step_run) + break; + + io_u_quiesce(td); + + for (i = 0; i < DDIR_RWDIR_CNT; i++) + td->latency_iops[i] = lat_iops_since(td, msec, i); + + lat_step_reset(td); + lat_step_recalc(td); + + io_u_quiesce(td); + lat_next_state(td, IOD_STATE_RAMP); + break; + } + case IOD_STATE_RAMP: + if (msec < o->lat_step_ramp) + break; + + lat_ios_note(td); + lat_next_state(td, IOD_STATE_RUN); + break; + case IOD_STATE_RUN: + if (msec < o->lat_step_run) + break; + + io_u_quiesce(td); + fio_gettime(&td->latency_ts, NULL); + td->latency_step++; + + lat_step_add_sample(td, msec); + lat_step_reset(td); + + if (!lat_step_recalc(td)) + break; + + td->done = 1; + lat_next_state(td, IOD_STATE_DONE); + break; + }; + + return td->latency_state == IOD_STATE_DONE; +} diff --git a/target.h b/target.h new file mode 100644 index 00000000..a794285f --- /dev/null +++ b/target.h @@ -0,0 +1,58 @@ +#ifndef FIO_LAT_TARGET_H +#define FIO_LAT_TARGET_H + +#include "fio.h" + +enum { + IOD_STEPPED_DEF_RAMP = 5000, + IOD_STEPPED_DEF_RUN = 30000, +}; + +/* + * Starts out as PROBE_RAMP -> PROBE_RUN, then iterations of + * RAMP -> RUN with various iops limiting settings + */ +enum { + IOD_STATE_PROBE_RAMP = 1, + IOD_STATE_PROBE_RUN, + IOD_STATE_RAMP, + IOD_STATE_RUN, + IOD_STATE_DONE, +}; + +/* + * Latency target helpers + */ +void lat_target_init(struct thread_data *); +void lat_target_reset(struct thread_data *); +bool lat_target_failed(struct thread_data *td); +void lat_step_report(struct thread_stat *ts, struct buf_output *out); +bool lat_ts_has_stats(struct thread_stat *ts); +bool __lat_ts_has_stats(struct thread_stat *ts, enum fio_ddir); + +void lat_fatal(struct thread_data *td, unsigned long long tnsec, + unsigned long long max_nsec); + +bool lat_step_check(struct thread_data *td); +void __lat_target_check(struct thread_data *td); + +static inline bool lat_target_check(struct thread_data *td) +{ + if (td->o.latency_target) { + __lat_target_check(td); + return false; + } else if (td->o.iodepth_mode == IOD_STEPPED) + return lat_step_check(td); + + return false; +} + +static inline bool lat_step_account(struct thread_data *td) +{ + if (td->o.iodepth_mode != IOD_STEPPED) + return true; + + return td->latency_state == IOD_STATE_RUN; +} + +#endif diff --git a/thread_options.h b/thread_options.h index 14c6969f..e062fa6f 100644 --- a/thread_options.h +++ b/thread_options.h @@ -77,6 +77,13 @@ struct thread_options { unsigned int iodepth_batch_complete_min; unsigned int iodepth_batch_complete_max; unsigned int serialize_overlap; + unsigned int iodepth_mode; + + unsigned int lat_step_low; + unsigned int lat_step_high; + unsigned 
int lat_step_inc; + unsigned int lat_step_ramp; + unsigned int lat_step_run; unsigned int unique_filename; @@ -361,6 +368,7 @@ struct thread_options_pack { uint32_t kb_base; uint32_t unit_base; uint32_t ddir_seq_nr; + uint32_t pad; uint64_t ddir_seq_add; uint32_t iodepth; uint32_t iodepth_low; @@ -368,6 +376,15 @@ struct thread_options_pack { uint32_t iodepth_batch_complete_min; uint32_t iodepth_batch_complete_max; uint32_t serialize_overlap; + + uint32_t iodepth_mode; + uint32_t lat_step_low; + uint32_t lat_step_high; + uint32_t lat_step_inc; + uint32_t lat_step_ramp; + uint32_t lat_step_run; + + uint32_t pad2; uint32_t lat_percentiles; uint64_t size; @@ -416,6 +433,7 @@ struct thread_options_pack { uint32_t verify_fatal; uint32_t verify_dump; uint32_t verify_async; + uint32_t pad3; uint64_t verify_backlog; uint32_t verify_batch; uint32_t experimental_verify; @@ -428,7 +446,7 @@ struct thread_options_pack { uint32_t override_sync; uint32_t rand_repeatable; uint32_t allrand_repeatable; - uint32_t pad; + uint32_t pad4; uint64_t rand_seed; uint32_t log_avg_msec; uint32_t log_hist_msec; @@ -451,6 +469,7 @@ struct thread_options_pack { struct zone_split zone_split[DDIR_RWDIR_CNT][ZONESPLIT_MAX]; uint32_t zone_split_nr[DDIR_RWDIR_CNT]; + uint32_t pad5; fio_fp64_t zipf_theta; fio_fp64_t pareto_h; @@ -459,10 +478,10 @@ struct thread_options_pack { uint32_t random_generator; uint32_t perc_rand[DDIR_RWDIR_CNT]; + uint32_t pad6; uint32_t hugepage_size; uint64_t rw_min_bs; - uint32_t pad2; uint32_t thinktime; uint32_t thinktime_spin; uint32_t thinktime_blocks; @@ -476,6 +495,7 @@ struct thread_options_pack { uint64_t ss_dur; uint64_t ss_ramp_time; uint32_t ss_state; + uint32_t pad7; fio_fp64_t ss_limit; uint32_t overwrite; uint32_t bw_avg_time; @@ -534,6 +554,7 @@ struct thread_options_pack { uint32_t trim_percentage; uint32_t trim_batch; uint32_t trim_zero; + uint32_t pad8; uint64_t trim_backlog; uint32_t clat_percentiles; uint32_t percentile_precision; @@ -570,7 +591,6 @@ struct thread_options_pack { uint32_t rate_iops_min[DDIR_RWDIR_CNT]; uint32_t rate_process; uint32_t rate_ign_think; - uint32_t pad3; uint8_t ioscheduler[FIO_TOP_STR_MAX]; @@ -598,6 +618,7 @@ struct thread_options_pack { int32_t flow; int32_t flow_watermark; uint32_t flow_sleep; + uint32_t pad9; uint64_t offset_increment; uint64_t number_ios; -- 2.25.1
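
The mapping from a stepped spec to per-step rate caps and queue depths is
easy to get wrong by eye. Below is a minimal standalone C sketch of the
schedule that lat_step_recalc() and the str_iodepth_mode_cb() parsing
implement; the 'iodepth' and 'peak_iops' values are hypothetical stand-ins
for the peak job settings and the probe phase measurement, not values fio
reports in this form.

/*
 * steplat_sched.c: standalone sketch of the stepped probe schedule.
 * Mirrors lat_step_recalc() and the option parsing from the patch.
 */
#include <stdio.h>

#define IOD_STEPPED_DEF_RAMP	5000	/* msec, from target.h */
#define IOD_STEPPED_DEF_RUN	30000	/* msec, from target.h */

int main(void)
{
	const char *spec = "10-130/10,5/10"; /* iodepth_mode=stepped:<spec> */
	unsigned int low, high, inc, ramp, run;
	unsigned int iodepth = 64;	/* assumed peak queue depth */
	unsigned int peak_iops = 500000; /* assumed probe phase result */
	unsigned int step, perc;
	int ret;

	/* same format string the option callback uses */
	ret = sscanf(spec, "%u-%u/%u,%u/%u", &low, &high, &inc, &ramp, &run);
	if (ret == 3) {
		ramp = IOD_STEPPED_DEF_RAMP / 1000;
		run = IOD_STEPPED_DEF_RUN / 1000;
	} else if (ret != 5)
		return 1;

	printf("spec %u%%-%u%% in %u%% steps, ramp %us, run %us\n",
		low, high, inc, ramp, run);

	/*
	 * Steps run at perc = (step + 1) * inc until the previous step
	 * reached 'high'. Below 100%: IOPS capped at perc% of peak, with
	 * the queue depth held at the 100% fraction. At 100% and above:
	 * no rate cap, queue depth scaled up instead.
	 */
	for (step = 0; step * inc < high; step++) {
		perc = (step + 1) * inc;
		if (perc < 100)
			printf("step %2u: %3u%%: rate_iops=%u, qd=%u\n",
				step, perc, perc * peak_iops / 100,
				iodepth * 100 / high);
		else
			printf("step %2u: %3u%%: no rate cap, qd=%u\n",
				step, perc, iodepth * perc / high);
	}
	return 0;
}

For the example spec this yields 13 steps (10% through 130%). Note that, as
of this patch, lat_step_low is parsed but lat_step_recalc() starts stepping
from the first increment rather than from the configured low percentage.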
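
With group_reporting (as in the example job), per-job step stats are merged
by sum_lat_step_stats() as an IOPS-weighted average, so the job pushing more
IOPS dominates the combined latency. A tiny self-contained demonstration of
that merge, with made-up numbers:

/*
 * merge_demo.c: how group reporting combines two jobs' samples for the
 * same step, mirroring sum_lat_step_stats(). Numbers are illustrative.
 */
#include <stdio.h>

int main(void)
{
	double iops1 = 120000, avg1 = 80000.0;	/* job 1: iops, avg nsec */
	double iops2 = 60000, avg2 = 95000.0;	/* job 2: iops, avg nsec */
	double merged = (avg1 * iops1 + avg2 * iops2) / (iops1 + iops2);

	/* prints: merged iops=180000 avg=85000.0 nsec */
	printf("merged iops=%.0f avg=%.1f nsec\n", iops1 + iops2, merged);
	return 0;
}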