From: Jens Axboe Date: Mon, 9 Dec 2013 19:38:53 +0000 (-0700) Subject: Add options to have fio latency profile a device X-Git-Tag: fio-2.1.5~45 X-Git-Url: https://git.kernel.dk/?p=fio.git;a=commitdiff_plain;h=3e260a46ea9a8de224c3d0a29a608da3440f284a Add options to have fio latency profile a device This adds three new options: - latency_target. This defines a specific latency target, in usec. - latency_window. This defines the period over which fio samples. - latency_percentile. This defines the percentage of IOs that must meet the criteria specified by latency_target/latency_window. With these options set, fio will run the described workload and vary the queue depth between 1 and iodepth= to find the best performing spot that meets the criteria specified by the three options. A sample job file is also added to demonstrate how to use this. Signed-off-by: Jens Axboe --- diff --git a/HOWTO b/HOWTO index 250bc582..044b5722 100644 --- a/HOWTO +++ b/HOWTO @@ -881,6 +881,20 @@ rate_iops_min=int If fio doesn't meet this rate of IO, it will cause the job to exit. The same format as rate is used for read vs write separation. +latency_target=int If set, fio will attempt to find the max performance + point that the given workload will run at while maintaining a + latency below this target. The value is given in microseconds. + See latency_window and latency_percentile + +latency_window=int Used with latency_target to specify the sample window + that the job is run at varying queue depths to test the + performance. The value is given in microseconds. + +latency_percentile=float The percentage of IOs that must fall within the + criteria specified by latency_target and latency_window. If not + set, this defaults to 100.0, meaning that all IOs must be equal + to or below the value set by latency_target. + max_latency=int If set, fio will exit the job if it exceeds this maximum latency. It will exit with an ETIME error. 
diff --git a/README b/README index 1373c789..0acb04fb 100644 --- a/README +++ b/README @@ -197,6 +197,8 @@ Currently, additional logging is available for: mutex Dump info only related to mutex up/down ops profile Dump info related to profile extensions time Dump info related to internal time keeping + net Dump info related to networking connections + rate Dump info related to IO rate switching ? or help Show available debug options. One can specify multiple debug options: e.g. --debug=file,mem will enable diff --git a/backend.c b/backend.c index 2ec478c1..101024d1 100644 --- a/backend.c +++ b/backend.c @@ -649,6 +649,8 @@ static uint64_t do_io(struct thread_data *td) else td_set_runstate(td, TD_RUNNING); + lat_target_init(td); + while ((td->o.read_iolog_file && !flist_empty(&td->io_log_list)) || (!flist_empty(&td->trim_list)) || !io_bytes_exceeded(td) || td->o.time_based) { @@ -680,8 +682,11 @@ static uint64_t do_io(struct thread_data *td) break; io_u = get_io_u(td); - if (!io_u) + if (!io_u) { + if (td->o.latency_target) + goto reap; break; + } ddir = io_u->ddir; @@ -776,6 +781,7 @@ sync_done: * can get BUSY even without IO queued, if the system is * resource starved. 
*/ +reap: full = queue_full(td) || (ret == FIO_Q_BUSY && td->cur_depth); if (full || !td->o.iodepth_batch_complete) { min_evts = min(td->o.iodepth_batch_complete, @@ -812,6 +818,8 @@ sync_done: break; } } + if (!in_ramp_time(td) && td->o.latency_target) + lat_target_check(td); if (td->o.thinktime) { unsigned long long b; diff --git a/cconv.c b/cconv.c index 82383b2d..dd61d103 100644 --- a/cconv.c +++ b/cconv.c @@ -199,6 +199,9 @@ void convert_thread_options_to_cpu(struct thread_options *o, o->flow_watermark = __le32_to_cpu(top->flow_watermark); o->flow_sleep = le32_to_cpu(top->flow_sleep); o->sync_file_range = le32_to_cpu(top->sync_file_range); + o->latency_target = le64_to_cpu(top->latency_target); + o->latency_window = le64_to_cpu(top->latency_window); + o->latency_percentile.u.f = fio_uint64_to_double(le64_to_cpu(top->latency_percentile.u.i)); o->compress_percentage = le32_to_cpu(top->compress_percentage); o->compress_chunk = le32_to_cpu(top->compress_chunk); @@ -348,6 +351,9 @@ void convert_thread_options_to_net(struct thread_options_pack *top, top->flow_watermark = __cpu_to_le32(o->flow_watermark); top->flow_sleep = cpu_to_le32(o->flow_sleep); top->sync_file_range = cpu_to_le32(o->sync_file_range); + top->latency_target = __cpu_to_le64(o->latency_target); + top->latency_window = __cpu_to_le64(o->latency_window); + top->latency_percentile.u.i = __cpu_to_le64(fio_double_to_uint64(o->latency_percentile.u.f)); top->compress_percentage = cpu_to_le32(o->compress_percentage); top->compress_chunk = cpu_to_le32(o->compress_chunk); diff --git a/client.c b/client.c index 1557240b..7975ce60 100644 --- a/client.c +++ b/client.c @@ -796,6 +796,11 @@ static void convert_ts(struct thread_stat *dst, struct thread_stat *src) dst->first_error = le32_to_cpu(src->first_error); dst->kb_base = le32_to_cpu(src->kb_base); dst->unit_base = le32_to_cpu(src->unit_base); + + dst->latency_depth = le32_to_cpu(src->latency_depth); + dst->latency_target = le64_to_cpu(src->latency_target); + 
dst->latency_window = le64_to_cpu(src->latency_window); + dst->latency_percentile.u.f = fio_uint64_to_double(le64_to_cpu(src->latency_percentile.u.i)); } static void convert_gs(struct group_run_stats *dst, struct group_run_stats *src) diff --git a/debug.h b/debug.h index 6a74131a..e248695e 100644 --- a/debug.h +++ b/debug.h @@ -19,6 +19,7 @@ enum { FD_PROFILE, FD_TIME, FD_NET, + FD_RATE, FD_DEBUG_MAX, }; diff --git a/examples/latency-profile.fio b/examples/latency-profile.fio new file mode 100644 index 00000000..f5cd8444 --- /dev/null +++ b/examples/latency-profile.fio @@ -0,0 +1,21 @@ +# Test job that demonstrates how to use the latency target +# profiling. Fio will find the queue depth between 1..128 +# that fits within the latency constraints of this 4k random +# read workload. + +[global] +bs=4k +rw=randread +random_generator=lfsr +direct=1 +ioengine=libaio +iodepth=128 +# Set max acceptable latency to 500msec +latency_target=500000 +# profile over a 5s window +latency_window=5000000 +# 99.9% of IOs must be below the target +latency_percentile=99.9 + +[device] +filename=/dev/sda diff --git a/fio.1 b/fio.1 index 15a1ac5d..9ee0eef1 100644 --- a/fio.1 +++ b/fio.1 @@ -764,6 +764,23 @@ is used for read vs write separation. Average bandwidth for \fBrate\fR and \fBratemin\fR over this number of milliseconds. Default: 1000ms. .TP +.BI latency_target \fR=\fPint +If set, fio will attempt to find the max performance point that the given +workload will run at while maintaining a latency below this target. The +value is given in microseconds. See \fBlatency_window\fR and +\fBlatency_percentile\fR. +.TP +.BI latency_window \fR=\fPint +Used with \fBlatency_target\fR to specify the sample window that the job +is run at varying queue depths to test the performance. The value is given +in microseconds. +.TP +.BI latency_percentile \fR=\fPfloat +The percentage of IOs that must fall within the criteria specified by +\fBlatency_target\fR and \fBlatency_window\fR. 
If not set, this defaults +to 100.0, meaning that all IOs must be equal to or below the value set +by \fBlatency_target\fR. +.TP .BI max_latency \fR=\fPint If set, fio will exit the job if it exceeds this maximum latency. It will exit with an ETIME error. diff --git a/fio.h b/fio.h index d0fe83a5..a0927bd9 100644 --- a/fio.h +++ b/fio.h @@ -248,6 +248,16 @@ struct thread_data { unsigned int tv_cache_mask; unsigned int ramp_time_over; + /* + * Time since last latency_window was started + */ + struct timeval latency_ts; + unsigned int latency_qd; + unsigned int latency_qd_high; + unsigned int latency_qd_low; + unsigned int latency_failed; + uint64_t latency_ios; + /* * read/write mixed workload state */ @@ -483,6 +493,12 @@ extern int is_blktrace(const char *, int *); extern int load_blktrace(struct thread_data *, const char *, int); #endif +/* + * Latency target helpers + */ +extern void lat_target_check(struct thread_data *); +extern void lat_target_init(struct thread_data *); + #define for_each_td(td, i) \ for ((i) = 0, (td) = &threads[0]; (i) < (int) thread_number; (i)++, (td)++) #define for_each_file(td, f, i) \ diff --git a/init.c b/init.c index 4dd0c9ae..ba22f785 100644 --- a/init.c +++ b/init.c @@ -1382,7 +1382,7 @@ static void usage(const char *name) printf("%s [options] [job options] \n", name); printf(" --debug=options\tEnable debug logging. 
May be one/more of:\n" "\t\t\tprocess,file,io,mem,blktrace,verify,random,parse,\n" - "\t\t\tdiskutil,job,mutex,profile,time,net\n"); + "\t\t\tdiskutil,job,mutex,profile,time,net,rate\n"); printf(" --parse-only\t\tParse options only, don't start any IO\n"); printf(" --output\t\tWrite output to file\n"); printf(" --runtime\t\tRuntime in seconds\n"); @@ -1482,6 +1482,10 @@ struct debug_level debug_levels[] = { .help = "Network logging", .shift = FD_NET, }, + { .name = "rate", + .help = "Rate logging", + .shift = FD_RATE, + }, { .name = NULL, }, }; diff --git a/io_u.c b/io_u.c index 7df57aa8..ea6c2518 100644 --- a/io_u.c +++ b/io_u.c @@ -1126,6 +1126,136 @@ static int set_io_u_file(struct thread_data *td, struct io_u *io_u) return 0; } +static void lat_fatal(struct thread_data *td, struct io_completion_data *icd, + unsigned long tusec, unsigned long max_usec) +{ + if (!td->error) + log_err("fio: latency of %lu usec exceeds specified max (%lu usec)\n", tusec, max_usec); + td_verror(td, ETIMEDOUT, "max latency exceeded"); + icd->error = ETIMEDOUT; +} + +static void lat_new_cycle(struct thread_data *td) +{ + fio_gettime(&td->latency_ts, NULL); + td->latency_ios = ddir_rw_sum(td->io_blocks); + td->latency_failed = 0; +} + +/* + * We had an IO outside the latency target. Reduce the queue depth. If we + * are at QD=1, then it's time to give up. + */ +static int __lat_target_failed(struct thread_data *td) +{ + if (td->latency_qd == 1) + return 1; + + td->latency_qd_high = td->latency_qd; + td->latency_qd = (td->latency_qd + td->latency_qd_low) / 2; + + dprint(FD_RATE, "Ramped down: %d %d %d\n", td->latency_qd_low, td->latency_qd, td->latency_qd_high); + + /* + * When we ramp QD down, quiesce existing IO to prevent + * a storm of ramp downs due to pending higher depth. 
+ */ + io_u_quiesce(td); + lat_new_cycle(td); + return 0; +} + +static int lat_target_failed(struct thread_data *td) +{ + if (td->o.latency_percentile.u.f == 100.0) + return __lat_target_failed(td); + + td->latency_failed++; + return 0; +} + +void lat_target_init(struct thread_data *td) +{ + if (td->o.latency_target) { + dprint(FD_RATE, "Latency target=%llu\n", td->o.latency_target); + fio_gettime(&td->latency_ts, NULL); + td->latency_qd = 1; + td->latency_qd_high = td->o.iodepth; + td->latency_qd_low = 1; + td->latency_ios = ddir_rw_sum(td->io_blocks); + } else + td->latency_qd = td->o.iodepth; +} + +static void lat_target_success(struct thread_data *td) +{ + const unsigned int qd = td->latency_qd; + + td->latency_qd_low = td->latency_qd; + + /* + * If we haven't failed yet, we double up to a failing value instead + * of bisecting from highest possible queue depth. If we have set + * a limit other than td->o.iodepth, bisect between that. + */ + if (td->latency_qd_high != td->o.iodepth) + td->latency_qd = (td->latency_qd + td->latency_qd_high) / 2; + else + td->latency_qd *= 2; + + if (td->latency_qd > td->o.iodepth) + td->latency_qd = td->o.iodepth; + + dprint(FD_RATE, "Ramped up: %d %d %d\n", td->latency_qd_low, td->latency_qd, td->latency_qd_high); + /* + * Same as last one, we are done + */ + if (td->latency_qd == qd) + td->done = 1; + + lat_new_cycle(td); +} + +/* + * Check if we can bump the queue depth + */ +void lat_target_check(struct thread_data *td) +{ + uint64_t usec_window; + uint64_t ios; + double success_ios; + + usec_window = utime_since_now(&td->latency_ts); + if (usec_window < td->o.latency_window) + return; + + ios = ddir_rw_sum(td->io_blocks) - td->latency_ios; + success_ios = (double) (ios - td->latency_failed) / (double) ios; + success_ios *= 100.0; + + dprint(FD_RATE, "Success rate: %.2f%% (target %.2f%%)\n", success_ios, td->o.latency_percentile.u.f); + + if (success_ios >= td->o.latency_percentile.u.f) + lat_target_success(td); + else + 
__lat_target_failed(td); +} + +/* + * If latency target is enabled, we might be ramping up or down and not + * using the full queue depth available. + */ +int queue_full(struct thread_data *td) +{ + const int qempty = io_u_qempty(&td->io_u_freelist); + + if (qempty) + return 1; + if (!td->o.latency_target) + return 0; + + return td->cur_depth >= td->latency_qd; +} struct io_u *__get_io_u(struct thread_data *td) { @@ -1136,7 +1266,7 @@ struct io_u *__get_io_u(struct thread_data *td) again: if (!io_u_rempty(&td->io_u_requeues)) io_u = io_u_rpop(&td->io_u_requeues); - else if (!io_u_qempty(&td->io_u_freelist)) { + else if (!queue_full(td)) { io_u = io_u_qpop(&td->io_u_freelist); io_u->buflen = 0; @@ -1395,11 +1525,11 @@ static void account_io_completion(struct thread_data *td, struct io_u *io_u, icd->error = ops->io_u_lat(td, tusec); } - if (td->o.max_latency && tusec > td->o.max_latency) { - if (!td->error) - log_err("fio: latency of %lu usec exceeds specified max (%u usec)\n", tusec, td->o.max_latency); - td_verror(td, ETIMEDOUT, "max latency exceeded"); - icd->error = ETIMEDOUT; + if (td->o.max_latency && tusec > td->o.max_latency) + lat_fatal(td, icd, tusec, td->o.max_latency); + if (td->o.latency_target && tusec > td->o.latency_target) { + if (lat_target_failed(td)) + lat_fatal(td, icd, tusec, td->o.latency_target); } } diff --git a/ioengine.h b/ioengine.h index 31662ebe..6dd2aa40 100644 --- a/ioengine.h +++ b/ioengine.h @@ -188,7 +188,6 @@ extern int fio_show_ioengine_help(const char *engine); /* * io unit handling */ -#define queue_full(td) io_u_qempty(&(td)->io_u_freelist) extern struct io_u *__get_io_u(struct thread_data *); extern struct io_u *get_io_u(struct thread_data *); extern void put_io_u(struct thread_data *, struct io_u *); @@ -204,6 +203,7 @@ extern void fill_io_buffer(struct thread_data *, void *, unsigned int, unsigned extern void io_u_fill_buffer(struct thread_data *td, struct io_u *, unsigned int, unsigned int); void io_u_mark_complete(struct 
thread_data *, unsigned int); void io_u_mark_submit(struct thread_data *, unsigned int); +int queue_full(struct thread_data *); int do_io_u_sync(struct thread_data *, struct io_u *); int do_io_u_trim(struct thread_data *, struct io_u *); diff --git a/libfio.c b/libfio.c index c26d6a3a..7f365147 100644 --- a/libfio.c +++ b/libfio.c @@ -133,6 +133,8 @@ void reset_all_stats(struct thread_data *td) fio_gettime(&tv, NULL); memcpy(&td->epoch, &tv, sizeof(tv)); memcpy(&td->start, &tv, sizeof(tv)); + + lat_target_init(td); } void reset_fio_state(void) diff --git a/options.c b/options.c index d2493e45..b0f45098 100644 --- a/options.c +++ b/options.c @@ -1042,6 +1042,10 @@ struct opt_group *opt_group_from_mask(unsigned int *mask) } static struct opt_group fio_opt_cat_groups[] = { + { + .name = "Latency profiling", + .mask = FIO_OPT_G_LATPROF, + }, { .name = "Rate", .mask = FIO_OPT_G_RATE, @@ -2623,7 +2627,38 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .off1 = td_var_offset(max_latency), .help = "Maximum tolerated IO latency (usec)", .category = FIO_OPT_C_IO, - .group = FIO_OPT_G_RATE, + .group = FIO_OPT_G_LATPROF, + }, + { + .name = "latency_target", + .lname = "Latency Target (usec)", + .type = FIO_OPT_STR_VAL_TIME, + .off1 = td_var_offset(latency_target), + .help = "Ramp to max queue depth supporting this latency", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_LATPROF, + }, + { + .name = "latency_window", + .lname = "Latency Window (usec)", + .type = FIO_OPT_STR_VAL_TIME, + .off1 = td_var_offset(latency_window), + .help = "Time to sustain latency_target", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_LATPROF, + }, + { + .name = "latency_percentile", + .lname = "Latency Percentile", + .type = FIO_OPT_FLOAT_LIST, + .off1 = td_var_offset(latency_percentile), + .help = "Percentile of IOs must be below latency_target", + .def = "100", + .maxlen = 1, + .minfp = 0.0, + .maxfp = 100.0, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_LATPROF, }, { .name = 
"invalidate", diff --git a/options.h b/options.h index 3795afd5..fc363682 100644 --- a/options.h +++ b/options.h @@ -94,6 +94,7 @@ enum opt_category_group { __FIO_OPT_G_NETIO, __FIO_OPT_G_LIBAIO, __FIO_OPT_G_ACT, + __FIO_OPT_G_LATPROF, __FIO_OPT_G_NR, FIO_OPT_G_RATE = (1U << __FIO_OPT_G_RATE), @@ -122,6 +123,7 @@ enum opt_category_group { FIO_OPT_G_NETIO = (1U << __FIO_OPT_G_NETIO), FIO_OPT_G_LIBAIO = (1U << __FIO_OPT_G_LIBAIO), FIO_OPT_G_ACT = (1U << __FIO_OPT_G_ACT), + FIO_OPT_G_LATPROF = (1U << __FIO_OPT_G_LATPROF), FIO_OPT_G_INVALID = (1U << __FIO_OPT_G_NR), }; diff --git a/parse.c b/parse.c index e3f431ed..6141c91e 100644 --- a/parse.c +++ b/parse.c @@ -547,9 +547,11 @@ static int __handle_option(struct fio_option *o, const char *ptr, void *data, ** Initialize precision to 0 and zero out list ** in case specified list is shorter than default */ - ul2 = 0; - ilp = td_var(data, o->off2); - *ilp = ul2; + if (o->off2) { + ul2 = 0; + ilp = td_var(data, o->off2); + *ilp = ul2; + } flp = td_var(data, o->off1); for(i = 0; i < o->maxlen; i++) @@ -592,9 +594,11 @@ static int __handle_option(struct fio_option *o, const char *ptr, void *data, while (*++cp2 != '\0' && *cp2 >= '0' && *cp2 <= '9') len++; - ilp = td_var(data, o->off2); - if (len > *ilp) - *ilp = len; + if (o->off2) { + ilp = td_var(data, o->off2); + if (len > *ilp) + *ilp = len; + } } break; diff --git a/server.c b/server.c index 83245430..6e96b77a 100644 --- a/server.c +++ b/server.c @@ -1042,6 +1042,11 @@ void fio_server_send_ts(struct thread_stat *ts, struct group_run_stats *rs) p.ts.kb_base = cpu_to_le32(ts->kb_base); p.ts.unit_base = cpu_to_le32(ts->unit_base); + p.ts.latency_depth = cpu_to_le32(ts->latency_depth); + p.ts.latency_target = cpu_to_le64(ts->latency_target); + p.ts.latency_window = cpu_to_le64(ts->latency_window); + p.ts.latency_percentile.u.i = __cpu_to_le64(fio_double_to_uint64(ts->latency_percentile.u.f)); + convert_gs(&p.rs, rs); fio_net_send_cmd(server_fd, FIO_NET_CMD_TS, &p, sizeof(p), 
NULL, NULL); diff --git a/server.h b/server.h index 405370e3..84901cf6 100644 --- a/server.h +++ b/server.h @@ -38,7 +38,7 @@ struct fio_net_cmd_reply { }; enum { - FIO_SERVER_VER = 27, + FIO_SERVER_VER = 28, FIO_SERVER_MAX_FRAGMENT_PDU = 1024, diff --git a/stat.c b/stat.c index ac5ff162..5d50ae86 100644 --- a/stat.c +++ b/stat.c @@ -497,7 +497,6 @@ static void show_latencies(struct thread_stat *ts) show_lat_m(io_u_lat_m); } - void show_thread_status_normal(struct thread_stat *ts, struct group_run_stats *rs) { double usr_cpu, sys_cpu; @@ -587,6 +586,13 @@ void show_thread_status_normal(struct thread_stat *ts, struct group_run_stats *r ts->first_error, strerror(ts->first_error)); } + if (ts->latency_depth) { + log_info(" latency : target=%llu, window=%llu, percentile=%.2f%%, depth=%u\n", + (unsigned long long)ts->latency_target, + (unsigned long long)ts->latency_window, + ts->latency_percentile.u.f, + ts->latency_depth); + } } static void show_ddir_status_terse(struct thread_stat *ts, @@ -975,6 +981,13 @@ static struct json_object *show_thread_status_json(struct thread_stat *ts, json_object_add_value_int(root, "first_error", ts->first_error); } + if (ts->latency_depth) { + json_object_add_value_int(root, "latency_depth", ts->latency_depth); + json_object_add_value_int(root, "latency_target", ts->latency_target); + json_object_add_value_float(root, "latency_percentile", ts->latency_percentile.u.f); + json_object_add_value_int(root, "latency_window", ts->latency_window); + } + /* Additional output if description is set */ if (strlen(ts->description)) json_object_add_value_string(root, "desc", ts->description); @@ -1265,6 +1278,11 @@ static void __show_run_stats(void) } } + ts->latency_depth = td->latency_qd; + ts->latency_target = td->o.latency_target; + ts->latency_percentile = td->o.latency_percentile; + ts->latency_window = td->o.latency_window; + sum_thread_stats(ts, &td->ts, idx); } diff --git a/stat.h b/stat.h index 8190d1e6..7ad0c9df 100644 --- a/stat.h +++ 
b/stat.h @@ -175,6 +175,11 @@ struct thread_stat { uint32_t kb_base; uint32_t unit_base; + + uint32_t latency_depth; + uint64_t latency_target; + fio_fp64_t latency_percentile; + uint64_t latency_window; }; struct jobs_eta { diff --git a/thread_options.h b/thread_options.h index 44cbf91a..7476d8c2 100644 --- a/thread_options.h +++ b/thread_options.h @@ -240,6 +240,10 @@ struct thread_options { unsigned long long number_ios; unsigned int sync_file_range; + + unsigned long long latency_target; + unsigned long long latency_window; + fio_fp64_t latency_percentile; }; #define FIO_TOP_STR_MAX 256 @@ -446,6 +450,10 @@ struct thread_options_pack { uint64_t number_ios; uint32_t sync_file_range; + + uint64_t latency_target; + uint64_t latency_window; + fio_fp64_t latency_percentile; } __attribute__((packed)); extern void convert_thread_options_to_cpu(struct thread_options *o, struct thread_options_pack *top);