blk-iocost: make iocg_kick_waitq() call iocg_kick_delay() after paying debt
block/blk-iocost.c
1/* SPDX-License-Identifier: GPL-2.0
2 *
3 * IO cost model based controller.
4 *
5 * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
6 * Copyright (C) 2019 Andy Newell <newella@fb.com>
7 * Copyright (C) 2019 Facebook
8 *
 9 * One challenge of controlling IO resources is the lack of a trivially
10 * observable cost metric. This is distinguished from CPU and memory where
11 * wallclock time and the number of bytes can serve as accurate enough
12 * approximations.
13 *
14 * Bandwidth and iops are the most commonly used metrics for IO devices but
15 * depending on the type and specifics of the device, different IO patterns
16 * easily lead to multiple orders of magnitude variations rendering them
17 * useless for the purpose of IO capacity distribution. While on-device
 18 * time, with a lot of crutches, could serve as a useful approximation for
19 * non-queued rotational devices, this is no longer viable with modern
20 * devices, even the rotational ones.
21 *
22 * While there is no cost metric we can trivially observe, it isn't a
23 * complete mystery. For example, on a rotational device, seek cost
24 * dominates while a contiguous transfer contributes a smaller amount
25 * proportional to the size. If we can characterize at least the relative
26 * costs of these different types of IOs, it should be possible to
27 * implement a reasonable work-conserving proportional IO resource
28 * distribution.
29 *
30 * 1. IO Cost Model
31 *
32 * IO cost model estimates the cost of an IO given its basic parameters and
33 * history (e.g. the end sector of the last IO). The cost is measured in
34 * device time. If a given IO is estimated to cost 10ms, the device should
35 * be able to process ~100 of those IOs in a second.
36 *
37 * Currently, there's only one builtin cost model - linear. Each IO is
38 * classified as sequential or random and given a base cost accordingly.
39 * On top of that, a size cost proportional to the length of the IO is
40 * added. While simple, this model captures the operational
 41 * characteristics of a wide variety of devices well enough. Default
 42 * parameters for several different classes of devices are provided and the
43 * parameters can be configured from userspace via
44 * /sys/fs/cgroup/io.cost.model.
45 *
46 * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
47 * device-specific coefficients.
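 *
 * As a rough sketch (illustration only; the real calculation lives in
 * calc_vtime_cost_builtin() below), the linear model prices an IO as
 *
 *	cost = (is_seq ? coef_seqio : coef_randio) + pages * coef_page;
 *
 * where `is_seq` is shorthand for "the IO starts within 16M of the
 * previous one" and the coefficients are derived from the configured
 * bps/iops parameters by calc_lcoefs().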
48 *
49 * 2. Control Strategy
50 *
51 * The device virtual time (vtime) is used as the primary control metric.
52 * The control strategy is composed of the following three parts.
53 *
54 * 2-1. Vtime Distribution
55 *
56 * When a cgroup becomes active in terms of IOs, its hierarchical share is
57 * calculated. Please consider the following hierarchy where the numbers
58 * inside parentheses denote the configured weights.
59 *
60 * root
61 * / \
62 * A (w:100) B (w:300)
63 * / \
64 * A0 (w:100) A1 (w:100)
65 *
66 * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
67 * of equal weight, each gets 50% share. If then B starts issuing IOs, B
 68 * gets 300/(100+300) or 75% share, and A0 and A1 equally split the rest,
69 * 12.5% each. The distribution mechanism only cares about these flattened
70 * shares. They're called hweights (hierarchical weights) and always add
 71 * up to 1 (HWEIGHT_WHOLE).
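 *
 * For the hierarchy above, A0's hweight can be written as the product
 * of its share at each level (illustration only; current_hweight()
 * computes the same thing from the child_active/inuse sums):
 *
 *	hweight(A0) = HWEIGHT_WHOLE * 100 / (100 + 300)
 *				    * 100 / (100 + 100)
 *		    = HWEIGHT_WHOLE / 8		(12.5%)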
72 *
73 * A given cgroup's vtime runs slower in inverse proportion to its hweight.
74 * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
75 * against the device vtime - an IO which takes 10ms on the underlying
76 * device is considered to take 80ms on A0.
77 *
78 * This constitutes the basis of IO capacity distribution. Each cgroup's
79 * vtime is running at a rate determined by its hweight. A cgroup tracks
80 * the vtime consumed by past IOs and can issue a new IO iff doing so
81 * wouldn't outrun the current device vtime. Otherwise, the IO is
82 * suspended until the vtime has progressed enough to cover it.
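 *
 * In code form, the per-bio admission test is roughly (see
 * ioc_rqos_throttle() for the real thing):
 *
 *	if (time_before_eq64(vtime + cost, vnow))
 *		issue the bio now;
 *	else
 *		park on iocg->waitq until vnow catches up;
 *
 * where `cost` is the absolute cost scaled up by the inverse of the
 * cgroup's hierarchical inuse weight.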
83 *
84 * 2-2. Vrate Adjustment
85 *
86 * It's unrealistic to expect the cost model to be perfect. There are too
87 * many devices and even on the same device the overall performance
88 * fluctuates depending on numerous factors such as IO mixture and device
89 * internal garbage collection. The controller needs to adapt dynamically.
90 *
91 * This is achieved by adjusting the overall IO rate according to how busy
92 * the device is. If the device becomes overloaded, we're sending down too
93 * many IOs and should generally slow down. If there are waiting issuers
94 * but the device isn't saturated, we're issuing too few and should
95 * generally speed up.
96 *
97 * To slow down, we lower the vrate - the rate at which the device vtime
98 * passes compared to the wall clock. For example, if the vtime is running
99 * at the vrate of 75%, all cgroups added up would only be able to issue
100 * 750ms worth of IOs per second, and vice-versa for speeding up.
101 *
 102 * Device busyness is determined using two criteria - rq wait and
103 * completion latencies.
104 *
105 * When a device gets saturated, the on-device and then the request queues
106 * fill up and a bio which is ready to be issued has to wait for a request
107 * to become available. When this delay becomes noticeable, it's a clear
108 * indication that the device is saturated and we lower the vrate. This
109 * saturation signal is fairly conservative as it only triggers when both
110 * hardware and software queues are filled up, and is used as the default
111 * busy signal.
112 *
113 * As devices can have deep queues and be unfair in how the queued commands
 114 * are executed, solely depending on rq wait may not result in satisfactory
115 * control quality. For a better control quality, completion latency QoS
116 * parameters can be configured so that the device is considered saturated
117 * if N'th percentile completion latency rises above the set point.
118 *
119 * The completion latency requirements are a function of both the
120 * underlying device characteristics and the desired IO latency quality of
121 * service. There is an inherent trade-off - the tighter the latency QoS,
 122 * the higher the bandwidth loss. Latency QoS is disabled by default
123 * and can be set through /sys/fs/cgroup/io.cost.qos.
124 *
125 * 2-3. Work Conservation
126 *
127 * Imagine two cgroups A and B with equal weights. A is issuing a small IO
128 * periodically while B is sending out enough parallel IOs to saturate the
129 * device on its own. Let's say A's usage amounts to 100ms worth of IO
130 * cost per second, i.e., 10% of the device capacity. The naive
131 * distribution of half and half would lead to 60% utilization of the
132 * device, a significant reduction in the total amount of work done
133 * compared to free-for-all competition. This is too high a cost to pay
134 * for IO control.
135 *
136 * To conserve the total amount of work done, we keep track of how much
137 * each active cgroup is actually using and yield part of its weight if
138 * there are other cgroups which can make use of it. In the above case,
139 * A's weight will be lowered so that it hovers above the actual usage and
140 * B would be able to use the rest.
141 *
142 * As we don't want to penalize a cgroup for donating its weight, the
143 * surplus weight adjustment factors in a margin and has an immediate
144 * snapback mechanism in case the cgroup needs more IO vtime for itself.
145 *
146 * Note that adjusting down surplus weights has the same effects as
147 * accelerating vtime for other cgroups and work conservation can also be
148 * implemented by adjusting vrate dynamically. However, squaring who can
149 * donate and should take back how much requires hweight propagations
150 * anyway making it easier to implement and understand as a separate
151 * mechanism.
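 *
 * Numerically, the surplus test used below is approximately: an iocg
 * may donate if
 *
 *	usage * SURPLUS_SCALE_PCT / 100 + SURPLUS_SCALE_ABS
 *		+ SURPLUS_MIN_ADJ_DELTA <= hweight_inuse
 *
 * i.e. usage scaled up by 25% plus a 2% absolute margin must still
 * clear the current hierarchical inuse weight by at least 3% (see
 * surplus_adjusted_hweight_inuse()).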
152 *
153 * 3. Monitoring
154 *
155 * Instead of debugfs or other clumsy monitoring mechanisms, this
156 * controller uses a drgn based monitoring script -
157 * tools/cgroup/iocost_monitor.py. For details on drgn, please see
 158 * https://github.com/osandov/drgn. The output looks like the following.
159 *
160 * sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
161 * active weight hweight% inflt% dbt delay usages%
162 * test/a * 50/ 50 33.33/ 33.33 27.65 2 0*041 033:033:033
163 * test/b * 100/ 100 66.67/ 66.67 17.56 0 0*000 066:079:077
164 *
165 * - per : Timer period
166 * - cur_per : Internal wall and device vtime clock
167 * - vrate : Device virtual time rate against wall clock
168 * - weight : Surplus-adjusted and configured weights
169 * - hweight : Surplus-adjusted and configured hierarchical weights
170 * - inflt : The percentage of in-flight IO cost at the end of last period
171 * - del_ms : Deferred issuer delay induction level and duration
172 * - usages : Usage history
173 */
174
175#include <linux/kernel.h>
176#include <linux/module.h>
177#include <linux/timer.h>
178#include <linux/time64.h>
179#include <linux/parser.h>
180#include <linux/sched/signal.h>
181#include <linux/blk-cgroup.h>
182#include <asm/local.h>
183#include <asm/local64.h>
184#include "blk-rq-qos.h"
185#include "blk-stat.h"
186#include "blk-wbt.h"
187
188#ifdef CONFIG_TRACEPOINTS
189
190/* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
191#define TRACE_IOCG_PATH_LEN 1024
192static DEFINE_SPINLOCK(trace_iocg_path_lock);
193static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
194
195#define TRACE_IOCG_PATH(type, iocg, ...) \
196 do { \
197 unsigned long flags; \
198 if (trace_iocost_##type##_enabled()) { \
199 spin_lock_irqsave(&trace_iocg_path_lock, flags); \
200 cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \
201 trace_iocg_path, TRACE_IOCG_PATH_LEN); \
202 trace_iocost_##type(iocg, trace_iocg_path, \
203 ##__VA_ARGS__); \
204 spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \
205 } \
206 } while (0)
207
 208#else /* CONFIG_TRACEPOINTS */
 209#define TRACE_IOCG_PATH(type, iocg, ...) do { } while (0)
 210#endif /* CONFIG_TRACEPOINTS */
211
212enum {
213 MILLION = 1000000,
214
215 /* timer period is calculated from latency requirements, bound it */
216 MIN_PERIOD = USEC_PER_MSEC,
217 MAX_PERIOD = USEC_PER_SEC,
218
219 /*
220 * A cgroup's vtime can run 50% behind the device vtime, which
221 * serves as its IO credit buffer. Surplus weight adjustment is
222 * immediately canceled if the vtime margin runs below 10%.
223 */
224 MARGIN_PCT = 50,
225 INUSE_MARGIN_PCT = 10,
226
227 /* Have some play in waitq timer operations */
228 WAITQ_TIMER_MARGIN_PCT = 5,
229
230 /*
231 * vtime can wrap well within a reasonable uptime when vrate is
232 * consistently raised. Don't trust recorded cgroup vtime if the
233 * period counter indicates that it's older than 5mins.
234 */
235 VTIME_VALID_DUR = 300 * USEC_PER_SEC,
236
237 /*
238 * Remember the past three non-zero usages and use the max for
239 * surplus calculation. Three slots guarantee that we remember one
240 * full period usage from the last active stretch even after
241 * partial deactivation and re-activation periods. Don't start
242 * giving away weight before collecting two data points to prevent
243 * hweight adjustments based on one partial activation period.
244 */
245 NR_USAGE_SLOTS = 3,
246 MIN_VALID_USAGES = 2,
247
248 /* 1/64k is granular enough and can easily be handled w/ u32 */
249 HWEIGHT_WHOLE = 1 << 16,
250
251 /*
252 * As vtime is used to calculate the cost of each IO, it needs to
253 * be fairly high precision. For example, it should be able to
254 * represent the cost of a single page worth of discard with
 255 * sufficient accuracy. At the same time, it should be able to
256 * represent reasonably long enough durations to be useful and
257 * convenient during operation.
258 *
259 * 1s worth of vtime is 2^37. This gives us both sub-nanosecond
260 * granularity and days of wrap-around time even at extreme vrates.
261 */
262 VTIME_PER_SEC_SHIFT = 37,
263 VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT,
264 VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC,
 265 VTIME_PER_NSEC = VTIME_PER_SEC / NSEC_PER_SEC,
266
267 /* bound vrate adjustments within two orders of magnitude */
268 VRATE_MIN_PPM = 10000, /* 1% */
269 VRATE_MAX_PPM = 100000000, /* 10000% */
270
271 VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
272 VRATE_CLAMP_ADJ_PCT = 4,
273
274 /* if IOs end up waiting for requests, issue less */
275 RQ_WAIT_BUSY_PCT = 5,
276
 277 /* unbusy hysteresis */
278 UNBUSY_THR_PCT = 75,
279
280 /* don't let cmds which take a very long time pin lagging for too long */
281 MAX_LAGGING_PERIODS = 10,
282
283 /*
284 * If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
285 * donate the surplus.
286 */
287 SURPLUS_SCALE_PCT = 125, /* * 125% */
288 SURPLUS_SCALE_ABS = HWEIGHT_WHOLE / 50, /* + 2% */
289 SURPLUS_MIN_ADJ_DELTA = HWEIGHT_WHOLE / 33, /* 3% */
290
291 /* switch iff the conditions are met for longer than this */
292 AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,
293
294 /*
295 * Count IO size in 4k pages. The 12bit shift helps keeping
296 * size-proportional components of cost calculation in closer
297 * numbers of digits to per-IO cost components.
298 */
299 IOC_PAGE_SHIFT = 12,
300 IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT,
301 IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT,
302
303 /* if apart further than 16M, consider randio for linear model */
304 LCOEF_RANDIO_PAGES = 4096,
305};
306
307enum ioc_running {
308 IOC_IDLE,
309 IOC_RUNNING,
310 IOC_STOP,
311};
312
313/* io.cost.qos controls including per-dev enable of the whole controller */
314enum {
315 QOS_ENABLE,
316 QOS_CTRL,
317 NR_QOS_CTRL_PARAMS,
318};
319
320/* io.cost.qos params */
321enum {
322 QOS_RPPM,
323 QOS_RLAT,
324 QOS_WPPM,
325 QOS_WLAT,
326 QOS_MIN,
327 QOS_MAX,
328 NR_QOS_PARAMS,
329};
330
331/* io.cost.model controls */
332enum {
333 COST_CTRL,
334 COST_MODEL,
335 NR_COST_CTRL_PARAMS,
336};
337
338/* builtin linear cost model coefficients */
339enum {
340 I_LCOEF_RBPS,
341 I_LCOEF_RSEQIOPS,
342 I_LCOEF_RRANDIOPS,
343 I_LCOEF_WBPS,
344 I_LCOEF_WSEQIOPS,
345 I_LCOEF_WRANDIOPS,
346 NR_I_LCOEFS,
347};
348
349enum {
350 LCOEF_RPAGE,
351 LCOEF_RSEQIO,
352 LCOEF_RRANDIO,
353 LCOEF_WPAGE,
354 LCOEF_WSEQIO,
355 LCOEF_WRANDIO,
356 NR_LCOEFS,
357};
358
359enum {
360 AUTOP_INVALID,
361 AUTOP_HDD,
362 AUTOP_SSD_QD1,
363 AUTOP_SSD_DFL,
364 AUTOP_SSD_FAST,
365};
366
367struct ioc_gq;
368
369struct ioc_params {
370 u32 qos[NR_QOS_PARAMS];
371 u64 i_lcoefs[NR_I_LCOEFS];
372 u64 lcoefs[NR_LCOEFS];
373 u32 too_fast_vrate_pct;
374 u32 too_slow_vrate_pct;
375};
376
377struct ioc_missed {
378 local_t nr_met;
379 local_t nr_missed;
380 u32 last_met;
381 u32 last_missed;
382};
383
384struct ioc_pcpu_stat {
385 struct ioc_missed missed[2];
386
 387 local64_t rq_wait_ns;
388 u64 last_rq_wait_ns;
389};
390
391/* per device */
392struct ioc {
393 struct rq_qos rqos;
394
395 bool enabled;
396
397 struct ioc_params params;
398 u32 period_us;
399 u32 margin_us;
400 u64 vrate_min;
401 u64 vrate_max;
402
403 spinlock_t lock;
404 struct timer_list timer;
405 struct list_head active_iocgs; /* active cgroups */
406 struct ioc_pcpu_stat __percpu *pcpu_stat;
407
408 enum ioc_running running;
409 atomic64_t vtime_rate;
410
 411 seqcount_spinlock_t period_seqcount;
412 u32 period_at; /* wallclock starttime */
413 u64 period_at_vtime; /* vtime starttime */
414
415 atomic64_t cur_period; /* inc'd each period */
416 int busy_level; /* saturation history */
417
418 u64 inuse_margin_vtime;
419 bool weights_updated;
420 atomic_t hweight_gen; /* for lazy hweights */
421
422 u64 autop_too_fast_at;
423 u64 autop_too_slow_at;
424 int autop_idx;
425 bool user_qos_params:1;
426 bool user_cost_model:1;
427};
428
429/* per device-cgroup pair */
430struct ioc_gq {
431 struct blkg_policy_data pd;
432 struct ioc *ioc;
433
434 /*
 435 * An iocg can get its weight from two sources - an explicit
436 * per-device-cgroup configuration or the default weight of the
437 * cgroup. `cfg_weight` is the explicit per-device-cgroup
438 * configuration. `weight` is the effective considering both
439 * sources.
440 *
441 * When an idle cgroup becomes active its `active` goes from 0 to
442 * `weight`. `inuse` is the surplus adjusted active weight.
443 * `active` and `inuse` are used to calculate `hweight_active` and
444 * `hweight_inuse`.
445 *
446 * `last_inuse` remembers `inuse` while an iocg is idle to persist
447 * surplus adjustments.
448 */
449 u32 cfg_weight;
450 u32 weight;
451 u32 active;
452 u32 inuse;
453 u32 last_inuse;
454
455 sector_t cursor; /* to detect randio */
456
457 /*
458 * `vtime` is this iocg's vtime cursor which progresses as IOs are
459 * issued. If lagging behind device vtime, the delta represents
 460 * the currently available IO budget. If running ahead, the
461 * overage.
462 *
463 * `vtime_done` is the same but progressed on completion rather
464 * than issue. The delta behind `vtime` represents the cost of
465 * currently in-flight IOs.
466 *
467 * `last_vtime` is used to remember `vtime` at the end of the last
468 * period to calculate utilization.
469 */
470 atomic64_t vtime;
471 atomic64_t done_vtime;
 472 u64 abs_vdebt;
473 u64 last_vtime;
474
475 /*
476 * The period this iocg was last active in. Used for deactivation
477 * and invalidating `vtime`.
478 */
479 atomic64_t active_period;
480 struct list_head active_list;
481
 482 /* see __propagate_weights() and current_hweight() for details */
483 u64 child_active_sum;
484 u64 child_inuse_sum;
485 int hweight_gen;
486 u32 hweight_active;
487 u32 hweight_inuse;
488 bool has_surplus;
489
490 struct wait_queue_head waitq;
491 struct hrtimer waitq_timer;
492 struct hrtimer delay_timer;
493
494 /* usage is recorded as fractions of HWEIGHT_WHOLE */
495 int usage_idx;
496 u32 usages[NR_USAGE_SLOTS];
497
498 /* this iocg's depth in the hierarchy and ancestors including self */
499 int level;
500 struct ioc_gq *ancestors[];
501};
502
503/* per cgroup */
504struct ioc_cgrp {
505 struct blkcg_policy_data cpd;
506 unsigned int dfl_weight;
507};
508
509struct ioc_now {
510 u64 now_ns;
511 u32 now;
512 u64 vnow;
513 u64 vrate;
514};
515
516struct iocg_wait {
517 struct wait_queue_entry wait;
518 struct bio *bio;
519 u64 abs_cost;
520 bool committed;
521};
522
523struct iocg_wake_ctx {
524 struct ioc_gq *iocg;
525 u32 hw_inuse;
526 s64 vbudget;
527};
528
529static const struct ioc_params autop[] = {
530 [AUTOP_HDD] = {
531 .qos = {
532 [QOS_RLAT] = 250000, /* 250ms */
533 [QOS_WLAT] = 250000,
534 [QOS_MIN] = VRATE_MIN_PPM,
535 [QOS_MAX] = VRATE_MAX_PPM,
536 },
537 .i_lcoefs = {
538 [I_LCOEF_RBPS] = 174019176,
539 [I_LCOEF_RSEQIOPS] = 41708,
540 [I_LCOEF_RRANDIOPS] = 370,
541 [I_LCOEF_WBPS] = 178075866,
542 [I_LCOEF_WSEQIOPS] = 42705,
543 [I_LCOEF_WRANDIOPS] = 378,
544 },
545 },
546 [AUTOP_SSD_QD1] = {
547 .qos = {
548 [QOS_RLAT] = 25000, /* 25ms */
549 [QOS_WLAT] = 25000,
550 [QOS_MIN] = VRATE_MIN_PPM,
551 [QOS_MAX] = VRATE_MAX_PPM,
552 },
553 .i_lcoefs = {
554 [I_LCOEF_RBPS] = 245855193,
555 [I_LCOEF_RSEQIOPS] = 61575,
556 [I_LCOEF_RRANDIOPS] = 6946,
557 [I_LCOEF_WBPS] = 141365009,
558 [I_LCOEF_WSEQIOPS] = 33716,
559 [I_LCOEF_WRANDIOPS] = 26796,
560 },
561 },
562 [AUTOP_SSD_DFL] = {
563 .qos = {
564 [QOS_RLAT] = 25000, /* 25ms */
565 [QOS_WLAT] = 25000,
566 [QOS_MIN] = VRATE_MIN_PPM,
567 [QOS_MAX] = VRATE_MAX_PPM,
568 },
569 .i_lcoefs = {
570 [I_LCOEF_RBPS] = 488636629,
571 [I_LCOEF_RSEQIOPS] = 8932,
572 [I_LCOEF_RRANDIOPS] = 8518,
573 [I_LCOEF_WBPS] = 427891549,
574 [I_LCOEF_WSEQIOPS] = 28755,
575 [I_LCOEF_WRANDIOPS] = 21940,
576 },
577 .too_fast_vrate_pct = 500,
578 },
579 [AUTOP_SSD_FAST] = {
580 .qos = {
581 [QOS_RLAT] = 5000, /* 5ms */
582 [QOS_WLAT] = 5000,
583 [QOS_MIN] = VRATE_MIN_PPM,
584 [QOS_MAX] = VRATE_MAX_PPM,
585 },
586 .i_lcoefs = {
587 [I_LCOEF_RBPS] = 3102524156LLU,
588 [I_LCOEF_RSEQIOPS] = 724816,
589 [I_LCOEF_RRANDIOPS] = 778122,
590 [I_LCOEF_WBPS] = 1742780862LLU,
591 [I_LCOEF_WSEQIOPS] = 425702,
592 [I_LCOEF_WRANDIOPS] = 443193,
593 },
594 .too_slow_vrate_pct = 10,
595 },
596};
597
598/*
599 * vrate adjust percentages indexed by ioc->busy_level. We adjust up on
600 * vtime credit shortage and down on device saturation.
601 */
602static u32 vrate_adj_pct[] =
603 { 0, 0, 0, 0,
604 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
605 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
606 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
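/*
 * Example (illustration only): at busy_level +40 the index clamps to 40,
 * which maps to a 4% adjustment, so the next period runs at
 * vrate * (100 - 4) / 100. Negative busy_levels scale the other way,
 * vrate * (100 + adj_pct) / 100, always bounded by vrate_min/vrate_max.
 */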
607
608static struct blkcg_policy blkcg_policy_iocost;
609
610/* accessors and helpers */
611static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
612{
613 return container_of(rqos, struct ioc, rqos);
614}
615
616static struct ioc *q_to_ioc(struct request_queue *q)
617{
618 return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
619}
620
621static const char *q_name(struct request_queue *q)
622{
623 if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
624 return kobject_name(q->kobj.parent);
625 else
626 return "<unknown>";
627}
628
629static const char __maybe_unused *ioc_name(struct ioc *ioc)
630{
631 return q_name(ioc->rqos.q);
632}
633
634static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
635{
636 return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
637}
638
639static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
640{
641 return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
642}
643
644static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
645{
646 return pd_to_blkg(&iocg->pd);
647}
648
649static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
650{
651 return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
652 struct ioc_cgrp, cpd);
653}
654
655/*
656 * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical
 657 * weight, the more expensive each IO. Must round up.
658 */
659static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
660{
661 return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse);
662}
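/*
 * Example (illustration only): an iocg whose hweight_inuse is a quarter
 * of HWEIGHT_WHOLE pays four times the absolute cost in vtime,
 * abs_cost_to_cost(v, HWEIGHT_WHOLE / 4) == 4 * v, which is what makes
 * a smaller share burn through its budget proportionally faster.
 */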
663
664/*
665 * The inverse of abs_cost_to_cost(). Must round up.
666 */
667static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
668{
669 return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE);
670}
671
672static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost)
673{
674 bio->bi_iocost_cost = cost;
675 atomic64_add(cost, &iocg->vtime);
676}
677
678#define CREATE_TRACE_POINTS
679#include <trace/events/iocost.h>
680
 681/* latency QoS params changed, update period_us and all the dependent params */
682static void ioc_refresh_period_us(struct ioc *ioc)
683{
684 u32 ppm, lat, multi, period_us;
685
686 lockdep_assert_held(&ioc->lock);
687
688 /* pick the higher latency target */
689 if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
690 ppm = ioc->params.qos[QOS_RPPM];
691 lat = ioc->params.qos[QOS_RLAT];
692 } else {
693 ppm = ioc->params.qos[QOS_WPPM];
694 lat = ioc->params.qos[QOS_WLAT];
695 }
696
697 /*
698 * We want the period to be long enough to contain a healthy number
699 * of IOs while short enough for granular control. Define it as a
700 * multiple of the latency target. Ideally, the multiplier should
701 * be scaled according to the percentile so that it would nominally
702 * contain a certain number of requests. Let's be simpler and
703 * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
704 */
705 if (ppm)
706 multi = max_t(u32, (MILLION - ppm) / 50000, 2);
707 else
708 multi = 2;
709 period_us = multi * lat;
710 period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
711
712 /* calculate dependent params */
713 ioc->period_us = period_us;
714 ioc->margin_us = period_us * MARGIN_PCT / 100;
715 ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
716 period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100);
717}
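/*
 * Example (illustration only): with a 25ms read latency target at the
 * 95th percentile (ppm == 950000), multi = max((MILLION - 950000) / 50000, 2)
 * == 2 and the period becomes 50ms; at the 50th percentile the same
 * target would yield 10 * 25ms == 250ms, still within [MIN_PERIOD, MAX_PERIOD].
 */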
718
719static int ioc_autop_idx(struct ioc *ioc)
720{
721 int idx = ioc->autop_idx;
722 const struct ioc_params *p = &autop[idx];
723 u32 vrate_pct;
724 u64 now_ns;
725
726 /* rotational? */
727 if (!blk_queue_nonrot(ioc->rqos.q))
728 return AUTOP_HDD;
729
730 /* handle SATA SSDs w/ broken NCQ */
731 if (blk_queue_depth(ioc->rqos.q) == 1)
732 return AUTOP_SSD_QD1;
733
734 /* use one of the normal ssd sets */
735 if (idx < AUTOP_SSD_DFL)
736 return AUTOP_SSD_DFL;
737
738 /* if user is overriding anything, maintain what was there */
739 if (ioc->user_qos_params || ioc->user_cost_model)
740 return idx;
741
742 /* step up/down based on the vrate */
743 vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
744 VTIME_PER_USEC);
745 now_ns = ktime_get_ns();
746
747 if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
748 if (!ioc->autop_too_fast_at)
749 ioc->autop_too_fast_at = now_ns;
750 if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
751 return idx + 1;
752 } else {
753 ioc->autop_too_fast_at = 0;
754 }
755
756 if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
757 if (!ioc->autop_too_slow_at)
758 ioc->autop_too_slow_at = now_ns;
759 if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
760 return idx - 1;
761 } else {
762 ioc->autop_too_slow_at = 0;
763 }
764
765 return idx;
766}
767
768/*
 769 * Take the following as input
770 *
771 * @bps maximum sequential throughput
772 * @seqiops maximum sequential 4k iops
773 * @randiops maximum random 4k iops
774 *
775 * and calculate the linear model cost coefficients.
776 *
777 * *@page per-page cost 1s / (@bps / 4096)
778 * *@seqio base cost of a seq IO max((1s / @seqiops) - *@page, 0)
 779 * *@randio base cost of a rand IO max((1s / @randiops) - *@page, 0)
780 */
781static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
782 u64 *page, u64 *seqio, u64 *randio)
783{
784 u64 v;
785
786 *page = *seqio = *randio = 0;
787
788 if (bps)
789 *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
790 DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
791
792 if (seqiops) {
793 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
794 if (v > *page)
795 *seqio = v - *page;
796 }
797
798 if (randiops) {
799 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
800 if (v > *page)
801 *randio = v - *page;
802 }
803}
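/*
 * Worked example (illustration only): with @bps at 400MB/s the device
 * moves roughly 97k 4k pages per second, so *@page comes out to about
 * VTIME_PER_SEC / 97656, i.e. ~10us worth of device time per page. The
 * seq/rand coefficients then cover whatever per-IO time remains after
 * subtracting that size-proportional component.
 */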
804
805static void ioc_refresh_lcoefs(struct ioc *ioc)
806{
807 u64 *u = ioc->params.i_lcoefs;
808 u64 *c = ioc->params.lcoefs;
809
810 calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
811 &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
812 calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
813 &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
814}
815
816static bool ioc_refresh_params(struct ioc *ioc, bool force)
817{
818 const struct ioc_params *p;
819 int idx;
820
821 lockdep_assert_held(&ioc->lock);
822
823 idx = ioc_autop_idx(ioc);
824 p = &autop[idx];
825
826 if (idx == ioc->autop_idx && !force)
827 return false;
828
829 if (idx != ioc->autop_idx)
830 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
831
832 ioc->autop_idx = idx;
833 ioc->autop_too_fast_at = 0;
834 ioc->autop_too_slow_at = 0;
835
836 if (!ioc->user_qos_params)
837 memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
838 if (!ioc->user_cost_model)
839 memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
840
841 ioc_refresh_period_us(ioc);
842 ioc_refresh_lcoefs(ioc);
843
844 ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
845 VTIME_PER_USEC, MILLION);
846 ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
847 VTIME_PER_USEC, MILLION);
848
849 return true;
850}
851
852/* take a snapshot of the current [v]time and vrate */
853static void ioc_now(struct ioc *ioc, struct ioc_now *now)
854{
855 unsigned seq;
856
857 now->now_ns = ktime_get();
858 now->now = ktime_to_us(now->now_ns);
859 now->vrate = atomic64_read(&ioc->vtime_rate);
860
861 /*
862 * The current vtime is
863 *
864 * vtime at period start + (wallclock time since the start) * vrate
865 *
866 * As a consistent snapshot of `period_at_vtime` and `period_at` is
867 * needed, they're seqcount protected.
868 */
869 do {
870 seq = read_seqcount_begin(&ioc->period_seqcount);
871 now->vnow = ioc->period_at_vtime +
872 (now->now - ioc->period_at) * now->vrate;
873 } while (read_seqcount_retry(&ioc->period_seqcount, seq));
874}
875
876static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
877{
878 WARN_ON_ONCE(ioc->running != IOC_RUNNING);
879
880 write_seqcount_begin(&ioc->period_seqcount);
881 ioc->period_at = now->now;
882 ioc->period_at_vtime = now->vnow;
883 write_seqcount_end(&ioc->period_seqcount);
884
885 ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
886 add_timer(&ioc->timer);
887}
888
889/*
890 * Update @iocg's `active` and `inuse` to @active and @inuse, update level
891 * weight sums and propagate upwards accordingly.
892 */
 893static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse)
894{
895 struct ioc *ioc = iocg->ioc;
896 int lvl;
897
898 lockdep_assert_held(&ioc->lock);
899
900 inuse = clamp_t(u32, inuse, 1, active);
901
902 if (active == iocg->active && inuse == iocg->inuse)
903 return;
904
905 for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
906 struct ioc_gq *parent = iocg->ancestors[lvl];
907 struct ioc_gq *child = iocg->ancestors[lvl + 1];
908 u32 parent_active = 0, parent_inuse = 0;
909
910 /* update the level sums */
911 parent->child_active_sum += (s32)(active - child->active);
912 parent->child_inuse_sum += (s32)(inuse - child->inuse);
 913 /* apply the updates */
914 child->active = active;
915 child->inuse = inuse;
916
917 /*
918 * The delta between inuse and active sums indicates that
919 * that much of weight is being given away. Parent's inuse
920 * and active should reflect the ratio.
921 */
922 if (parent->child_active_sum) {
923 parent_active = parent->weight;
924 parent_inuse = DIV64_U64_ROUND_UP(
925 parent_active * parent->child_inuse_sum,
926 parent->child_active_sum);
927 }
928
929 /* do we need to keep walking up? */
930 if (parent_active == parent->active &&
931 parent_inuse == parent->inuse)
932 break;
933
934 active = parent_active;
935 inuse = parent_inuse;
936 }
937
938 ioc->weights_updated = true;
939}
940
 941static void commit_weights(struct ioc *ioc)
942{
943 lockdep_assert_held(&ioc->lock);
944
945 if (ioc->weights_updated) {
946 /* paired with rmb in current_hweight(), see there */
947 smp_wmb();
948 atomic_inc(&ioc->hweight_gen);
949 ioc->weights_updated = false;
950 }
951}
952
 953static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse)
 954{
955 __propagate_weights(iocg, active, inuse);
956 commit_weights(iocg->ioc);
957}
958
959static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
960{
961 struct ioc *ioc = iocg->ioc;
962 int lvl;
963 u32 hwa, hwi;
964 int ioc_gen;
965
966 /* hot path - if uptodate, use cached */
967 ioc_gen = atomic_read(&ioc->hweight_gen);
968 if (ioc_gen == iocg->hweight_gen)
969 goto out;
970
971 /*
972 * Paired with wmb in commit_weights(). If we saw the updated
973 * hweight_gen, all the weight updates from __propagate_weights() are
974 * visible too.
975 *
976 * We can race with weight updates during calculation and get it
977 * wrong. However, hweight_gen would have changed and a future
978 * reader will recalculate and we're guaranteed to discard the
979 * wrong result soon.
980 */
981 smp_rmb();
982
983 hwa = hwi = HWEIGHT_WHOLE;
984 for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
985 struct ioc_gq *parent = iocg->ancestors[lvl];
986 struct ioc_gq *child = iocg->ancestors[lvl + 1];
987 u32 active_sum = READ_ONCE(parent->child_active_sum);
988 u32 inuse_sum = READ_ONCE(parent->child_inuse_sum);
989 u32 active = READ_ONCE(child->active);
990 u32 inuse = READ_ONCE(child->inuse);
991
992 /* we can race with deactivations and either may read as zero */
993 if (!active_sum || !inuse_sum)
994 continue;
995
996 active_sum = max(active, active_sum);
997 hwa = hwa * active / active_sum; /* max 16bits * 10000 */
998
999 inuse_sum = max(inuse, inuse_sum);
1000 hwi = hwi * inuse / inuse_sum; /* max 16bits * 10000 */
1001 }
1002
1003 iocg->hweight_active = max_t(u32, hwa, 1);
1004 iocg->hweight_inuse = max_t(u32, hwi, 1);
1005 iocg->hweight_gen = ioc_gen;
1006out:
1007 if (hw_activep)
1008 *hw_activep = iocg->hweight_active;
1009 if (hw_inusep)
1010 *hw_inusep = iocg->hweight_inuse;
1011}
1012
1013static void weight_updated(struct ioc_gq *iocg)
1014{
1015 struct ioc *ioc = iocg->ioc;
1016 struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1017 struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1018 u32 weight;
1019
1020 lockdep_assert_held(&ioc->lock);
1021
1022 weight = iocg->cfg_weight ?: iocc->dfl_weight;
1023 if (weight != iocg->weight && iocg->active)
 1024 propagate_weights(iocg, weight,
1025 DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight));
1026 iocg->weight = weight;
1027}
1028
1029static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1030{
1031 struct ioc *ioc = iocg->ioc;
1032 u64 last_period, cur_period, max_period_delta;
1033 u64 vtime, vmargin, vmin;
1034 int i;
1035
1036 /*
 1037 * If we seem to be already active, just update the stamp to tell the
 1038 * timer that we're still active. We don't mind occasional races.
1039 */
1040 if (!list_empty(&iocg->active_list)) {
1041 ioc_now(ioc, now);
1042 cur_period = atomic64_read(&ioc->cur_period);
1043 if (atomic64_read(&iocg->active_period) != cur_period)
1044 atomic64_set(&iocg->active_period, cur_period);
1045 return true;
1046 }
1047
1048 /* racy check on internal node IOs, treat as root level IOs */
1049 if (iocg->child_active_sum)
1050 return false;
1051
1052 spin_lock_irq(&ioc->lock);
1053
1054 ioc_now(ioc, now);
1055
1056 /* update period */
1057 cur_period = atomic64_read(&ioc->cur_period);
1058 last_period = atomic64_read(&iocg->active_period);
1059 atomic64_set(&iocg->active_period, cur_period);
1060
1061 /* already activated or breaking leaf-only constraint? */
1062 if (!list_empty(&iocg->active_list))
1063 goto succeed_unlock;
1064 for (i = iocg->level - 1; i > 0; i--)
1065 if (!list_empty(&iocg->ancestors[i]->active_list))
 1066 goto fail_unlock;
 1067
1068 if (iocg->child_active_sum)
1069 goto fail_unlock;
1070
1071 /*
1072 * vtime may wrap when vrate is raised substantially due to
1073 * underestimated IO costs. Look at the period and ignore its
1074 * vtime if the iocg has been idle for too long. Also, cap the
1075 * budget it can start with to the margin.
1076 */
1077 max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
1078 vtime = atomic64_read(&iocg->vtime);
1079 vmargin = ioc->margin_us * now->vrate;
1080 vmin = now->vnow - vmargin;
1081
1082 if (last_period + max_period_delta < cur_period ||
1083 time_before64(vtime, vmin)) {
1084 atomic64_add(vmin - vtime, &iocg->vtime);
1085 atomic64_add(vmin - vtime, &iocg->done_vtime);
1086 vtime = vmin;
1087 }
1088
1089 /*
1090 * Activate, propagate weight and start period timer if not
1091 * running. Reset hweight_gen to avoid accidental match from
1092 * wrapping.
1093 */
1094 iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1095 list_add(&iocg->active_list, &ioc->active_iocgs);
1096 propagate_weights(iocg, iocg->weight,
1097 iocg->last_inuse ?: iocg->weight);
1098
1099 TRACE_IOCG_PATH(iocg_activate, iocg, now,
1100 last_period, cur_period, vtime);
1101
1102 iocg->last_vtime = vtime;
1103
1104 if (ioc->running == IOC_IDLE) {
1105 ioc->running = IOC_RUNNING;
1106 ioc_start_period(ioc, now);
1107 }
1108
 1109succeed_unlock:
1110 spin_unlock_irq(&ioc->lock);
1111 return true;
1112
1113fail_unlock:
1114 spin_unlock_irq(&ioc->lock);
1115 return false;
1116}
1117
1118static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
1119{
1120 struct ioc *ioc = iocg->ioc;
1121 struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1122 u64 vtime = atomic64_read(&iocg->vtime);
1123 u64 vmargin = ioc->margin_us * now->vrate;
1124 u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
1125 u64 delta_ns, expires, oexpires;
1126 u32 hw_inuse;
1127
1128 lockdep_assert_held(&iocg->waitq.lock);
1129
1130 /* debt-adjust vtime */
1131 current_hweight(iocg, NULL, &hw_inuse);
1132 vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1133
1134 /*
1135 * Clear or maintain depending on the overage. Non-zero vdebt is what
1136 * guarantees that @iocg is online and future iocg_kick_delay() will
1137 * clear use_delay. Don't leave it on when there's no vdebt.
1138 */
1139 if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) {
1140 blkcg_clear_delay(blkg);
1141 return false;
1142 }
1143 if (!atomic_read(&blkg->use_delay) &&
1144 time_before_eq64(vtime, now->vnow + vmargin))
1145 return false;
1146
1147 /* use delay */
1148 delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow,
1149 now->vrate) * NSEC_PER_USEC;
1150 blkcg_set_delay(blkg, delta_ns);
1151 expires = now->now_ns + delta_ns;
1152
1153 /* if already active and close enough, don't bother */
1154 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
1155 if (hrtimer_is_queued(&iocg->delay_timer) &&
1156 abs(oexpires - expires) <= margin_ns / 4)
1157 return true;
1158
1159 hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
1160 margin_ns / 4, HRTIMER_MODE_ABS);
1161 return true;
1162}
1163
1164static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
1165{
1166 struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
1167 struct ioc_now now;
1168 unsigned long flags;
1169
1170 spin_lock_irqsave(&iocg->waitq.lock, flags);
1171 ioc_now(iocg->ioc, &now);
1172 iocg_kick_delay(iocg, &now);
1173 spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1174
1175 return HRTIMER_NORESTART;
1176}
1177
1178static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1179 int flags, void *key)
1180{
1181 struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1182 struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1183 u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1184
1185 ctx->vbudget -= cost;
1186
1187 if (ctx->vbudget < 0)
1188 return -1;
1189
1190 iocg_commit_bio(ctx->iocg, wait->bio, cost);
1191
1192 /*
1193 * autoremove_wake_function() removes the wait entry only when it
1194 * actually changed the task state. We want the wait always
1195 * removed. Remove explicitly and use default_wake_function().
1196 */
1197 list_del_init(&wq_entry->entry);
1198 wait->committed = true;
1199
1200 default_wake_function(wq_entry, mode, flags, key);
1201 return 0;
1202}
1203
1204static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
1205{
1206 struct ioc *ioc = iocg->ioc;
1207 struct iocg_wake_ctx ctx = { .iocg = iocg };
1208 u64 margin_ns = (u64)(ioc->period_us *
1209 WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
 1210 u64 vdebt, vshortage, expires, oexpires;
1211 s64 vbudget;
1212 u32 hw_inuse;
1213
1214 lockdep_assert_held(&iocg->waitq.lock);
1215
1216 current_hweight(iocg, NULL, &hw_inuse);
1217 vbudget = now->vnow - atomic64_read(&iocg->vtime);
1218
1219 /* pay off debt */
 1220 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1221 if (vdebt && vbudget > 0) {
1222 u64 delta = min_t(u64, vbudget, vdebt);
1223 u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
 1224 iocg->abs_vdebt);
1225
1226 atomic64_add(delta, &iocg->vtime);
1227 atomic64_add(delta, &iocg->done_vtime);
 1228 iocg->abs_vdebt -= abs_delta;
1229
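		/*
		 * abs_vdebt just changed; re-run the delay logic so that
		 * blkcg use_delay tracks the remaining debt (this is the
		 * point of the commit named above).
		 */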
1230 iocg_kick_delay(iocg, now);
1231 }
1232
1233 /*
1234 * Wake up the ones which are due and see how much vtime we'll need
1235 * for the next one.
1236 */
1237 ctx.hw_inuse = hw_inuse;
1238 ctx.vbudget = vbudget - vdebt;
1239 __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1240 if (!waitqueue_active(&iocg->waitq))
1241 return;
1242 if (WARN_ON_ONCE(ctx.vbudget >= 0))
1243 return;
1244
1245 /* determine next wakeup, add a quarter margin to guarantee chunking */
1246 vshortage = -ctx.vbudget;
1247 expires = now->now_ns +
1248 DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
1249 expires += margin_ns / 4;
1250
1251 /* if already active and close enough, don't bother */
1252 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1253 if (hrtimer_is_queued(&iocg->waitq_timer) &&
1254 abs(oexpires - expires) <= margin_ns / 4)
1255 return;
1256
1257 hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1258 margin_ns / 4, HRTIMER_MODE_ABS);
1259}
1260
1261static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1262{
1263 struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1264 struct ioc_now now;
1265 unsigned long flags;
1266
1267 ioc_now(iocg->ioc, &now);
1268
1269 spin_lock_irqsave(&iocg->waitq.lock, flags);
1270 iocg_kick_waitq(iocg, &now);
1271 spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1272
1273 return HRTIMER_NORESTART;
1274}
1275
1276static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1277{
1278 u32 nr_met[2] = { };
1279 u32 nr_missed[2] = { };
1280 u64 rq_wait_ns = 0;
1281 int cpu, rw;
1282
1283 for_each_online_cpu(cpu) {
1284 struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1285 u64 this_rq_wait_ns;
1286
1287 for (rw = READ; rw <= WRITE; rw++) {
1288 u32 this_met = local_read(&stat->missed[rw].nr_met);
1289 u32 this_missed = local_read(&stat->missed[rw].nr_missed);
1290
1291 nr_met[rw] += this_met - stat->missed[rw].last_met;
1292 nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1293 stat->missed[rw].last_met = this_met;
1294 stat->missed[rw].last_missed = this_missed;
1295 }
1296
 1297 this_rq_wait_ns = local64_read(&stat->rq_wait_ns);
1298 rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1299 stat->last_rq_wait_ns = this_rq_wait_ns;
1300 }
1301
1302 for (rw = READ; rw <= WRITE; rw++) {
1303 if (nr_met[rw] + nr_missed[rw])
1304 missed_ppm_ar[rw] =
1305 DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1306 nr_met[rw] + nr_missed[rw]);
1307 else
1308 missed_ppm_ar[rw] = 0;
1309 }
1310
1311 *rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1312 ioc->period_us * NSEC_PER_USEC);
1313}
1314
1315/* was iocg idle this period? */
1316static bool iocg_is_idle(struct ioc_gq *iocg)
1317{
1318 struct ioc *ioc = iocg->ioc;
1319
1320 /* did something get issued this period? */
1321 if (atomic64_read(&iocg->active_period) ==
1322 atomic64_read(&ioc->cur_period))
1323 return false;
1324
1325 /* is something in flight? */
 1326 if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
1327 return false;
1328
1329 return true;
1330}
1331
1332/* returns usage with margin added if surplus is large enough */
1333static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse)
1334{
1335 /* add margin */
1336 usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
1337 usage += SURPLUS_SCALE_ABS;
1338
1339 /* don't bother if the surplus is too small */
1340 if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse)
1341 return 0;
1342
1343 return usage;
1344}
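/*
 * Example (illustration only, in percentage terms): an iocg using 20%
 * with hweight_inuse at 40% gets 20 * 1.25 + 2 == 27 back, so its inuse
 * can be shrunk until it settles around 27%; above 28% usage the
 * remaining gap would be under 3% and 0 is returned instead.
 */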
1345
1346static void ioc_timer_fn(struct timer_list *timer)
1347{
1348 struct ioc *ioc = container_of(timer, struct ioc, timer);
1349 struct ioc_gq *iocg, *tiocg;
1350 struct ioc_now now;
1351 int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0;
1352 u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
1353 u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
1354 u32 missed_ppm[2], rq_wait_pct;
1355 u64 period_vtime;
 1356 int prev_busy_level, i;
1357
1358 /* how were the latencies during the period? */
1359 ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
1360
1361 /* take care of active iocgs */
1362 spin_lock_irq(&ioc->lock);
1363
1364 ioc_now(ioc, &now);
1365
1366 period_vtime = now.vnow - ioc->period_at_vtime;
1367 if (WARN_ON_ONCE(!period_vtime)) {
1368 spin_unlock_irq(&ioc->lock);
1369 return;
1370 }
1371
1372 /*
1373 * Waiters determine the sleep durations based on the vrate they
1374 * saw at the time of sleep. If vrate has increased, some waiters
1375 * could be sleeping for too long. Wake up tardy waiters which
1376 * should have woken up in the last period and expire idle iocgs.
1377 */
1378 list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
 1379 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
 1380 !iocg_is_idle(iocg))
1381 continue;
1382
1383 spin_lock(&iocg->waitq.lock);
1384
 1385 if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
1386 /* might be oversleeping vtime / hweight changes, kick */
1387 iocg_kick_waitq(iocg, &now);
1388 } else if (iocg_is_idle(iocg)) {
1389 /* no waiter and idle, deactivate */
1390 iocg->last_inuse = iocg->inuse;
 1391 __propagate_weights(iocg, 0, 0);
1392 list_del_init(&iocg->active_list);
1393 }
1394
1395 spin_unlock(&iocg->waitq.lock);
1396 }
 1397 commit_weights(ioc);
1398
1399 /* calc usages and see whether some weights need to be moved around */
1400 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1401 u64 vdone, vtime, vusage, vmargin, vmin;
1402 u32 hw_active, hw_inuse, usage;
1403
1404 /*
1405 * Collect unused and wind vtime closer to vnow to prevent
1406 * iocgs from accumulating a large amount of budget.
1407 */
1408 vdone = atomic64_read(&iocg->done_vtime);
1409 vtime = atomic64_read(&iocg->vtime);
1410 current_hweight(iocg, &hw_active, &hw_inuse);
1411
1412 /*
1413 * Latency QoS detection doesn't account for IOs which are
1414 * in-flight for longer than a period. Detect them by
1415 * comparing vdone against period start. If lagging behind
1416 * IOs from past periods, don't increase vrate.
1417 */
1418 if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
1419 !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
1420 time_after64(vtime, vdone) &&
1421 time_after64(vtime, now.vnow -
1422 MAX_LAGGING_PERIODS * period_vtime) &&
1423 time_before64(vdone, now.vnow - period_vtime))
1424 nr_lagging++;
1425
1426 if (waitqueue_active(&iocg->waitq))
1427 vusage = now.vnow - iocg->last_vtime;
1428 else if (time_before64(iocg->last_vtime, vtime))
1429 vusage = vtime - iocg->last_vtime;
1430 else
1431 vusage = 0;
1432
1433 iocg->last_vtime += vusage;
1434 /*
1435 * Factor in in-flight vtime into vusage to avoid
1436 * high-latency completions appearing as idle. This should
 1437 * be done after the above ->last_vtime adjustment.
1438 */
1439 vusage = max(vusage, vtime - vdone);
1440
1441 /* calculate hweight based usage ratio and record */
1442 if (vusage) {
1443 usage = DIV64_U64_ROUND_UP(vusage * hw_inuse,
1444 period_vtime);
1445 iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
1446 iocg->usages[iocg->usage_idx] = usage;
1447 } else {
1448 usage = 0;
1449 }
1450
1451 /* see whether there's surplus vtime */
1452 vmargin = ioc->margin_us * now.vrate;
1453 vmin = now.vnow - vmargin;
1454
1455 iocg->has_surplus = false;
1456
1457 if (!waitqueue_active(&iocg->waitq) &&
1458 time_before64(vtime, vmin)) {
1459 u64 delta = vmin - vtime;
1460
1461 /* throw away surplus vtime */
1462 atomic64_add(delta, &iocg->vtime);
1463 atomic64_add(delta, &iocg->done_vtime);
1464 iocg->last_vtime += delta;
1465 /* if usage is sufficiently low, maybe it can donate */
1466 if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) {
1467 iocg->has_surplus = true;
1468 nr_surpluses++;
1469 }
1470 } else if (hw_inuse < hw_active) {
1471 u32 new_hwi, new_inuse;
1472
1473 /* was donating but might need to take back some */
1474 if (waitqueue_active(&iocg->waitq)) {
1475 new_hwi = hw_active;
1476 } else {
1477 new_hwi = max(hw_inuse,
1478 usage * SURPLUS_SCALE_PCT / 100 +
1479 SURPLUS_SCALE_ABS);
1480 }
1481
1482 new_inuse = div64_u64((u64)iocg->inuse * new_hwi,
1483 hw_inuse);
1484 new_inuse = clamp_t(u32, new_inuse, 1, iocg->active);
1485
1486 if (new_inuse > iocg->inuse) {
1487 TRACE_IOCG_PATH(inuse_takeback, iocg, &now,
1488 iocg->inuse, new_inuse,
1489 hw_inuse, new_hwi);
1490 __propagate_weights(iocg, iocg->weight,
1491 new_inuse);
1492 }
1493 } else {
 1494 /* genuinely out of vtime */
1495 nr_shortages++;
1496 }
1497 }
1498
1499 if (!nr_shortages || !nr_surpluses)
1500 goto skip_surplus_transfers;
1501
1502 /* there are both shortages and surpluses, transfer surpluses */
1503 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1504 u32 usage, hw_active, hw_inuse, new_hwi, new_inuse;
1505 int nr_valid = 0;
1506
1507 if (!iocg->has_surplus)
1508 continue;
1509
1510 /* base the decision on max historical usage */
1511 for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) {
1512 if (iocg->usages[i]) {
1513 usage = max(usage, iocg->usages[i]);
1514 nr_valid++;
1515 }
1516 }
1517 if (nr_valid < MIN_VALID_USAGES)
1518 continue;
1519
1520 current_hweight(iocg, &hw_active, &hw_inuse);
1521 new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse);
1522 if (!new_hwi)
1523 continue;
1524
1525 new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
1526 hw_inuse);
1527 if (new_inuse < iocg->inuse) {
1528 TRACE_IOCG_PATH(inuse_giveaway, iocg, &now,
1529 iocg->inuse, new_inuse,
1530 hw_inuse, new_hwi);
 1531 __propagate_weights(iocg, iocg->weight, new_inuse);
1532 }
1533 }
1534skip_surplus_transfers:
 1535 commit_weights(ioc);
1536
1537 /*
1538 * If q is getting clogged or we're missing too much, we're issuing
1539 * too much IO and should lower vtime rate. If we're not missing
1540 * and experiencing shortages but not surpluses, we're too stingy
1541 * and should increase vtime rate.
1542 */
 1543 prev_busy_level = ioc->busy_level;
1544 if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
1545 missed_ppm[READ] > ppm_rthr ||
1546 missed_ppm[WRITE] > ppm_wthr) {
 1547 /* clearly missing QoS targets, slow down vrate */
1548 ioc->busy_level = max(ioc->busy_level, 0);
1549 ioc->busy_level++;
 1550 } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
1551 missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
1552 missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
1553 /* QoS targets are being met with >25% margin */
1554 if (nr_shortages) {
1555 /*
1556 * We're throttling while the device has spare
1557 * capacity. If vrate was being slowed down, stop.
1558 */
 1559 ioc->busy_level = min(ioc->busy_level, 0);
1560
1561 /*
1562 * If there are IOs spanning multiple periods, wait
1563 * them out before pushing the device harder. If
1564 * there are surpluses, let redistribution work it
1565 * out first.
1566 */
1567 if (!nr_lagging && !nr_surpluses)
 1568 ioc->busy_level--;
1569 } else {
1570 /*
1571 * Nobody is being throttled and the users aren't
1572 * issuing enough IOs to saturate the device. We
1573 * simply don't know how close the device is to
1574 * saturation. Coast.
1575 */
1576 ioc->busy_level = 0;
 1577 }
 1578 } else {
 1579 /* inside the hysteresis margin, we're good */
1580 ioc->busy_level = 0;
1581 }
1582
1583 ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
1584
 1585 if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
1586 u64 vrate = atomic64_read(&ioc->vtime_rate);
1587 u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
1588
1589 /* rq_wait signal is always reliable, ignore user vrate_min */
1590 if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
1591 vrate_min = VRATE_MIN;
1592
1593 /*
1594 * If vrate is out of bounds, apply clamp gradually as the
1595 * bounds can change abruptly. Otherwise, apply busy_level
1596 * based adjustment.
1597 */
1598 if (vrate < vrate_min) {
1599 vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
1600 100);
1601 vrate = min(vrate, vrate_min);
1602 } else if (vrate > vrate_max) {
1603 vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
1604 100);
1605 vrate = max(vrate, vrate_max);
1606 } else {
1607 int idx = min_t(int, abs(ioc->busy_level),
1608 ARRAY_SIZE(vrate_adj_pct) - 1);
1609 u32 adj_pct = vrate_adj_pct[idx];
1610
1611 if (ioc->busy_level > 0)
1612 adj_pct = 100 - adj_pct;
1613 else
1614 adj_pct = 100 + adj_pct;
1615
1616 vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1617 vrate_min, vrate_max);
1618 }
1619
 1620 trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
1621 nr_lagging, nr_shortages,
1622 nr_surpluses);
1623
1624 atomic64_set(&ioc->vtime_rate, vrate);
1625 ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
1626 ioc->period_us * vrate * INUSE_MARGIN_PCT, 100);
1627 } else if (ioc->busy_level != prev_busy_level || nr_lagging) {
1628 trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
 1629 missed_ppm, rq_wait_pct, nr_lagging,
 1630 nr_shortages, nr_surpluses);
1631 }
1632
1633 ioc_refresh_params(ioc, false);
1634
1635 /*
1636 * This period is done. Move onto the next one. If nothing's
1637 * going on with the device, stop the timer.
1638 */
1639 atomic64_inc(&ioc->cur_period);
1640
1641 if (ioc->running != IOC_STOP) {
1642 if (!list_empty(&ioc->active_iocgs)) {
1643 ioc_start_period(ioc, &now);
1644 } else {
1645 ioc->busy_level = 0;
1646 ioc->running = IOC_IDLE;
1647 }
1648 }
1649
1650 spin_unlock_irq(&ioc->lock);
1651}
1652
1653static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
1654 bool is_merge, u64 *costp)
1655{
1656 struct ioc *ioc = iocg->ioc;
1657 u64 coef_seqio, coef_randio, coef_page;
1658 u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
1659 u64 seek_pages = 0;
1660 u64 cost = 0;
1661
1662 switch (bio_op(bio)) {
1663 case REQ_OP_READ:
1664 coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO];
1665 coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO];
1666 coef_page = ioc->params.lcoefs[LCOEF_RPAGE];
1667 break;
1668 case REQ_OP_WRITE:
1669 coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO];
1670 coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO];
1671 coef_page = ioc->params.lcoefs[LCOEF_WPAGE];
1672 break;
1673 default:
1674 goto out;
1675 }
1676
1677 if (iocg->cursor) {
1678 seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
1679 seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
1680 }
1681
1682 if (!is_merge) {
1683 if (seek_pages > LCOEF_RANDIO_PAGES) {
1684 cost += coef_randio;
1685 } else {
1686 cost += coef_seqio;
1687 }
1688 }
1689 cost += pages * coef_page;
1690out:
1691 *costp = cost;
1692}
1693
1694static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
1695{
1696 u64 cost;
1697
1698 calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
1699 return cost;
1700}
1701
1702static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
1703 u64 *costp)
1704{
1705 unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
1706
1707 switch (req_op(rq)) {
1708 case REQ_OP_READ:
1709 *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
1710 break;
1711 case REQ_OP_WRITE:
1712 *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
1713 break;
1714 default:
1715 *costp = 0;
1716 }
1717}
1718
1719static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
1720{
1721 u64 cost;
1722
1723 calc_size_vtime_cost_builtin(rq, ioc, &cost);
1724 return cost;
1725}
1726
1727static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
1728{
1729 struct blkcg_gq *blkg = bio->bi_blkg;
1730 struct ioc *ioc = rqos_to_ioc(rqos);
1731 struct ioc_gq *iocg = blkg_to_iocg(blkg);
1732 struct ioc_now now;
1733 struct iocg_wait wait;
1734 u32 hw_active, hw_inuse;
1735 u64 abs_cost, cost, vtime;
1736
1737 /* bypass IOs if disabled or for root cgroup */
1738 if (!ioc->enabled || !iocg->level)
1739 return;
1740
1741 /* always activate so that even 0 cost IOs get protected to some level */
1742 if (!iocg_activate(iocg, &now))
1743 return;
1744
1745 /* calculate the absolute vtime cost */
1746 abs_cost = calc_vtime_cost(bio, iocg, false);
1747 if (!abs_cost)
1748 return;
1749
1750 iocg->cursor = bio_end_sector(bio);
1751
1752 vtime = atomic64_read(&iocg->vtime);
1753 current_hweight(iocg, &hw_active, &hw_inuse);
1754
1755 if (hw_inuse < hw_active &&
1756 time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) {
1757 TRACE_IOCG_PATH(inuse_reset, iocg, &now,
1758 iocg->inuse, iocg->weight, hw_inuse, hw_active);
1759 spin_lock_irq(&ioc->lock);
1760 propagate_weights(iocg, iocg->weight, iocg->weight);
1761 spin_unlock_irq(&ioc->lock);
1762 current_hweight(iocg, &hw_active, &hw_inuse);
1763 }
1764
1765 cost = abs_cost_to_cost(abs_cost, hw_inuse);
1766
1767 /*
1768 * If no one's waiting and within budget, issue right away. The
1769 * tests are racy but the races aren't systemic - we only miss once
1770 * in a while which is fine.
1771 */
1772 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
1773 time_before_eq64(vtime + cost, now.vnow)) {
1774 iocg_commit_bio(iocg, bio, cost);
1775 return;
1776 }
1777
1778 /*
1779 * We activated above but w/o any synchronization. Deactivation is
1780 * synchronized with waitq.lock and we won't get deactivated as long
1781 * as we're waiting or have debt, so we're good if we're activated
1782 * here. In the unlikely case that we aren't, just issue the IO.
1783 */
1784 spin_lock_irq(&iocg->waitq.lock);
1785
1786 if (unlikely(list_empty(&iocg->active_list))) {
1787 spin_unlock_irq(&iocg->waitq.lock);
1788 iocg_commit_bio(iocg, bio, cost);
1789 return;
1790 }
1791
1792 /*
1793 * We're over budget. If @bio has to be issued regardless, remember
1794 * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
1795 * off the debt before waking more IOs.
1796 *
1797 * This way, the debt is continuously paid off each period with the
1798 * actual budget available to the cgroup. If we just wound vtime, we
1799 * would incorrectly use the current hw_inuse for the entire amount
1800 * which, for example, can lead to the cgroup staying blocked for a
1801 * long time even with substantially raised hw_inuse.
1802 *
1803 * An iocg with vdebt should stay online so that the timer can keep
1804 * deducting its vdebt and [de]activate the use_delay mechanism
1805 * accordingly. We don't want to race against the timer trying to
1806 * clear them and leave @iocg inactive w/ dangling use_delay heavily
1807 * penalizing the cgroup and its descendants.
1808 */
1809 if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
1810 iocg->abs_vdebt += abs_cost;
1811 if (iocg_kick_delay(iocg, &now))
1812 blkcg_schedule_throttle(rqos->q,
1813 (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
1814 spin_unlock_irq(&iocg->waitq.lock);
1815 return;
1816 }
1817
1818 /*
1819 * Append self to the waitq and schedule the wakeup timer if we're
1820 * the first waiter. The timer duration is calculated based on the
1821 * current vrate. vtime and hweight changes can make it too short
1822 * or too long. Each wait entry is a custom iocg_wait which records
1823 * the absolute cost it's waiting for, allowing re-evaluation on wakeup.
1824 *
1825 * If too short, the timer simply reschedules itself. If too long,
1826 * the period timer will notice and trigger wakeups.
1827 *
1828 * All waiters are on iocg->waitq and the wait states are
1829 * synchronized using waitq.lock.
1830 */
1831 init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
1832 wait.wait.private = current;
1833 wait.bio = bio;
1834 wait.abs_cost = abs_cost;
1835 wait.committed = false; /* will be set true by waker */
1836
1837 __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
1838 iocg_kick_waitq(iocg, &now);
1839
1840 spin_unlock_irq(&iocg->waitq.lock);
1841
1842 while (true) {
1843 set_current_state(TASK_UNINTERRUPTIBLE);
1844 if (wait.committed)
1845 break;
1846 io_schedule();
1847 }
1848
1849 /* waker already committed us, proceed */
1850 finish_wait(&iocg->waitq, &wait.wait);
1851}
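/*
 * Editorial summary of the slow path above (descriptive only, no new
 * mechanism): the task queues a custom iocg_wait entry carrying
 * abs_cost, kicks the waitq once, then sleeps in TASK_UNINTERRUPTIBLE.
 * iocg_kick_waitq() pays down abs_vdebt first and then wakes waiters in
 * order; iocg_wake_fn() commits each waiter's bio while budget remains
 * and sets wait.committed, which lets the loop above exit and issue.
 */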
1852
1853static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
1854 struct bio *bio)
1855{
1856 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1857 struct ioc *ioc = iocg->ioc;
1858 sector_t bio_end = bio_end_sector(bio);
1859 struct ioc_now now;
1860 u32 hw_inuse;
1861 u64 abs_cost, cost;
1862 unsigned long flags;
1863
1864 /* bypass if disabled or for root cgroup */
1865 if (!ioc->enabled || !iocg->level)
1866 return;
1867
1868 abs_cost = calc_vtime_cost(bio, iocg, true);
1869 if (!abs_cost)
1870 return;
1871
1872 ioc_now(ioc, &now);
1873 current_hweight(iocg, NULL, &hw_inuse);
1874 cost = abs_cost_to_cost(abs_cost, hw_inuse);
1875
1876 /* update cursor if backmerging into the request at the cursor */
1877 if (blk_rq_pos(rq) < bio_end &&
1878 blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
1879 iocg->cursor = bio_end;
1880
1881 /*
1882 * Charge if there's enough vtime budget and the existing request has
1883 * cost assigned.
1884 */
1885 if (rq->bio && rq->bio->bi_iocost_cost &&
1886 time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
1887 iocg_commit_bio(iocg, bio, cost);
1888 return;
1889 }
1890
1891 /*
1892 * Otherwise, account it as debt if @iocg is online, which it should
1893 * be for the vast majority of cases. See debt handling in
1894 * ioc_rqos_throttle() for details.
1895 */
1896 spin_lock_irqsave(&iocg->waitq.lock, flags);
1897 if (likely(!list_empty(&iocg->active_list))) {
1898 iocg->abs_vdebt += abs_cost;
1899 iocg_kick_delay(iocg, &now);
1900 } else {
1901 iocg_commit_bio(iocg, bio, cost);
1902 }
1903 spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1904}
1905
1906static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
1907{
1908 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1909
1910 if (iocg && bio->bi_iocost_cost)
1911 atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
1912}
1913
1914static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
1915{
1916 struct ioc *ioc = rqos_to_ioc(rqos);
1917 struct ioc_pcpu_stat *ccs;
1918 u64 on_q_ns, rq_wait_ns, size_nsec;
1919 int pidx, rw;
1920
1921 if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
1922 return;
1923
1924 switch (req_op(rq) & REQ_OP_MASK) {
1925 case REQ_OP_READ:
1926 pidx = QOS_RLAT;
1927 rw = READ;
1928 break;
1929 case REQ_OP_WRITE:
1930 pidx = QOS_WLAT;
1931 rw = WRITE;
1932 break;
1933 default:
1934 return;
1935 }
1936
1937 on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
1938 rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
1939 size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
1940
1941 ccs = get_cpu_ptr(ioc->pcpu_stat);
1942
1943 if (on_q_ns <= size_nsec ||
1944 on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
1945 local_inc(&ccs->missed[rw].nr_met);
1946 else
1947 local_inc(&ccs->missed[rw].nr_missed);
1948
1949 local64_add(rq_wait_ns, &ccs->rq_wait_ns);
1950
1951 put_cpu_ptr(ccs);
1952}
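/*
 * Example of the met/missed test above (numbers are made up): with
 * qos[QOS_RLAT] = 5000 (5ms expressed in usecs), a read that spent
 * on_q_ns = 7ms on the queue with size_nsec (expected transfer time)
 * of 3ms counts as met, since 7ms - 3ms = 4ms <= 5ms. The same read
 * with size_nsec = 1ms would be recorded as missed (6ms > 5ms).
 */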
1953
1954static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
1955{
1956 struct ioc *ioc = rqos_to_ioc(rqos);
1957
1958 spin_lock_irq(&ioc->lock);
1959 ioc_refresh_params(ioc, false);
1960 spin_unlock_irq(&ioc->lock);
1961}
1962
1963static void ioc_rqos_exit(struct rq_qos *rqos)
1964{
1965 struct ioc *ioc = rqos_to_ioc(rqos);
1966
1967 blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
1968
1969 spin_lock_irq(&ioc->lock);
1970 ioc->running = IOC_STOP;
1971 spin_unlock_irq(&ioc->lock);
1972
1973 del_timer_sync(&ioc->timer);
1974 free_percpu(ioc->pcpu_stat);
1975 kfree(ioc);
1976}
1977
1978static struct rq_qos_ops ioc_rqos_ops = {
1979 .throttle = ioc_rqos_throttle,
1980 .merge = ioc_rqos_merge,
1981 .done_bio = ioc_rqos_done_bio,
1982 .done = ioc_rqos_done,
1983 .queue_depth_changed = ioc_rqos_queue_depth_changed,
1984 .exit = ioc_rqos_exit,
1985};
1986
1987static int blk_iocost_init(struct request_queue *q)
1988{
1989 struct ioc *ioc;
1990 struct rq_qos *rqos;
1991 int i, cpu, ret;
1992
1993 ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
1994 if (!ioc)
1995 return -ENOMEM;
1996
1997 ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
1998 if (!ioc->pcpu_stat) {
1999 kfree(ioc);
2000 return -ENOMEM;
2001 }
2002
2003 for_each_possible_cpu(cpu) {
2004 struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu);
2005
2006 for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) {
2007 local_set(&ccs->missed[i].nr_met, 0);
2008 local_set(&ccs->missed[i].nr_missed, 0);
2009 }
2010 local64_set(&ccs->rq_wait_ns, 0);
2011 }
2012
2013 rqos = &ioc->rqos;
2014 rqos->id = RQ_QOS_COST;
2015 rqos->ops = &ioc_rqos_ops;
2016 rqos->q = q;
2017
2018 spin_lock_init(&ioc->lock);
2019 timer_setup(&ioc->timer, ioc_timer_fn, 0);
2020 INIT_LIST_HEAD(&ioc->active_iocgs);
2021
2022 ioc->running = IOC_IDLE;
2023 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
2024 seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
2025 ioc->period_at = ktime_to_us(ktime_get());
2026 atomic64_set(&ioc->cur_period, 0);
2027 atomic_set(&ioc->hweight_gen, 0);
2028
2029 spin_lock_irq(&ioc->lock);
2030 ioc->autop_idx = AUTOP_INVALID;
2031 ioc_refresh_params(ioc, true);
2032 spin_unlock_irq(&ioc->lock);
2033
2034 rq_qos_add(q, rqos);
2035 ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
2036 if (ret) {
2037 rq_qos_del(q, rqos);
2038 free_percpu(ioc->pcpu_stat);
2039 kfree(ioc);
2040 return ret;
2041 }
2042 return 0;
2043}
2044
2045static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
2046{
2047 struct ioc_cgrp *iocc;
2048
2049 iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
2050 if (!iocc)
2051 return NULL;
2052
2053 iocc->dfl_weight = CGROUP_WEIGHT_DFL;
2054 return &iocc->cpd;
2055}
2056
2057static void ioc_cpd_free(struct blkcg_policy_data *cpd)
2058{
2059 kfree(container_of(cpd, struct ioc_cgrp, cpd));
2060}
2061
2062static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
2063 struct blkcg *blkcg)
2064{
2065 int levels = blkcg->css.cgroup->level + 1;
2066 struct ioc_gq *iocg;
2067
2068 iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node);
2069 if (!iocg)
2070 return NULL;
2071
2072 return &iocg->pd;
2073}
2074
2075static void ioc_pd_init(struct blkg_policy_data *pd)
2076{
2077 struct ioc_gq *iocg = pd_to_iocg(pd);
2078 struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
2079 struct ioc *ioc = q_to_ioc(blkg->q);
2080 struct ioc_now now;
2081 struct blkcg_gq *tblkg;
2082 unsigned long flags;
2083
2084 ioc_now(ioc, &now);
2085
2086 iocg->ioc = ioc;
2087 atomic64_set(&iocg->vtime, now.vnow);
2088 atomic64_set(&iocg->done_vtime, now.vnow);
2089 atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2090 INIT_LIST_HEAD(&iocg->active_list);
2091 iocg->hweight_active = HWEIGHT_WHOLE;
2092 iocg->hweight_inuse = HWEIGHT_WHOLE;
2093
2094 init_waitqueue_head(&iocg->waitq);
2095 hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2096 iocg->waitq_timer.function = iocg_waitq_timer_fn;
2097 hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2098 iocg->delay_timer.function = iocg_delay_timer_fn;
2099
2100 iocg->level = blkg->blkcg->css.cgroup->level;
2101
2102 for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
2103 struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
2104 iocg->ancestors[tiocg->level] = tiocg;
2105 }
2106
2107 spin_lock_irqsave(&ioc->lock, flags);
2108 weight_updated(iocg);
2109 spin_unlock_irqrestore(&ioc->lock, flags);
2110}
2111
2112static void ioc_pd_free(struct blkg_policy_data *pd)
2113{
2114 struct ioc_gq *iocg = pd_to_iocg(pd);
2115 struct ioc *ioc = iocg->ioc;
2116 unsigned long flags;
2117
2118 if (ioc) {
2119 spin_lock_irqsave(&ioc->lock, flags);
2120 if (!list_empty(&iocg->active_list)) {
2121 propagate_weights(iocg, 0, 0);
2122 list_del_init(&iocg->active_list);
2123 }
2124 spin_unlock_irqrestore(&ioc->lock, flags);
2125
2126 hrtimer_cancel(&iocg->waitq_timer);
2127 hrtimer_cancel(&iocg->delay_timer);
2128 }
2129 kfree(iocg);
2130}
2131
2132static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2133 int off)
2134{
2135 const char *dname = blkg_dev_name(pd->blkg);
2136 struct ioc_gq *iocg = pd_to_iocg(pd);
2137
2138 if (dname && iocg->cfg_weight)
2139 seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight);
2140 return 0;
2141}
2142
2143
2144static int ioc_weight_show(struct seq_file *sf, void *v)
2145{
2146 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2147 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2148
2149 seq_printf(sf, "default %u\n", iocc->dfl_weight);
2150 blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
2151 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2152 return 0;
2153}
2154
2155static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
2156 size_t nbytes, loff_t off)
2157{
2158 struct blkcg *blkcg = css_to_blkcg(of_css(of));
2159 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2160 struct blkg_conf_ctx ctx;
2161 struct ioc_gq *iocg;
2162 u32 v;
2163 int ret;
2164
2165 if (!strchr(buf, ':')) {
2166 struct blkcg_gq *blkg;
2167
2168 if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
2169 return -EINVAL;
2170
2171 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2172 return -EINVAL;
2173
2174 spin_lock(&blkcg->lock);
2175 iocc->dfl_weight = v;
2176 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
2177 struct ioc_gq *iocg = blkg_to_iocg(blkg);
2178
2179 if (iocg) {
2180 spin_lock_irq(&iocg->ioc->lock);
2181 weight_updated(iocg);
2182 spin_unlock_irq(&iocg->ioc->lock);
2183 }
2184 }
2185 spin_unlock(&blkcg->lock);
2186
2187 return nbytes;
2188 }
2189
2190 ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
2191 if (ret)
2192 return ret;
2193
2194 iocg = blkg_to_iocg(ctx.blkg);
2195
2196 if (!strncmp(ctx.body, "default", 7)) {
2197 v = 0;
2198 } else {
2199 if (!sscanf(ctx.body, "%u", &v))
2200 goto einval;
2201 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2202 goto einval;
2203 }
2204
2205 spin_lock(&iocg->ioc->lock);
2206 iocg->cfg_weight = v;
2207 weight_updated(iocg);
2208 spin_unlock(&iocg->ioc->lock);
2209
2210 blkg_conf_finish(&ctx);
2211 return nbytes;
2212
2213einval:
2214 blkg_conf_finish(&ctx);
2215 return -EINVAL;
2216}
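/*
 * Example io.weight writes accepted by the parser above (the "8:16"
 * device number is a placeholder; write to the cgroup's io.weight file
 * under /sys/fs/cgroup):
 *
 *	echo "default 150" > io.weight		# set the cgroup default weight
 *	echo "100" > io.weight			# same thing, shorthand form
 *	echo "8:16 50" > io.weight		# per-device override
 *	echo "8:16 default" > io.weight		# drop the per-device override
 */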
2217
2218static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2219 int off)
2220{
2221 const char *dname = blkg_dev_name(pd->blkg);
2222 struct ioc *ioc = pd_to_iocg(pd)->ioc;
2223
2224 if (!dname)
2225 return 0;
2226
2227 seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
2228 dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
2229 ioc->params.qos[QOS_RPPM] / 10000,
2230 ioc->params.qos[QOS_RPPM] % 10000 / 100,
2231 ioc->params.qos[QOS_RLAT],
2232 ioc->params.qos[QOS_WPPM] / 10000,
2233 ioc->params.qos[QOS_WPPM] % 10000 / 100,
2234 ioc->params.qos[QOS_WLAT],
2235 ioc->params.qos[QOS_MIN] / 10000,
2236 ioc->params.qos[QOS_MIN] % 10000 / 100,
2237 ioc->params.qos[QOS_MAX] / 10000,
2238 ioc->params.qos[QOS_MAX] % 10000 / 100);
2239 return 0;
2240}
2241
2242static int ioc_qos_show(struct seq_file *sf, void *v)
2243{
2244 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2245
2246 blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
2247 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2248 return 0;
2249}
2250
2251static const match_table_t qos_ctrl_tokens = {
2252 { QOS_ENABLE, "enable=%u" },
2253 { QOS_CTRL, "ctrl=%s" },
2254 { NR_QOS_CTRL_PARAMS, NULL },
2255};
2256
2257static const match_table_t qos_tokens = {
2258 { QOS_RPPM, "rpct=%s" },
2259 { QOS_RLAT, "rlat=%u" },
2260 { QOS_WPPM, "wpct=%s" },
2261 { QOS_WLAT, "wlat=%u" },
2262 { QOS_MIN, "min=%s" },
2263 { QOS_MAX, "max=%s" },
2264 { NR_QOS_PARAMS, NULL },
2265};
2266
2267static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
2268 size_t nbytes, loff_t off)
2269{
2270 struct gendisk *disk;
2271 struct ioc *ioc;
2272 u32 qos[NR_QOS_PARAMS];
2273 bool enable, user;
2274 char *p;
2275 int ret;
2276
2277 disk = blkcg_conf_get_disk(&input);
2278 if (IS_ERR(disk))
2279 return PTR_ERR(disk);
2280
2281 ioc = q_to_ioc(disk->queue);
2282 if (!ioc) {
2283 ret = blk_iocost_init(disk->queue);
2284 if (ret)
2285 goto err;
2286 ioc = q_to_ioc(disk->queue);
2287 }
2288
2289 spin_lock_irq(&ioc->lock);
2290 memcpy(qos, ioc->params.qos, sizeof(qos));
2291 enable = ioc->enabled;
2292 user = ioc->user_qos_params;
2293 spin_unlock_irq(&ioc->lock);
2294
2295 while ((p = strsep(&input, " \t\n"))) {
2296 substring_t args[MAX_OPT_ARGS];
2297 char buf[32];
2298 int tok;
2299 s64 v;
2300
2301 if (!*p)
2302 continue;
2303
2304 switch (match_token(p, qos_ctrl_tokens, args)) {
2305 case QOS_ENABLE:
2306 match_u64(&args[0], &v);
2307 enable = v;
2308 continue;
2309 case QOS_CTRL:
2310 match_strlcpy(buf, &args[0], sizeof(buf));
2311 if (!strcmp(buf, "auto"))
2312 user = false;
2313 else if (!strcmp(buf, "user"))
2314 user = true;
2315 else
2316 goto einval;
2317 continue;
2318 }
2319
2320 tok = match_token(p, qos_tokens, args);
2321 switch (tok) {
2322 case QOS_RPPM:
2323 case QOS_WPPM:
2324 if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2325 sizeof(buf))
2326 goto einval;
2327 if (cgroup_parse_float(buf, 2, &v))
2328 goto einval;
2329 if (v < 0 || v > 10000)
2330 goto einval;
2331 qos[tok] = v * 100;
2332 break;
2333 case QOS_RLAT:
2334 case QOS_WLAT:
2335 if (match_u64(&args[0], &v))
2336 goto einval;
2337 qos[tok] = v;
2338 break;
2339 case QOS_MIN:
2340 case QOS_MAX:
2341 if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2342 sizeof(buf))
2343 goto einval;
2344 if (cgroup_parse_float(buf, 2, &v))
2345 goto einval;
2346 if (v < 0)
2347 goto einval;
2348 qos[tok] = clamp_t(s64, v * 100,
2349 VRATE_MIN_PPM, VRATE_MAX_PPM);
2350 break;
2351 default:
2352 goto einval;
2353 }
2354 user = true;
2355 }
2356
2357 if (qos[QOS_MIN] > qos[QOS_MAX])
2358 goto einval;
2359
2360 spin_lock_irq(&ioc->lock);
2361
2362 if (enable) {
2363 blk_stat_enable_accounting(ioc->rqos.q);
2364 blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2365 ioc->enabled = true;
2366 } else {
2367 blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2368 ioc->enabled = false;
2369 }
2370
2371 if (user) {
2372 memcpy(ioc->params.qos, qos, sizeof(qos));
2373 ioc->user_qos_params = true;
2374 } else {
2375 ioc->user_qos_params = false;
2376 }
2377
2378 ioc_refresh_params(ioc, true);
2379 spin_unlock_irq(&ioc->lock);
2380
2381 put_disk_and_module(disk);
2382 return nbytes;
2383einval:
2384 ret = -EINVAL;
2385err:
2386 put_disk_and_module(disk);
2387 return ret;
2388}
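/*
 * Example io.cost.qos configuration using the tokens above ("8:16" and
 * all values are illustrative, not recommendations):
 *
 *	echo "8:16 enable=1 ctrl=user rpct=95.00 rlat=5000 wpct=95.00 wlat=5000 min=50.00 max=150.00" > io.cost.qos
 *
 * rpct/wpct and min/max are parsed as percentages with two decimal
 * places (cgroup_parse_float(..., 2, ...)); rlat/wlat are microseconds.
 */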
2389
2390static u64 ioc_cost_model_prfill(struct seq_file *sf,
2391 struct blkg_policy_data *pd, int off)
2392{
2393 const char *dname = blkg_dev_name(pd->blkg);
2394 struct ioc *ioc = pd_to_iocg(pd)->ioc;
2395 u64 *u = ioc->params.i_lcoefs;
2396
2397 if (!dname)
2398 return 0;
2399
2400 seq_printf(sf, "%s ctrl=%s model=linear "
2401 "rbps=%llu rseqiops=%llu rrandiops=%llu "
2402 "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
2403 dname, ioc->user_cost_model ? "user" : "auto",
2404 u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
2405 u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
2406 return 0;
2407}
2408
2409static int ioc_cost_model_show(struct seq_file *sf, void *v)
2410{
2411 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2412
2413 blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
2414 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2415 return 0;
2416}
2417
2418static const match_table_t cost_ctrl_tokens = {
2419 { COST_CTRL, "ctrl=%s" },
2420 { COST_MODEL, "model=%s" },
2421 { NR_COST_CTRL_PARAMS, NULL },
2422};
2423
2424static const match_table_t i_lcoef_tokens = {
2425 { I_LCOEF_RBPS, "rbps=%u" },
2426 { I_LCOEF_RSEQIOPS, "rseqiops=%u" },
2427 { I_LCOEF_RRANDIOPS, "rrandiops=%u" },
2428 { I_LCOEF_WBPS, "wbps=%u" },
2429 { I_LCOEF_WSEQIOPS, "wseqiops=%u" },
2430 { I_LCOEF_WRANDIOPS, "wrandiops=%u" },
2431 { NR_I_LCOEFS, NULL },
2432};
2433
2434static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
2435 size_t nbytes, loff_t off)
2436{
2437 struct gendisk *disk;
2438 struct ioc *ioc;
2439 u64 u[NR_I_LCOEFS];
2440 bool user;
2441 char *p;
2442 int ret;
2443
2444 disk = blkcg_conf_get_disk(&input);
2445 if (IS_ERR(disk))
2446 return PTR_ERR(disk);
2447
2448 ioc = q_to_ioc(disk->queue);
2449 if (!ioc) {
2450 ret = blk_iocost_init(disk->queue);
2451 if (ret)
2452 goto err;
2453 ioc = q_to_ioc(disk->queue);
2454 }
2455
2456 spin_lock_irq(&ioc->lock);
2457 memcpy(u, ioc->params.i_lcoefs, sizeof(u));
2458 user = ioc->user_cost_model;
2459 spin_unlock_irq(&ioc->lock);
2460
2461 while ((p = strsep(&input, " \t\n"))) {
2462 substring_t args[MAX_OPT_ARGS];
2463 char buf[32];
2464 int tok;
2465 u64 v;
2466
2467 if (!*p)
2468 continue;
2469
2470 switch (match_token(p, cost_ctrl_tokens, args)) {
2471 case COST_CTRL:
2472 match_strlcpy(buf, &args[0], sizeof(buf));
2473 if (!strcmp(buf, "auto"))
2474 user = false;
2475 else if (!strcmp(buf, "user"))
2476 user = true;
2477 else
2478 goto einval;
2479 continue;
2480 case COST_MODEL:
2481 match_strlcpy(buf, &args[0], sizeof(buf));
2482 if (strcmp(buf, "linear"))
2483 goto einval;
2484 continue;
2485 }
2486
2487 tok = match_token(p, i_lcoef_tokens, args);
2488 if (tok == NR_I_LCOEFS)
2489 goto einval;
2490 if (match_u64(&args[0], &v))
2491 goto einval;
2492 u[tok] = v;
2493 user = true;
2494 }
2495
2496 spin_lock_irq(&ioc->lock);
2497 if (user) {
2498 memcpy(ioc->params.i_lcoefs, u, sizeof(u));
2499 ioc->user_cost_model = true;
2500 } else {
2501 ioc->user_cost_model = false;
2502 }
2503 ioc_refresh_params(ioc, true);
2504 spin_unlock_irq(&ioc->lock);
2505
2506 put_disk_and_module(disk);
2507 return nbytes;
2508
2509einval:
2510 ret = -EINVAL;
2511err:
2512 put_disk_and_module(disk);
2513 return ret;
2514}
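/*
 * Example io.cost.model configuration using the tokens above (the
 * device number and coefficient values are illustrative, not measured
 * or default parameters):
 *
 *	echo "8:16 ctrl=user model=linear rbps=120000000 rseqiops=40000 rrandiops=400 wbps=110000000 wseqiops=40000 wrandiops=400" > io.cost.model
 *
 * rbps/wbps are bytes per second; the iops fields are IOs per second
 * for sequential and random IO respectively.
 */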
2515
2516static struct cftype ioc_files[] = {
2517 {
2518 .name = "weight",
2519 .flags = CFTYPE_NOT_ON_ROOT,
2520 .seq_show = ioc_weight_show,
2521 .write = ioc_weight_write,
2522 },
2523 {
2524 .name = "cost.qos",
2525 .flags = CFTYPE_ONLY_ON_ROOT,
2526 .seq_show = ioc_qos_show,
2527 .write = ioc_qos_write,
2528 },
2529 {
2530 .name = "cost.model",
2531 .flags = CFTYPE_ONLY_ON_ROOT,
2532 .seq_show = ioc_cost_model_show,
2533 .write = ioc_cost_model_write,
2534 },
2535 {}
2536};
2537
2538static struct blkcg_policy blkcg_policy_iocost = {
2539 .dfl_cftypes = ioc_files,
2540 .cpd_alloc_fn = ioc_cpd_alloc,
2541 .cpd_free_fn = ioc_cpd_free,
2542 .pd_alloc_fn = ioc_pd_alloc,
2543 .pd_init_fn = ioc_pd_init,
2544 .pd_free_fn = ioc_pd_free,
2545};
2546
2547static int __init ioc_init(void)
2548{
2549 return blkcg_policy_register(&blkcg_policy_iocost);
2550}
2551
2552static void __exit ioc_exit(void)
2553{
2554 return blkcg_policy_unregister(&blkcg_policy_iocost);
2555}
2556
2557module_init(ioc_init);
2558module_exit(ioc_exit);