block/blk-throttle.c

   1 /*
   2  * Interface for controlling IO bandwidth on a request queue
   3  *
   4  * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
   5  */
   6
   7 #include <linux/module.h>
   8 #include <linux/slab.h>
   9 #include <linux/blkdev.h>
  10 #include <linux/bio.h>
  11 #include <linux/blktrace_api.h>
  12 #include "blk-cgroup.h"
  13 #include "blk.h"
  14
  15 /* Max dispatch from a group in 1 round */
  16 static int throtl_grp_quantum = 8;
  17
  18 /* Total max dispatch from all groups in one round */
  19 static int throtl_quantum = 32;
  20
  21 /* Throttling is performed over 100ms slice and after that slice is renewed */
  22 static unsigned long throtl_slice = HZ/10;      /* 100 ms */
  23
  24 static struct blkio_policy_type blkio_policy_throtl;
  25
  26 /* A workqueue to queue throttle related work */
  27 static struct workqueue_struct *kthrotld_workqueue;
  28 static void throtl_schedule_delayed_work(struct throtl_data *td,
  29                                 unsigned long delay);
  30
  31 struct throtl_rb_root {
  32         struct rb_root rb;
  33         struct rb_node *left;
  34         unsigned int count;
  35         unsigned long min_disptime;
  36 };
  37
  38 #define THROTL_RB_ROOT  (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
  39                         .count = 0, .min_disptime = 0}
  40
  41 #define rb_entry_tg(node)       rb_entry((node), struct throtl_grp, rb_node)
  42
  43 struct throtl_grp {
  44         /* List of throtl groups on the request queue*/
  45         struct hlist_node tg_node;
  46
  47         /* active throtl group service_tree member */
  48         struct rb_node rb_node;
  49
  50         /*
  51          * Dispatch time in jiffies. This is the estimated time when group
  52          * will unthrottle and is ready to dispatch more bio. It is used as
  53          * key to sort active groups in service tree.
  54          */
  55         unsigned long disptime;
  56
  57         atomic_t ref;
  58         unsigned int flags;
  59
  60         /* Two lists for READ and WRITE */
  61         struct bio_list bio_lists[2];
  62
  63         /* Number of queued bios on READ and WRITE lists */
  64         unsigned int nr_queued[2];
  65
  66         /* bytes per second rate limits */
  67         uint64_t bps[2];
  68
  69         /* IOPS limits */
  70         unsigned int iops[2];
  71
  72         /* Number of bytes disptached in current slice */
  73         uint64_t bytes_disp[2];
  74         /* Number of bio's dispatched in current slice */
  75         unsigned int io_disp[2];
  76
  77         /* When did we start a new slice */
  78         unsigned long slice_start[2];
  79         unsigned long slice_end[2];
  80
  81         /* Some throttle limits got updated for the group */
  82         int limits_changed;
  83
  84         struct rcu_head rcu_head;
  85 };
  86
  87 struct throtl_data
  88 {
  89         /* List of throtl groups */
  90         struct hlist_head tg_list;
  91
  92         /* service tree for active throtl groups */
  93         struct throtl_rb_root tg_service_tree;
  94
  95         struct throtl_grp *root_tg;
  96         struct request_queue *queue;
  97
  98         /* Total Number of queued bios on READ and WRITE lists */
  99         unsigned int nr_queued[2];
 100
 101         /*
 102          * number of total undestroyed groups
 103          */
 104         unsigned int nr_undestroyed_grps;
 105
 106         /* Work for dispatching throttled bios */
 107         struct delayed_work throtl_work;
 108
 109         int limits_changed;
 110 };
 111
 112 static inline struct throtl_grp *blkg_to_tg(struct blkio_group *blkg)
 113 {
 114         return blkg_to_pdata(blkg, &blkio_policy_throtl);
 115 }
 116
 117 static inline struct blkio_group *tg_to_blkg(struct throtl_grp *tg)
 118 {
 119         return pdata_to_blkg(tg, &blkio_policy_throtl);
 120 }
 121
 122 enum tg_state_flags {
 123         THROTL_TG_FLAG_on_rr = 0,       /* on round-robin busy list */
 124 };
 125
 126 #define THROTL_TG_FNS(name)                                             \
 127 static inline void throtl_mark_tg_##name(struct throtl_grp *tg)         \
 128 {                                                                       \
 129         (tg)->flags |= (1 << THROTL_TG_FLAG_##name);                    \
 130 }                                                                       \
 131 static inline void throtl_clear_tg_##name(struct throtl_grp *tg)        \
 132 {                                                                       \
 133         (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name);                   \
 134 }                                                                       \
 135 static inline int throtl_tg_##name(const struct throtl_grp *tg)         \
 136 {                                                                       \
 137         return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0;       \
 138 }
 139
 140 THROTL_TG_FNS(on_rr);
 141
 142 #define throtl_log_tg(td, tg, fmt, args...)                             \
 143         blk_add_trace_msg((td)->queue, "throtl %s " fmt,                \
 144                           blkg_path(tg_to_blkg(tg)), ##args);           \
 145
 146 #define throtl_log(td, fmt, args...)    \
 147         blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
 148
 149 static inline unsigned int total_nr_queued(struct throtl_data *td)
 150 {
 151         return td->nr_queued[0] + td->nr_queued[1];
 152 }
 153
 154 static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
 155 {
 156         atomic_inc(&tg->ref);
 157         return tg;
 158 }
 159
 160 static void throtl_free_tg(struct rcu_head *head)
 161 {
 162         struct throtl_grp *tg = container_of(head, struct throtl_grp, rcu_head);
 163         struct blkio_group *blkg = tg_to_blkg(tg);
 164
 165         free_percpu(blkg->stats_cpu);
 166         kfree(blkg->pd);
 167         kfree(blkg);
 168 }
 169
 170 static void throtl_put_tg(struct throtl_grp *tg)
 171 {
 172         struct blkio_group *blkg = tg_to_blkg(tg);
 173
 174         BUG_ON(atomic_read(&tg->ref) <= 0);
 175         if (!atomic_dec_and_test(&tg->ref))
 176                 return;
 177
 178         /* release the extra blkcg reference this blkg has been holding */
 179         css_put(&blkg->blkcg->css);
 180
 181         /*
 182          * A group is freed in rcu manner. But having an rcu lock does not
 183          * mean that one can access all the fields of blkg and assume these
 184          * are valid. For example, don't try to follow throtl_data and
 185          * request queue links.
 186          *
 187          * Having a reference to blkg under an rcu allows acess to only
 188          * values local to groups like group stats and group rate limits
 189          */
 190         call_rcu(&tg->rcu_head, throtl_free_tg);
 191 }
 192
 193 static void throtl_init_blkio_group(struct blkio_group *blkg)
 194 {
 195         struct throtl_grp *tg = blkg_to_tg(blkg);
 196
 197         INIT_HLIST_NODE(&tg->tg_node);
 198         RB_CLEAR_NODE(&tg->rb_node);
 199         bio_list_init(&tg->bio_lists[0]);
 200         bio_list_init(&tg->bio_lists[1]);
 201         tg->limits_changed = false;
 202
 203         tg->bps[READ] = -1;
 204         tg->bps[WRITE] = -1;
 205         tg->iops[READ] = -1;
 206         tg->iops[WRITE] = -1;
 207
 208         /*
 209          * Take the initial reference that will be released on destroy
 210          * This can be thought of a joint reference by cgroup and
 211          * request queue which will be dropped by either request queue
 212          * exit or cgroup deletion path depending on who is exiting first.
 213          */
 214         atomic_set(&tg->ref, 1);
 215 }
 216
 217 static void throtl_link_blkio_group(struct request_queue *q,
 218                                     struct blkio_group *blkg)
 219 {
 220         struct throtl_data *td = q->td;
 221         struct throtl_grp *tg = blkg_to_tg(blkg);
 222
 223         hlist_add_head(&tg->tg_node, &td->tg_list);
 224         td->nr_undestroyed_grps++;
 225 }
 226
 227 static struct
 228 throtl_grp *throtl_lookup_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
 229 {
 230         /*
 231          * This is the common case when there are no blkio cgroups.
 232          * Avoid lookup in this case
 233          */
 234         if (blkcg == &blkio_root_cgroup)
 235                 return td->root_tg;
 236
 237         return blkg_to_tg(blkg_lookup(blkcg, td->queue, BLKIO_POLICY_THROTL));
 238 }
 239
 240 static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
 241                                                   struct blkio_cgroup *blkcg)
 242 {
 243         struct request_queue *q = td->queue;
 244         struct throtl_grp *tg = NULL;
 245
 246         /*
 247          * This is the common case when there are no blkio cgroups.
 248          * Avoid lookup in this case
 249          */
 250         if (blkcg == &blkio_root_cgroup) {
 251                 tg = td->root_tg;
 252         } else {
 253                 struct blkio_group *blkg;
 254
 255                 blkg = blkg_lookup_create(blkcg, q, BLKIO_POLICY_THROTL, false);
 256
 257                 /* if %NULL and @q is alive, fall back to root_tg */
 258                 if (!IS_ERR(blkg))
 259                         tg = blkg_to_tg(blkg);
 260                 else if (!blk_queue_dead(q))
 261                         tg = td->root_tg;
 262         }
 263
 264         return tg;
 265 }
 266
 267 static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
 268 {
 269         /* Service tree is empty */
 270         if (!root->count)
 271                 return NULL;
 272
 273         if (!root->left)
 274                 root->left = rb_first(&root->rb);
 275
 276         if (root->left)
 277                 return rb_entry_tg(root->left);
 278
 279         return NULL;
 280 }
 281
 282 static void rb_erase_init(struct rb_node *n, struct rb_root *root)
 283 {
 284         rb_erase(n, root);
 285         RB_CLEAR_NODE(n);
 286 }
 287
 288 static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
 289 {
 290         if (root->left == n)
 291                 root->left = NULL;
 292         rb_erase_init(n, &root->rb);
 293         --root->count;
 294 }
 295
 296 static void update_min_dispatch_time(struct throtl_rb_root *st)
 297 {
 298         struct throtl_grp *tg;
 299
 300         tg = throtl_rb_first(st);
 301         if (!tg)
 302                 return;
 303
 304         st->min_disptime = tg->disptime;
 305 }
 306
 307 static void
 308 tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
 309 {
 310         struct rb_node **node = &st->rb.rb_node;
 311         struct rb_node *parent = NULL;
 312         struct throtl_grp *__tg;
 313         unsigned long key = tg->disptime;
 314         int left = 1;
 315
 316         while (*node != NULL) {
 317                 parent = *node;
 318                 __tg = rb_entry_tg(parent);
 319
 320                 if (time_before(key, __tg->disptime))
 321                         node = &parent->rb_left;
 322                 else {
 323                         node = &parent->rb_right;
 324                         left = 0;
 325                 }
 326         }
 327
 328         if (left)
 329                 st->left = &tg->rb_node;
 330
 331         rb_link_node(&tg->rb_node, parent, node);
 332         rb_insert_color(&tg->rb_node, &st->rb);
 333 }
 334
 335 static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
 336 {
 337         struct throtl_rb_root *st = &td->tg_service_tree;
 338
 339         tg_service_tree_add(st, tg);
 340         throtl_mark_tg_on_rr(tg);
 341         st->count++;
 342 }
 343
 344 static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
 345 {
 346         if (!throtl_tg_on_rr(tg))
 347                 __throtl_enqueue_tg(td, tg);
 348 }
 349
 350 static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
 351 {
 352         throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
 353         throtl_clear_tg_on_rr(tg);
 354 }
 355
 356 static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
 357 {
 358         if (throtl_tg_on_rr(tg))
 359                 __throtl_dequeue_tg(td, tg);
 360 }
 361
 362 static void throtl_schedule_next_dispatch(struct throtl_data *td)
 363 {
 364         struct throtl_rb_root *st = &td->tg_service_tree;
 365
 366         /*
 367          * If there are more bios pending, schedule more work.
 368          */
 369         if (!total_nr_queued(td))
 370                 return;
 371
 372         BUG_ON(!st->count);
 373
 374         update_min_dispatch_time(st);
 375
 376         if (time_before_eq(st->min_disptime, jiffies))
 377                 throtl_schedule_delayed_work(td, 0);
 378         else
 379                 throtl_schedule_delayed_work(td, (st->min_disptime - jiffies));
 380 }
 381
 382 static inline void
 383 throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 384 {
 385         tg->bytes_disp[rw] = 0;
 386         tg->io_disp[rw] = 0;
 387         tg->slice_start[rw] = jiffies;
 388         tg->slice_end[rw] = jiffies + throtl_slice;
 389         throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
 390                         rw == READ ? 'R' : 'W', tg->slice_start[rw],
 391                         tg->slice_end[rw], jiffies);
 392 }
 393
 394 static inline void throtl_set_slice_end(struct throtl_data *td,
 395                 struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
 396 {
 397         tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
 398 }
 399
 400 static inline void throtl_extend_slice(struct throtl_data *td,
 401                 struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
 402 {
 403         tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
 404         throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
 405                         rw == READ ? 'R' : 'W', tg->slice_start[rw],
 406                         tg->slice_end[rw], jiffies);
 407 }
 408
 409 /* Determine if previously allocated or extended slice is complete or not */
 410 static bool
 411 throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 412 {
 413         if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
 414                 return 0;
 415
 416         return 1;
 417 }
 418
 419 /* Trim the used slices and adjust slice start accordingly */
 420 static inline void
 421 throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 422 {
 423         unsigned long nr_slices, time_elapsed, io_trim;
 424         u64 bytes_trim, tmp;
 425
 426         BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
 427
 428         /*
 429          * If bps are unlimited (-1), then time slice don't get
 430          * renewed. Don't try to trim the slice if slice is used. A new
 431          * slice will start when appropriate.
 432          */
 433         if (throtl_slice_used(td, tg, rw))
 434                 return;
 435
 436         /*
 437          * A bio has been dispatched. Also adjust slice_end. It might happen
 438          * that initially cgroup limit was very low resulting in high
 439          * slice_end, but later limit was bumped up and bio was dispached
 440          * sooner, then we need to reduce slice_end. A high bogus slice_end
 441          * is bad because it does not allow new slice to start.
 442          */
 443
 444         throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice);
 445
 446         time_elapsed = jiffies - tg->slice_start[rw];
 447
 448         nr_slices = time_elapsed / throtl_slice;
 449
 450         if (!nr_slices)
 451                 return;
 452         tmp = tg->bps[rw] * throtl_slice * nr_slices;
 453         do_div(tmp, HZ);
 454         bytes_trim = tmp;
 455
 456         io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
 457
 458         if (!bytes_trim && !io_trim)
 459                 return;
 460
 461         if (tg->bytes_disp[rw] >= bytes_trim)
 462                 tg->bytes_disp[rw] -= bytes_trim;
 463         else
 464                 tg->bytes_disp[rw] = 0;
 465
 466         if (tg->io_disp[rw] >= io_trim)
 467                 tg->io_disp[rw] -= io_trim;
 468         else
 469                 tg->io_disp[rw] = 0;
 470
 471         tg->slice_start[rw] += nr_slices * throtl_slice;
 472
 473         throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
 474                         " start=%lu end=%lu jiffies=%lu",
 475                         rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
 476                         tg->slice_start[rw], tg->slice_end[rw], jiffies);
 477 }
 478
 479 static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
 480                 struct bio *bio, unsigned long *wait)
 481 {
 482         bool rw = bio_data_dir(bio);
 483         unsigned int io_allowed;
 484         unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
 485         u64 tmp;
 486
 487         jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
 488
 489         /* Slice has just started. Consider one slice interval */
 490         if (!jiffy_elapsed)
 491                 jiffy_elapsed_rnd = throtl_slice;
 492
 493         jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
 494
 495         /*
 496          * jiffy_elapsed_rnd should not be a big value as minimum iops can be
 497          * 1 then at max jiffy elapsed should be equivalent of 1 second as we
 498          * will allow dispatch after 1 second and after that slice should
 499          * have been trimmed.
 500          */
 501
 502         tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
 503         do_div(tmp, HZ);
 504
 505         if (tmp > UINT_MAX)
 506                 io_allowed = UINT_MAX;
 507         else
 508                 io_allowed = tmp;
 509
 510         if (tg->io_disp[rw] + 1 <= io_allowed) {
 511                 if (wait)
 512                         *wait = 0;
 513                 return 1;
 514         }
 515
 516         /* Calc approx time to dispatch */
 517         jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;
 518
 519         if (jiffy_wait > jiffy_elapsed)
 520                 jiffy_wait = jiffy_wait - jiffy_elapsed;
 521         else
 522                 jiffy_wait = 1;
 523
 524         if (wait)
 525                 *wait = jiffy_wait;
 526         return 0;
 527 }
 528
 529 static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
 530                 struct bio *bio, unsigned long *wait)
 531 {
 532         bool rw = bio_data_dir(bio);
 533         u64 bytes_allowed, extra_bytes, tmp;
 534         unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
 535
 536         jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
 537
 538         /* Slice has just started. Consider one slice interval */
 539         if (!jiffy_elapsed)
 540                 jiffy_elapsed_rnd = throtl_slice;
 541
 542         jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
 543
 544         tmp = tg->bps[rw] * jiffy_elapsed_rnd;
 545         do_div(tmp, HZ);
 546         bytes_allowed = tmp;
 547
 548         if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
 549                 if (wait)
 550                         *wait = 0;
 551                 return 1;
 552         }
 553
 554         /* Calc approx time to dispatch */
 555         extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
 556         jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
 557
 558         if (!jiffy_wait)
 559                 jiffy_wait = 1;
 560
 561         /*
 562          * This wait time is without taking into consideration the rounding
 563          * up we did. Add that time also.
 564          */
 565         jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
 566         if (wait)
 567                 *wait = jiffy_wait;
 568         return 0;
 569 }
 570
 571 static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) {
 572         if (tg->bps[rw] == -1 && tg->iops[rw] == -1)
 573                 return 1;
 574         return 0;
 575 }
 576
 577 /*
 578  * Returns whether one can dispatch a bio or not. Also returns approx number
 579  * of jiffies to wait before this bio is with-in IO rate and can be dispatched
 580  */
 581 static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
 582                                 struct bio *bio, unsigned long *wait)
 583 {
 584         bool rw = bio_data_dir(bio);
 585         unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
 586
 587         /*
 588          * Currently whole state machine of group depends on first bio
 589          * queued in the group bio list. So one should not be calling
 590          * this function with a different bio if there are other bios
 591          * queued.
 592          */
 593         BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));
 594
 595         /* If tg->bps = -1, then BW is unlimited */
 596         if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
 597                 if (wait)
 598                         *wait = 0;
 599                 return 1;
 600         }
 601
 602         /*
 603          * If previous slice expired, start a new one otherwise renew/extend
 604          * existing slice to make sure it is at least throtl_slice interval
 605          * long since now.
 606          */
 607         if (throtl_slice_used(td, tg, rw))
 608                 throtl_start_new_slice(td, tg, rw);
 609         else {
 610                 if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
 611                         throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
 612         }
 613
 614         if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
 615             && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
 616                 if (wait)
 617                         *wait = 0;
 618                 return 1;
 619         }
 620
 621         max_wait = max(bps_wait, iops_wait);
 622
 623         if (wait)
 624                 *wait = max_wait;
 625
 626         if (time_before(tg->slice_end[rw], jiffies + max_wait))
 627                 throtl_extend_slice(td, tg, rw, jiffies + max_wait);
 628
 629         return 0;
 630 }
 631
 632 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 633 {
 634         bool rw = bio_data_dir(bio);
 635         bool sync = rw_is_sync(bio->bi_rw);
 636
 637         /* Charge the bio to the group */
 638         tg->bytes_disp[rw] += bio->bi_size;
 639         tg->io_disp[rw]++;
 640
 641         blkiocg_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, rw, sync);
 642 }
 643
 644 static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
 645                         struct bio *bio)
 646 {
 647         bool rw = bio_data_dir(bio);
 648
 649         bio_list_add(&tg->bio_lists[rw], bio);
 650         /* Take a bio reference on tg */
 651         throtl_ref_get_tg(tg);
 652         tg->nr_queued[rw]++;
 653         td->nr_queued[rw]++;
 654         throtl_enqueue_tg(td, tg);
 655 }
 656
 657 static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
 658 {
 659         unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
 660         struct bio *bio;
 661
 662         if ((bio = bio_list_peek(&tg->bio_lists[READ])))
 663                 tg_may_dispatch(td, tg, bio, &read_wait);
 664
 665         if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
 666                 tg_may_dispatch(td, tg, bio, &write_wait);
 667
 668         min_wait = min(read_wait, write_wait);
 669         disptime = jiffies + min_wait;
 670
 671         /* Update dispatch time */
 672         throtl_dequeue_tg(td, tg);
 673         tg->disptime = disptime;
 674         throtl_enqueue_tg(td, tg);
 675 }
 676
 677 static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
 678                                 bool rw, struct bio_list *bl)
 679 {
 680         struct bio *bio;
 681
 682         bio = bio_list_pop(&tg->bio_lists[rw]);
 683         tg->nr_queued[rw]--;
 684         /* Drop bio reference on tg */
 685         throtl_put_tg(tg);
 686
 687         BUG_ON(td->nr_queued[rw] <= 0);
 688         td->nr_queued[rw]--;
 689
 690         throtl_charge_bio(tg, bio);
 691         bio_list_add(bl, bio);
 692         bio->bi_rw |= REQ_THROTTLED;
 693
 694         throtl_trim_slice(td, tg, rw);
 695 }
 696
 697 static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
 698                                 struct bio_list *bl)
 699 {
 700         unsigned int nr_reads = 0, nr_writes = 0;
 701         unsigned int max_nr_reads = throtl_grp_quantum*3/4;
 702         unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
 703         struct bio *bio;
 704
 705         /* Try to dispatch 75% READS and 25% WRITES */
 706
 707         while ((bio = bio_list_peek(&tg->bio_lists[READ]))
 708                 && tg_may_dispatch(td, tg, bio, NULL)) {
 709
 710                 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
 711                 nr_reads++;
 712
 713                 if (nr_reads >= max_nr_reads)
 714                         break;
 715         }
 716
 717         while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
 718                 && tg_may_dispatch(td, tg, bio, NULL)) {
 719
 720                 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
 721                 nr_writes++;
 722
 723                 if (nr_writes >= max_nr_writes)
 724                         break;
 725         }
 726
 727         return nr_reads + nr_writes;
 728 }
 729
 730 static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
 731 {
 732         unsigned int nr_disp = 0;
 733         struct throtl_grp *tg;
 734         struct throtl_rb_root *st = &td->tg_service_tree;
 735
 736         while (1) {
 737                 tg = throtl_rb_first(st);
 738
 739                 if (!tg)
 740                         break;
 741
 742                 if (time_before(jiffies, tg->disptime))
 743                         break;
 744
 745                 throtl_dequeue_tg(td, tg);
 746
 747                 nr_disp += throtl_dispatch_tg(td, tg, bl);
 748
 749                 if (tg->nr_queued[0] || tg->nr_queued[1]) {
 750                         tg_update_disptime(td, tg);
 751                         throtl_enqueue_tg(td, tg);
 752                 }
 753
 754                 if (nr_disp >= throtl_quantum)
 755                         break;
 756         }
 757
 758         return nr_disp;
 759 }
 760
 761 static void throtl_process_limit_change(struct throtl_data *td)
 762 {
 763         struct throtl_grp *tg;
 764         struct hlist_node *pos, *n;
 765
 766         if (!td->limits_changed)
 767                 return;
 768
 769         xchg(&td->limits_changed, false);
 770
 771         throtl_log(td, "limits changed");
 772
 773         hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
 774                 if (!tg->limits_changed)
 775                         continue;
 776
 777                 if (!xchg(&tg->limits_changed, false))
 778                         continue;
 779
 780                 throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
 781                         " riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE],
 782                         tg->iops[READ], tg->iops[WRITE]);
 783
 784                 /*
 785                  * Restart the slices for both READ and WRITES. It
 786                  * might happen that a group's limit are dropped
 787                  * suddenly and we don't want to account recently
 788                  * dispatched IO with new low rate
 789                  */
 790                 throtl_start_new_slice(td, tg, 0);
 791                 throtl_start_new_slice(td, tg, 1);
 792
 793                 if (throtl_tg_on_rr(tg))
 794                         tg_update_disptime(td, tg);
 795         }
 796 }
 797
 798 /* Dispatch throttled bios. Should be called without queue lock held. */
 799 static int throtl_dispatch(struct request_queue *q)
 800 {
 801         struct throtl_data *td = q->td;
 802         unsigned int nr_disp = 0;
 803         struct bio_list bio_list_on_stack;
 804         struct bio *bio;
 805         struct blk_plug plug;
 806
 807         spin_lock_irq(q->queue_lock);
 808
 809         throtl_process_limit_change(td);
 810
 811         if (!total_nr_queued(td))
 812                 goto out;
 813
 814         bio_list_init(&bio_list_on_stack);
 815
 816         throtl_log(td, "dispatch nr_queued=%u read=%u write=%u",
 817                         total_nr_queued(td), td->nr_queued[READ],
 818                         td->nr_queued[WRITE]);
 819
 820         nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);
 821
 822         if (nr_disp)
 823                 throtl_log(td, "bios disp=%u", nr_disp);
 824
 825         throtl_schedule_next_dispatch(td);
 826 out:
 827         spin_unlock_irq(q->queue_lock);
 828
 829         /*
 830          * If we dispatched some requests, unplug the queue to make sure
 831          * immediate dispatch
 832          */
 833         if (nr_disp) {
 834                 blk_start_plug(&plug);
 835                 while((bio = bio_list_pop(&bio_list_on_stack)))
 836                         generic_make_request(bio);
 837                 blk_finish_plug(&plug);
 838         }
 839         return nr_disp;
 840 }
 841
 842 void blk_throtl_work(struct work_struct *work)
 843 {
 844         struct throtl_data *td = container_of(work, struct throtl_data,
 845                                         throtl_work.work);
 846         struct request_queue *q = td->queue;
 847
 848         throtl_dispatch(q);
 849 }
 850
 851 /* Call with queue lock held */
 852 static void
 853 throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
 854 {
 855
 856         struct delayed_work *dwork = &td->throtl_work;
 857
 858         /* schedule work if limits changed even if no bio is queued */
 859         if (total_nr_queued(td) || td->limits_changed) {
 860                 /*
 861                  * We might have a work scheduled to be executed in future.
 862                  * Cancel that and schedule a new one.
 863                  */
 864                 __cancel_delayed_work(dwork);
 865                 queue_delayed_work(kthrotld_workqueue, dwork, delay);
 866                 throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
 867                                 delay, jiffies);
 868         }
 869 }
 870
 871 static void
 872 throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
 873 {
 874         /* Something wrong if we are trying to remove same group twice */
 875         BUG_ON(hlist_unhashed(&tg->tg_node));
 876
 877         hlist_del_init(&tg->tg_node);
 878
 879         /*
 880          * Put the reference taken at the time of creation so that when all
 881          * queues are gone, group can be destroyed.
 882          */
 883         throtl_put_tg(tg);
 884         td->nr_undestroyed_grps--;
 885 }
 886
 887 static bool throtl_release_tgs(struct throtl_data *td, bool release_root)
 888 {
 889         struct hlist_node *pos, *n;
 890         struct throtl_grp *tg;
 891         bool empty = true;
 892
 893         hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
 894                 /* skip root? */
 895                 if (!release_root && tg == td->root_tg)
 896                         continue;
 897
 898                 /*
 899                  * If cgroup removal path got to blk_group first and removed
 900                  * it from cgroup list, then it will take care of destroying
 901                  * cfqg also.
 902                  */
 903                 if (!blkiocg_del_blkio_group(tg_to_blkg(tg)))
 904                         throtl_destroy_tg(td, tg);
 905                 else
 906                         empty = false;
 907         }
 908         return empty;
 909 }
 910
 911 /*
 912  * Blk cgroup controller notification saying that blkio_group object is being
 913  * delinked as associated cgroup object is going away. That also means that
 914  * no new IO will come in this group. So get rid of this group as soon as
 915  * any pending IO in the group is finished.
 916  *
 917  * This function is called under rcu_read_lock(). @q is the rcu protected
 918  * pointer. That means @q is a valid request_queue pointer as long as we
 919  * are rcu read lock.
 920  *
 921  * @q was fetched from blkio_group under blkio_cgroup->lock. That means
 922  * it should not be NULL as even if queue was going away, cgroup deltion
 923  * path got to it first.
 924  */
 925 void throtl_unlink_blkio_group(struct request_queue *q,
 926                                struct blkio_group *blkg)
 927 {
 928         unsigned long flags;
 929
 930         spin_lock_irqsave(q->queue_lock, flags);
 931         throtl_destroy_tg(q->td, blkg_to_tg(blkg));
 932         spin_unlock_irqrestore(q->queue_lock, flags);
 933 }
 934
 935 static bool throtl_clear_queue(struct request_queue *q)
 936 {
 937         lockdep_assert_held(q->queue_lock);
 938
 939         /*
 940          * Clear tgs but leave the root one alone.  This is necessary
 941          * because root_tg is expected to be persistent and safe because
 942          * blk-throtl can never be disabled while @q is alive.  This is a
 943          * kludge to prepare for unified blkg.  This whole function will be
 944          * removed soon.
 945          */
 946         return throtl_release_tgs(q->td, false);
 947 }
 948
 949 static void throtl_update_blkio_group_common(struct throtl_data *td,
 950                                 struct throtl_grp *tg)
 951 {
 952         xchg(&tg->limits_changed, true);
 953         xchg(&td->limits_changed, true);
 954         /* Schedule a work now to process the limit change */
 955         throtl_schedule_delayed_work(td, 0);
 956 }
 957
 958 /*
 959  * For all update functions, @q should be a valid pointer because these
 960  * update functions are called under blkcg_lock, that means, blkg is
 961  * valid and in turn @q is valid. queue exit path can not race because
 962  * of blkcg_lock
 963  *
 964  * Can not take queue lock in update functions as queue lock under blkcg_lock
 965  * is not allowed. Under other paths we take blkcg_lock under queue_lock.
 966  */
 967 static void throtl_update_blkio_group_read_bps(struct request_queue *q,
 968                                 struct blkio_group *blkg, u64 read_bps)
 969 {
 970         struct throtl_grp *tg = blkg_to_tg(blkg);
 971
 972         tg->bps[READ] = read_bps;
 973         throtl_update_blkio_group_common(q->td, tg);
 974 }
 975
 976 static void throtl_update_blkio_group_write_bps(struct request_queue *q,
 977                                 struct blkio_group *blkg, u64 write_bps)
 978 {
 979         struct throtl_grp *tg = blkg_to_tg(blkg);
 980
 981         tg->bps[WRITE] = write_bps;
 982         throtl_update_blkio_group_common(q->td, tg);
 983 }
 984
 985 static void throtl_update_blkio_group_read_iops(struct request_queue *q,
 986                         struct blkio_group *blkg, unsigned int read_iops)
 987 {
 988         struct throtl_grp *tg = blkg_to_tg(blkg);
 989
 990         tg->iops[READ] = read_iops;
 991         throtl_update_blkio_group_common(q->td, tg);
 992 }
 993
 994 static void throtl_update_blkio_group_write_iops(struct request_queue *q,
 995                         struct blkio_group *blkg, unsigned int write_iops)
 996 {
 997         struct throtl_grp *tg = blkg_to_tg(blkg);
 998
 999         tg->iops[WRITE] = write_iops;
1000         throtl_update_blkio_group_common(q->td, tg);
1001 }
1002
1003 static void throtl_shutdown_wq(struct request_queue *q)
1004 {
1005         struct throtl_data *td = q->td;
1006
1007         cancel_delayed_work_sync(&td->throtl_work);
1008 }
1009
1010 static struct blkio_policy_type blkio_policy_throtl = {
1011         .ops = {
1012                 .blkio_init_group_fn = throtl_init_blkio_group,
1013                 .blkio_link_group_fn = throtl_link_blkio_group,
1014                 .blkio_unlink_group_fn = throtl_unlink_blkio_group,
1015                 .blkio_clear_queue_fn = throtl_clear_queue,
1016                 .blkio_update_group_read_bps_fn =
1017                                         throtl_update_blkio_group_read_bps,
1018                 .blkio_update_group_write_bps_fn =
1019                                         throtl_update_blkio_group_write_bps,
1020                 .blkio_update_group_read_iops_fn =
1021                                         throtl_update_blkio_group_read_iops,
1022                 .blkio_update_group_write_iops_fn =
1023                                         throtl_update_blkio_group_write_iops,
1024         },
1025         .plid = BLKIO_POLICY_THROTL,
1026         .pdata_size = sizeof(struct throtl_grp),
1027 };
1028
1029 bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1030 {
1031         struct throtl_data *td = q->td;
1032         struct throtl_grp *tg;
1033         bool rw = bio_data_dir(bio), update_disptime = true;
1034         struct blkio_cgroup *blkcg;
1035         bool throttled = false;
1036
1037         if (bio->bi_rw & REQ_THROTTLED) {
1038                 bio->bi_rw &= ~REQ_THROTTLED;
1039                 goto out;
1040         }
1041
1042         /*
1043          * A throtl_grp pointer retrieved under rcu can be used to access
1044          * basic fields like stats and io rates. If a group has no rules,
1045          * just update the dispatch stats in lockless manner and return.
1046          */
1047         rcu_read_lock();
1048         blkcg = task_blkio_cgroup(current);
1049         tg = throtl_lookup_tg(td, blkcg);
1050         if (tg) {
1051                 if (tg_no_rule_group(tg, rw)) {
1052                         blkiocg_update_dispatch_stats(tg_to_blkg(tg),
1053                                                       bio->bi_size, rw,
1054                                                       rw_is_sync(bio->bi_rw));
1055                         goto out_unlock_rcu;
1056                 }
1057         }
1058
1059         /*
1060          * Either group has not been allocated yet or it is not an unlimited
1061          * IO group
1062          */
1063         spin_lock_irq(q->queue_lock);
1064         tg = throtl_lookup_create_tg(td, blkcg);
1065         if (unlikely(!tg))
1066                 goto out_unlock;
1067
1068         if (tg->nr_queued[rw]) {
1069                 /*
1070                  * There is already another bio queued in same dir. No
1071                  * need to update dispatch time.
1072                  */
1073                 update_disptime = false;
1074                 goto queue_bio;
1075
1076         }
1077
1078         /* Bio is with-in rate limit of group */
1079         if (tg_may_dispatch(td, tg, bio, NULL)) {
1080                 throtl_charge_bio(tg, bio);
1081
1082                 /*
1083                  * We need to trim slice even when bios are not being queued
1084                  * otherwise it might happen that a bio is not queued for
1085                  * a long time and slice keeps on extending and trim is not
1086                  * called for a long time. Now if limits are reduced suddenly
1087                  * we take into account all the IO dispatched so far at new
1088                  * low rate and * newly queued IO gets a really long dispatch
1089                  * time.
1090                  *
1091                  * So keep on trimming slice even if bio is not queued.
1092                  */
1093                 throtl_trim_slice(td, tg, rw);
1094                 goto out_unlock;
1095         }
1096
1097 queue_bio:
1098         throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
1099                         " iodisp=%u iops=%u queued=%d/%d",
1100                         rw == READ ? 'R' : 'W',
1101                         tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
1102                         tg->io_disp[rw], tg->iops[rw],
1103                         tg->nr_queued[READ], tg->nr_queued[WRITE]);
1104
1105         throtl_add_bio_tg(q->td, tg, bio);
1106         throttled = true;
1107
1108         if (update_disptime) {
1109                 tg_update_disptime(td, tg);
1110                 throtl_schedule_next_dispatch(td);
1111         }
1112
1113 out_unlock:
1114         spin_unlock_irq(q->queue_lock);
1115 out_unlock_rcu:
1116         rcu_read_unlock();
1117 out:
1118         return throttled;
1119 }
1120
1121 /**
1122  * blk_throtl_drain - drain throttled bios
1123  * @q: request_queue to drain throttled bios for
1124  *
1125  * Dispatch all currently throttled bios on @q through ->make_request_fn().
1126  */
1127 void blk_throtl_drain(struct request_queue *q)
1128         __releases(q->queue_lock) __acquires(q->queue_lock)
1129 {
1130         struct throtl_data *td = q->td;
1131         struct throtl_rb_root *st = &td->tg_service_tree;
1132         struct throtl_grp *tg;
1133         struct bio_list bl;
1134         struct bio *bio;
1135
1136         WARN_ON_ONCE(!queue_is_locked(q));
1137
1138         bio_list_init(&bl);
1139
1140         while ((tg = throtl_rb_first(st))) {
1141                 throtl_dequeue_tg(td, tg);
1142
1143                 while ((bio = bio_list_peek(&tg->bio_lists[READ])))
1144                         tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
1145                 while ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
1146                         tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
1147         }
1148         spin_unlock_irq(q->queue_lock);
1149
1150         while ((bio = bio_list_pop(&bl)))
1151                 generic_make_request(bio);
1152
1153         spin_lock_irq(q->queue_lock);
1154 }
1155
1156 int blk_throtl_init(struct request_queue *q)
1157 {
1158         struct throtl_data *td;
1159         struct blkio_group *blkg;
1160
1161         td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
1162         if (!td)
1163                 return -ENOMEM;
1164
1165         INIT_HLIST_HEAD(&td->tg_list);
1166         td->tg_service_tree = THROTL_RB_ROOT;
1167         td->limits_changed = false;
1168         INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
1169
1170         q->td = td;
1171         td->queue = q;
1172
1173         /* alloc and init root group. */
1174         rcu_read_lock();
1175         spin_lock_irq(q->queue_lock);
1176
1177         blkg = blkg_lookup_create(&blkio_root_cgroup, q, BLKIO_POLICY_THROTL,
1178                                   true);
1179         if (!IS_ERR(blkg))
1180                 td->root_tg = blkg_to_tg(blkg);
1181
1182         spin_unlock_irq(q->queue_lock);
1183         rcu_read_unlock();
1184
1185         if (!td->root_tg) {
1186                 kfree(td);
1187                 return -ENOMEM;
1188         }
1189         return 0;
1190 }
1191
1192 void blk_throtl_exit(struct request_queue *q)
1193 {
1194         struct throtl_data *td = q->td;
1195         bool wait = false;
1196
1197         BUG_ON(!td);
1198
1199         throtl_shutdown_wq(q);
1200
1201         spin_lock_irq(q->queue_lock);
1202         throtl_release_tgs(td, true);
1203
1204         /* If there are other groups */
1205         if (td->nr_undestroyed_grps > 0)
1206                 wait = true;
1207
1208         spin_unlock_irq(q->queue_lock);
1209
1210         /*
1211          * Wait for tg_to_blkg(tg)->q accessors to exit their grace periods.
1212          * Do this wait only if there are other undestroyed groups out
1213          * there (other than root group). This can happen if cgroup deletion
1214          * path claimed the responsibility of cleaning up a group before
1215          * queue cleanup code get to the group.
1216          *
1217          * Do not call synchronize_rcu() unconditionally as there are drivers
1218          * which create/delete request queue hundreds of times during scan/boot
1219          * and synchronize_rcu() can take significant time and slow down boot.
1220          */
1221         if (wait)
1222                 synchronize_rcu();
1223
1224         /*
1225          * Just being safe to make sure after previous flush if some body did
1226          * update limits through cgroup and another work got queued, cancel
1227          * it.
1228          */
1229         throtl_shutdown_wq(q);
1230
1231         kfree(q->td);
1232 }
1233
1234 static int __init throtl_init(void)
1235 {
1236         kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
1237         if (!kthrotld_workqueue)
1238                 panic("Failed to create kthrotld\n");
1239
1240         blkio_policy_register(&blkio_policy_throtl);
1241         return 0;
1242 }
1243
1244 module_init(throtl_init);