cfq-iosched: implement hierarchy-ready cfq_group charge scaling
[linux-2.6-block.git] / block / cfq-iosched.c
1 /*
2  *  CFQ, or complete fairness queueing, disk scheduler.
3  *
4  *  Based on ideas from a previously unfinished io scheduler
5  *  (round robin per-process disk scheduling) and on ideas from Andrea Arcangeli.
6  *
7  *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
8  */
9 #include <linux/module.h>
10 #include <linux/slab.h>
11 #include <linux/blkdev.h>
12 #include <linux/elevator.h>
13 #include <linux/jiffies.h>
14 #include <linux/rbtree.h>
15 #include <linux/ioprio.h>
16 #include <linux/blktrace_api.h>
17 #include "blk.h"
18 #include "blk-cgroup.h"
19
20 /*
21  * tunables
22  */
23 /* max queue in one round of service */
24 static const int cfq_quantum = 8;
25 static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
26 /* maximum backwards seek, in KiB */
27 static const int cfq_back_max = 16 * 1024;
28 /* penalty of a backwards seek */
29 static const int cfq_back_penalty = 2;
30 static const int cfq_slice_sync = HZ / 10;
31 static int cfq_slice_async = HZ / 25;
32 static const int cfq_slice_async_rq = 2;
33 static int cfq_slice_idle = HZ / 125;
34 static int cfq_group_idle = HZ / 125;
35 static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
36 static const int cfq_hist_divisor = 4;
37
38 /*
39  * offset from end of service tree
40  */
41 #define CFQ_IDLE_DELAY          (HZ / 5)
42
43 /*
44  * below this threshold, we consider thinktime immediate
45  */
46 #define CFQ_MIN_TT              (2)
47
48 #define CFQ_SLICE_SCALE         (5)
49 #define CFQ_HW_QUEUE_MIN        (5)
50 #define CFQ_SERVICE_SHIFT       12
51
52 #define CFQQ_SEEK_THR           (sector_t)(8 * 100)
53 #define CFQQ_CLOSE_THR          (sector_t)(8 * 1024)
54 #define CFQQ_SECT_THR_NONROT    (sector_t)(2 * 32)
55 #define CFQQ_SEEKY(cfqq)        (hweight32(cfqq->seek_history) > 32/8)
56
57 #define RQ_CIC(rq)              icq_to_cic((rq)->elv.icq)
58 #define RQ_CFQQ(rq)             (struct cfq_queue *) ((rq)->elv.priv[0])
59 #define RQ_CFQG(rq)             (struct cfq_group *) ((rq)->elv.priv[1])
60
61 static struct kmem_cache *cfq_pool;
62
63 #define CFQ_PRIO_LISTS          IOPRIO_BE_NR
64 #define cfq_class_idle(cfqq)    ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
65 #define cfq_class_rt(cfqq)      ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
66
67 #define sample_valid(samples)   ((samples) > 80)
68 #define rb_entry_cfqg(node)     rb_entry((node), struct cfq_group, rb_node)
69
70 struct cfq_ttime {
71         unsigned long last_end_request;
72
73         unsigned long ttime_total;
74         unsigned long ttime_samples;
75         unsigned long ttime_mean;
76 };
77
78 /*
79  * Most of our rbtree usage is for sorting with min extraction, so
80  * if we cache the leftmost node we don't have to walk down the tree
81  * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should
82  * move this into the elevator for the rq sorting as well.
83  */
84 struct cfq_rb_root {
85         struct rb_root rb;
86         struct rb_node *left;
87         unsigned count;
88         unsigned total_weight;
89         u64 min_vdisktime;
90         struct cfq_ttime ttime;
91 };
92 #define CFQ_RB_ROOT     (struct cfq_rb_root) { .rb = RB_ROOT, \
93                         .ttime = {.last_end_request = jiffies,},}
94
95 /*
96  * Per process-grouping structure
97  */
98 struct cfq_queue {
99         /* reference count */
100         int ref;
101         /* various state flags, see below */
102         unsigned int flags;
103         /* parent cfq_data */
104         struct cfq_data *cfqd;
105         /* service_tree member */
106         struct rb_node rb_node;
107         /* service_tree key */
108         unsigned long rb_key;
109         /* prio tree member */
110         struct rb_node p_node;
111         /* prio tree root we belong to, if any */
112         struct rb_root *p_root;
113         /* sorted list of pending requests */
114         struct rb_root sort_list;
115         /* if fifo isn't expired, next request to serve */
116         struct request *next_rq;
117         /* requests queued in sort_list */
118         int queued[2];
119         /* currently allocated requests */
120         int allocated[2];
121         /* fifo list of requests in sort_list */
122         struct list_head fifo;
123
124         /* time when queue got scheduled in to dispatch first request. */
125         unsigned long dispatch_start;
126         unsigned int allocated_slice;
127         unsigned int slice_dispatch;
128         /* time when first request from queue completed and slice started. */
129         unsigned long slice_start;
130         unsigned long slice_end;
131         long slice_resid;
132
133         /* pending priority requests */
134         int prio_pending;
135         /* number of requests that are on the dispatch list or inside driver */
136         int dispatched;
137
138         /* io prio of this queue */
139         unsigned short ioprio, org_ioprio;
140         unsigned short ioprio_class;
141
142         pid_t pid;
143
144         u32 seek_history;
145         sector_t last_request_pos;
146
147         struct cfq_rb_root *service_tree;
148         struct cfq_queue *new_cfqq;
149         struct cfq_group *cfqg;
150         /* Number of sectors dispatched from queue in single dispatch round */
151         unsigned long nr_sectors;
152 };
153
154 /*
155  * First index in the service_trees.
156  * IDLE is handled separately, so it has negative index
157  */
158 enum wl_class_t {
159         BE_WORKLOAD = 0,
160         RT_WORKLOAD = 1,
161         IDLE_WORKLOAD = 2,
162         CFQ_PRIO_NR,
163 };
164
165 /*
166  * Second index in the service_trees.
167  */
168 enum wl_type_t {
169         ASYNC_WORKLOAD = 0,
170         SYNC_NOIDLE_WORKLOAD = 1,
171         SYNC_WORKLOAD = 2
172 };
173
174 struct cfqg_stats {
175 #ifdef CONFIG_CFQ_GROUP_IOSCHED
176         /* total bytes transferred */
177         struct blkg_rwstat              service_bytes;
178         /* total IOs serviced, post merge */
179         struct blkg_rwstat              serviced;
180         /* number of ios merged */
181         struct blkg_rwstat              merged;
182         /* total time spent on device in ns, may not be accurate w/ queueing */
183         struct blkg_rwstat              service_time;
184         /* total time spent waiting in scheduler queue in ns */
185         struct blkg_rwstat              wait_time;
186         /* number of IOs queued up */
187         struct blkg_rwstat              queued;
188         /* total sectors transferred */
189         struct blkg_stat                sectors;
190         /* total disk time and nr sectors dispatched by this group */
191         struct blkg_stat                time;
192 #ifdef CONFIG_DEBUG_BLK_CGROUP
193         /* time not charged to this cgroup */
194         struct blkg_stat                unaccounted_time;
195         /* sum of number of ios queued across all samples */
196         struct blkg_stat                avg_queue_size_sum;
197         /* count of samples taken for average */
198         struct blkg_stat                avg_queue_size_samples;
199         /* how many times this group has been removed from service tree */
200         struct blkg_stat                dequeue;
201         /* total time spent waiting for it to be assigned a timeslice. */
202         struct blkg_stat                group_wait_time;
203         /* time spent idling for this blkcg_gq */
204         struct blkg_stat                idle_time;
205         /* total time with empty current active q with other requests queued */
206         struct blkg_stat                empty_time;
207         /* fields after this shouldn't be cleared on stat reset */
208         uint64_t                        start_group_wait_time;
209         uint64_t                        start_idle_time;
210         uint64_t                        start_empty_time;
211         uint16_t                        flags;
212 #endif  /* CONFIG_DEBUG_BLK_CGROUP */
213 #endif  /* CONFIG_CFQ_GROUP_IOSCHED */
214 };
215
216 /* This is per cgroup per device grouping structure */
217 struct cfq_group {
218         /* must be the first member */
219         struct blkg_policy_data pd;
220
221         /* group service_tree member */
222         struct rb_node rb_node;
223
224         /* group service_tree key */
225         u64 vdisktime;
226
227         /*
228          * The number of active cfqgs and sum of their weights under this
229          * cfqg.  This covers this cfqg's leaf_weight and all children's
230          * weights, but does not cover weights of further descendants.
231          *
232          * If a cfqg is on the service tree, it's active.  An active cfqg
233          * also activates its parent and contributes to the children_weight
234          * of the parent.
235          */
236         int nr_active;
237         unsigned int children_weight;
238
239         /*
240          * vfraction is the fraction of vdisktime that the tasks in this
241          * cfqg are entitled to.  This is determined by compounding the
242          * ratios walking up from this cfqg to the root.
243          *
244          * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
245          * vfractions on a service tree is approximately 1.  The sum may
246          * deviate a bit due to rounding errors and fluctuations caused by
247          * cfqgs entering and leaving the service tree.
248          */
249         unsigned int vfraction;
250
251         /*
252          * There are two weights - (internal) weight is the weight of this
253          * cfqg against the sibling cfqgs.  leaf_weight is the weight of
254          * this cfqg against the child cfqgs.  For the root cfqg, both
255          * weights are kept in sync for backward compatibility.
256          */
257         unsigned int weight;
258         unsigned int new_weight;
259         unsigned int dev_weight;
260
261         unsigned int leaf_weight;
262         unsigned int new_leaf_weight;
263         unsigned int dev_leaf_weight;
264
265         /* number of cfqq currently on this group */
266         int nr_cfqq;
267
268         /*
269          * Per-group busy queues average. Useful for workload slice calc. We
270          * create the array for each prio class, but at run time it is used
271          * only for the RT and BE classes; the slot for the IDLE class remains
272          * unused. This is primarily done to avoid confusion and a gcc warning.
273          */
274         unsigned int busy_queues_avg[CFQ_PRIO_NR];
275         /*
276          * rr lists of queues with requests. We maintain service trees for
277          * RT and BE classes. These trees are subdivided in subclasses
278          * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
279          * class there is no subclassification and all the cfq queues go on
280          * a single tree service_tree_idle.
281          * Counts are embedded in the cfq_rb_root
282          */
283         struct cfq_rb_root service_trees[2][3];
284         struct cfq_rb_root service_tree_idle;
285
286         unsigned long saved_wl_slice;
287         enum wl_type_t saved_wl_type;
288         enum wl_class_t saved_wl_class;
289
290         /* number of requests that are on the dispatch list or inside driver */
291         int dispatched;
292         struct cfq_ttime ttime;
293         struct cfqg_stats stats;
294 };
295
296 struct cfq_io_cq {
297         struct io_cq            icq;            /* must be the first member */
298         struct cfq_queue        *cfqq[2];
299         struct cfq_ttime        ttime;
300         int                     ioprio;         /* the current ioprio */
301 #ifdef CONFIG_CFQ_GROUP_IOSCHED
302         uint64_t                blkcg_id;       /* the current blkcg ID */
303 #endif
304 };
305
306 /*
307  * Per block device queue structure
308  */
309 struct cfq_data {
310         struct request_queue *queue;
311         /* Root service tree for cfq_groups */
312         struct cfq_rb_root grp_service_tree;
313         struct cfq_group *root_group;
314
315         /*
316          * The priority currently being served
317          */
318         enum wl_class_t serving_wl_class;
319         enum wl_type_t serving_wl_type;
320         unsigned long workload_expires;
321         struct cfq_group *serving_group;
322
323         /*
324          * Each priority tree is sorted by next_request position.  These
325          * trees are used when determining if two or more queues are
326          * interleaving requests (see cfq_close_cooperator).
327          */
328         struct rb_root prio_trees[CFQ_PRIO_LISTS];
329
330         unsigned int busy_queues;
331         unsigned int busy_sync_queues;
332
333         int rq_in_driver;
334         int rq_in_flight[2];
335
336         /*
337          * queue-depth detection
338          */
339         int rq_queued;
340         int hw_tag;
341         /*
342          * hw_tag can be
343          * -1 => indeterminate (cfq will behave as if NCQ is present, to allow better detection)
344          *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
345          *  0 => no NCQ
346          */
347         int hw_tag_est_depth;
348         unsigned int hw_tag_samples;
349
350         /*
351          * idle window management
352          */
353         struct timer_list idle_slice_timer;
354         struct work_struct unplug_work;
355
356         struct cfq_queue *active_queue;
357         struct cfq_io_cq *active_cic;
358
359         /*
360          * async queue for each priority case
361          */
362         struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
363         struct cfq_queue *async_idle_cfqq;
364
365         sector_t last_position;
366
367         /*
368          * tunables, see top of file
369          */
370         unsigned int cfq_quantum;
371         unsigned int cfq_fifo_expire[2];
372         unsigned int cfq_back_penalty;
373         unsigned int cfq_back_max;
374         unsigned int cfq_slice[2];
375         unsigned int cfq_slice_async_rq;
376         unsigned int cfq_slice_idle;
377         unsigned int cfq_group_idle;
378         unsigned int cfq_latency;
379         unsigned int cfq_target_latency;
380
381         /*
382          * Fallback dummy cfqq for extreme OOM conditions
383          */
384         struct cfq_queue oom_cfqq;
385
386         unsigned long last_delayed_sync;
387 };
388
389 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
390
391 static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
392                                             enum wl_class_t class,
393                                             enum wl_type_t type)
394 {
395         if (!cfqg)
396                 return NULL;
397
398         if (class == IDLE_WORKLOAD)
399                 return &cfqg->service_tree_idle;
400
401         return &cfqg->service_trees[class][type];
402 }
403
404 enum cfqq_state_flags {
405         CFQ_CFQQ_FLAG_on_rr = 0,        /* on round-robin busy list */
406         CFQ_CFQQ_FLAG_wait_request,     /* waiting for a request */
407         CFQ_CFQQ_FLAG_must_dispatch,    /* must be allowed a dispatch */
408         CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */
409         CFQ_CFQQ_FLAG_fifo_expire,      /* FIFO checked in this slice */
410         CFQ_CFQQ_FLAG_idle_window,      /* slice idling enabled */
411         CFQ_CFQQ_FLAG_prio_changed,     /* task priority has changed */
412         CFQ_CFQQ_FLAG_slice_new,        /* no requests dispatched in slice */
413         CFQ_CFQQ_FLAG_sync,             /* synchronous queue */
414         CFQ_CFQQ_FLAG_coop,             /* cfqq is shared */
415         CFQ_CFQQ_FLAG_split_coop,       /* shared cfqq will be split */
416         CFQ_CFQQ_FLAG_deep,             /* sync cfqq experienced large depth */
417         CFQ_CFQQ_FLAG_wait_busy,        /* Waiting for next request */
418 };
419
420 #define CFQ_CFQQ_FNS(name)                                              \
421 static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)         \
422 {                                                                       \
423         (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);                   \
424 }                                                                       \
425 static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)        \
426 {                                                                       \
427         (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);                  \
428 }                                                                       \
429 static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)         \
430 {                                                                       \
431         return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;      \
432 }
433
434 CFQ_CFQQ_FNS(on_rr);
435 CFQ_CFQQ_FNS(wait_request);
436 CFQ_CFQQ_FNS(must_dispatch);
437 CFQ_CFQQ_FNS(must_alloc_slice);
438 CFQ_CFQQ_FNS(fifo_expire);
439 CFQ_CFQQ_FNS(idle_window);
440 CFQ_CFQQ_FNS(prio_changed);
441 CFQ_CFQQ_FNS(slice_new);
442 CFQ_CFQQ_FNS(sync);
443 CFQ_CFQQ_FNS(coop);
444 CFQ_CFQQ_FNS(split_coop);
445 CFQ_CFQQ_FNS(deep);
446 CFQ_CFQQ_FNS(wait_busy);
447 #undef CFQ_CFQQ_FNS
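
/*
 * Illustrative user-space sketch (not part of the kernel file): the same
 * flag-accessor pattern as CFQ_CFQQ_FNS() above, applied to a dummy struct
 * so the mark/clear/test helpers the macro generates can be seen in
 * isolation.  All ex_* names are invented for the example.
 */
#include <stdio.h>

struct ex_queue {
        unsigned int flags;
};

enum ex_state_flags {
        EX_FLAG_on_rr = 0,              /* mirrors CFQ_CFQQ_FLAG_on_rr */
};

#define EX_FNS(name)                                                    \
static void ex_mark_##name(struct ex_queue *q)                          \
{                                                                       \
        q->flags |= (1 << EX_FLAG_##name);                              \
}                                                                       \
static void ex_clear_##name(struct ex_queue *q)                         \
{                                                                       \
        q->flags &= ~(1 << EX_FLAG_##name);                             \
}                                                                       \
static int ex_##name(const struct ex_queue *q)                          \
{                                                                       \
        return (q->flags & (1 << EX_FLAG_##name)) != 0;                 \
}

EX_FNS(on_rr)
#undef EX_FNS

int main(void)
{
        struct ex_queue q = { 0 };

        ex_mark_on_rr(&q);              /* set the on_rr bit */
        printf("on_rr=%d\n", ex_on_rr(&q));
        ex_clear_on_rr(&q);             /* clear it again */
        printf("on_rr=%d\n", ex_on_rr(&q));
        return 0;
}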
448
449 static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
450 {
451         return pd ? container_of(pd, struct cfq_group, pd) : NULL;
452 }
453
454 static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
455 {
456         return pd_to_blkg(&cfqg->pd);
457 }
458
459 #if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
460
461 /* cfqg stats flags */
462 enum cfqg_stats_flags {
463         CFQG_stats_waiting = 0,
464         CFQG_stats_idling,
465         CFQG_stats_empty,
466 };
467
468 #define CFQG_FLAG_FNS(name)                                             \
469 static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats)     \
470 {                                                                       \
471         stats->flags |= (1 << CFQG_stats_##name);                       \
472 }                                                                       \
473 static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats)    \
474 {                                                                       \
475         stats->flags &= ~(1 << CFQG_stats_##name);                      \
476 }                                                                       \
477 static inline int cfqg_stats_##name(struct cfqg_stats *stats)           \
478 {                                                                       \
479         return (stats->flags & (1 << CFQG_stats_##name)) != 0;          \
480 }                                                                       \
481
482 CFQG_FLAG_FNS(waiting)
483 CFQG_FLAG_FNS(idling)
484 CFQG_FLAG_FNS(empty)
485 #undef CFQG_FLAG_FNS
486
487 /* This should be called with the queue_lock held. */
488 static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
489 {
490         unsigned long long now;
491
492         if (!cfqg_stats_waiting(stats))
493                 return;
494
495         now = sched_clock();
496         if (time_after64(now, stats->start_group_wait_time))
497                 blkg_stat_add(&stats->group_wait_time,
498                               now - stats->start_group_wait_time);
499         cfqg_stats_clear_waiting(stats);
500 }
501
502 /* This should be called with the queue_lock held. */
503 static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
504                                                  struct cfq_group *curr_cfqg)
505 {
506         struct cfqg_stats *stats = &cfqg->stats;
507
508         if (cfqg_stats_waiting(stats))
509                 return;
510         if (cfqg == curr_cfqg)
511                 return;
512         stats->start_group_wait_time = sched_clock();
513         cfqg_stats_mark_waiting(stats);
514 }
515
516 /* This should be called with the queue_lock held. */
517 static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
518 {
519         unsigned long long now;
520
521         if (!cfqg_stats_empty(stats))
522                 return;
523
524         now = sched_clock();
525         if (time_after64(now, stats->start_empty_time))
526                 blkg_stat_add(&stats->empty_time,
527                               now - stats->start_empty_time);
528         cfqg_stats_clear_empty(stats);
529 }
530
531 static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
532 {
533         blkg_stat_add(&cfqg->stats.dequeue, 1);
534 }
535
536 static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
537 {
538         struct cfqg_stats *stats = &cfqg->stats;
539
540         if (blkg_rwstat_sum(&stats->queued))
541                 return;
542
543         /*
544          * The group is already marked empty. This can happen if a cfqq got a
545          * new request in the parent group and moved to this group while being
546          * added to the service tree. Just ignore the event and move on.
547          */
548         if (cfqg_stats_empty(stats))
549                 return;
550
551         stats->start_empty_time = sched_clock();
552         cfqg_stats_mark_empty(stats);
553 }
554
555 static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
556 {
557         struct cfqg_stats *stats = &cfqg->stats;
558
559         if (cfqg_stats_idling(stats)) {
560                 unsigned long long now = sched_clock();
561
562                 if (time_after64(now, stats->start_idle_time))
563                         blkg_stat_add(&stats->idle_time,
564                                       now - stats->start_idle_time);
565                 cfqg_stats_clear_idling(stats);
566         }
567 }
568
569 static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
570 {
571         struct cfqg_stats *stats = &cfqg->stats;
572
573         BUG_ON(cfqg_stats_idling(stats));
574
575         stats->start_idle_time = sched_clock();
576         cfqg_stats_mark_idling(stats);
577 }
578
579 static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
580 {
581         struct cfqg_stats *stats = &cfqg->stats;
582
583         blkg_stat_add(&stats->avg_queue_size_sum,
584                       blkg_rwstat_sum(&stats->queued));
585         blkg_stat_add(&stats->avg_queue_size_samples, 1);
586         cfqg_stats_update_group_wait_time(stats);
587 }
588
589 #else   /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
590
591 static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { }
592 static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
593 static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
594 static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
595 static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
596 static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
597 static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
598
599 #endif  /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
600
601 #ifdef CONFIG_CFQ_GROUP_IOSCHED
602
603 static struct blkcg_policy blkcg_policy_cfq;
604
605 static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
606 {
607         return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
608 }
609
610 /*
611  * Determine the parent cfqg for weight calculation.  Currently, cfqg
612  * scheduling is flat and the root is the parent of everyone else.
613  */
614 static inline struct cfq_group *cfqg_flat_parent(struct cfq_group *cfqg)
615 {
616         struct blkcg_gq *blkg = cfqg_to_blkg(cfqg);
617         struct cfq_group *root;
618
619         while (blkg->parent)
620                 blkg = blkg->parent;
621         root = blkg_to_cfqg(blkg);
622
623         return root != cfqg ? root : NULL;
624 }
625
626 static inline void cfqg_get(struct cfq_group *cfqg)
627 {
628         return blkg_get(cfqg_to_blkg(cfqg));
629 }
630
631 static inline void cfqg_put(struct cfq_group *cfqg)
632 {
633         return blkg_put(cfqg_to_blkg(cfqg));
634 }
635
636 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...)  do {                    \
637         char __pbuf[128];                                               \
638                                                                         \
639         blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf));  \
640         blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c %s " fmt, (cfqq)->pid, \
641                         cfq_cfqq_sync((cfqq)) ? 'S' : 'A',              \
642                         cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
643                           __pbuf, ##args);                              \
644 } while (0)
645
646 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...)  do {                    \
647         char __pbuf[128];                                               \
648                                                                         \
649         blkg_path(cfqg_to_blkg(cfqg), __pbuf, sizeof(__pbuf));          \
650         blk_add_trace_msg((cfqd)->queue, "%s " fmt, __pbuf, ##args);    \
651 } while (0)
652
653 static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
654                                             struct cfq_group *curr_cfqg, int rw)
655 {
656         blkg_rwstat_add(&cfqg->stats.queued, rw, 1);
657         cfqg_stats_end_empty_time(&cfqg->stats);
658         cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
659 }
660
661 static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
662                         unsigned long time, unsigned long unaccounted_time)
663 {
664         blkg_stat_add(&cfqg->stats.time, time);
665 #ifdef CONFIG_DEBUG_BLK_CGROUP
666         blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
667 #endif
668 }
669
670 static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw)
671 {
672         blkg_rwstat_add(&cfqg->stats.queued, rw, -1);
673 }
674
675 static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw)
676 {
677         blkg_rwstat_add(&cfqg->stats.merged, rw, 1);
678 }
679
680 static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
681                                               uint64_t bytes, int rw)
682 {
683         blkg_stat_add(&cfqg->stats.sectors, bytes >> 9);
684         blkg_rwstat_add(&cfqg->stats.serviced, rw, 1);
685         blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes);
686 }
687
688 static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
689                         uint64_t start_time, uint64_t io_start_time, int rw)
690 {
691         struct cfqg_stats *stats = &cfqg->stats;
692         unsigned long long now = sched_clock();
693
694         if (time_after64(now, io_start_time))
695                 blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
696         if (time_after64(io_start_time, start_time))
697                 blkg_rwstat_add(&stats->wait_time, rw,
698                                 io_start_time - start_time);
699 }
700
701 static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
702 {
703         struct cfq_group *cfqg = blkg_to_cfqg(blkg);
704         struct cfqg_stats *stats = &cfqg->stats;
705
706         /* queued stats shouldn't be cleared */
707         blkg_rwstat_reset(&stats->service_bytes);
708         blkg_rwstat_reset(&stats->serviced);
709         blkg_rwstat_reset(&stats->merged);
710         blkg_rwstat_reset(&stats->service_time);
711         blkg_rwstat_reset(&stats->wait_time);
712         blkg_stat_reset(&stats->time);
713 #ifdef CONFIG_DEBUG_BLK_CGROUP
714         blkg_stat_reset(&stats->unaccounted_time);
715         blkg_stat_reset(&stats->avg_queue_size_sum);
716         blkg_stat_reset(&stats->avg_queue_size_samples);
717         blkg_stat_reset(&stats->dequeue);
718         blkg_stat_reset(&stats->group_wait_time);
719         blkg_stat_reset(&stats->idle_time);
720         blkg_stat_reset(&stats->empty_time);
721 #endif
722 }
723
724 #else   /* CONFIG_CFQ_GROUP_IOSCHED */
725
726 static inline struct cfq_group *cfqg_flat_parent(struct cfq_group *cfqg) { return NULL; }
727 static inline void cfqg_get(struct cfq_group *cfqg) { }
728 static inline void cfqg_put(struct cfq_group *cfqg) { }
729
730 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...)  \
731         blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid, \
732                         cfq_cfqq_sync((cfqq)) ? 'S' : 'A',              \
733                         cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
734                                 ##args)
735 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...)          do {} while (0)
736
737 static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
738                         struct cfq_group *curr_cfqg, int rw) { }
739 static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
740                         unsigned long time, unsigned long unaccounted_time) { }
741 static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { }
742 static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { }
743 static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
744                                               uint64_t bytes, int rw) { }
745 static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
746                         uint64_t start_time, uint64_t io_start_time, int rw) { }
747
748 #endif  /* CONFIG_CFQ_GROUP_IOSCHED */
749
750 #define cfq_log(cfqd, fmt, args...)     \
751         blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
752
753 /* Traverses through cfq group service trees */
754 #define for_each_cfqg_st(cfqg, i, j, st) \
755         for (i = 0; i <= IDLE_WORKLOAD; i++) \
756                 for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
757                         : &cfqg->service_tree_idle; \
758                         (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
759                         (i == IDLE_WORKLOAD && j == 0); \
760                         j++, st = i < IDLE_WORKLOAD ? \
761                         &cfqg->service_trees[i][j]: NULL) \
762
763 static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,
764         struct cfq_ttime *ttime, bool group_idle)
765 {
766         unsigned long slice;
767         if (!sample_valid(ttime->ttime_samples))
768                 return false;
769         if (group_idle)
770                 slice = cfqd->cfq_group_idle;
771         else
772                 slice = cfqd->cfq_slice_idle;
773         return ttime->ttime_mean > slice;
774 }
775
776 static inline bool iops_mode(struct cfq_data *cfqd)
777 {
778         /*
779          * If we are not idling on queues and it is an NCQ drive, requests
780          * execute in parallel and measuring time is not possible in most
781          * cases, unless we drive shallow queue depths, and that becomes a
782          * performance bottleneck. In such cases switch to providing
783          * fairness in terms of the number of IOs.
784          */
785         if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
786                 return true;
787         else
788                 return false;
789 }
790
791 static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq)
792 {
793         if (cfq_class_idle(cfqq))
794                 return IDLE_WORKLOAD;
795         if (cfq_class_rt(cfqq))
796                 return RT_WORKLOAD;
797         return BE_WORKLOAD;
798 }
799
800
801 static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
802 {
803         if (!cfq_cfqq_sync(cfqq))
804                 return ASYNC_WORKLOAD;
805         if (!cfq_cfqq_idle_window(cfqq))
806                 return SYNC_NOIDLE_WORKLOAD;
807         return SYNC_WORKLOAD;
808 }
809
810 static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class,
811                                         struct cfq_data *cfqd,
812                                         struct cfq_group *cfqg)
813 {
814         if (wl_class == IDLE_WORKLOAD)
815                 return cfqg->service_tree_idle.count;
816
817         return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count +
818                 cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count +
819                 cfqg->service_trees[wl_class][SYNC_WORKLOAD].count;
820 }
821
822 static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
823                                         struct cfq_group *cfqg)
824 {
825         return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count +
826                 cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
827 }
828
829 static void cfq_dispatch_insert(struct request_queue *, struct request *);
830 static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
831                                        struct cfq_io_cq *cic, struct bio *bio,
832                                        gfp_t gfp_mask);
833
834 static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
835 {
836         /* cic->icq is the first member, %NULL will convert to %NULL */
837         return container_of(icq, struct cfq_io_cq, icq);
838 }
839
840 static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,
841                                                struct io_context *ioc)
842 {
843         if (ioc)
844                 return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
845         return NULL;
846 }
847
848 static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
849 {
850         return cic->cfqq[is_sync];
851 }
852
853 static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,
854                                 bool is_sync)
855 {
856         cic->cfqq[is_sync] = cfqq;
857 }
858
859 static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
860 {
861         return cic->icq.q->elevator->elevator_data;
862 }
863
864 /*
865  * We regard a request as SYNC if it's either a read or has the SYNC bit
866  * set (in which case it could also be a direct WRITE).
867  */
868 static inline bool cfq_bio_sync(struct bio *bio)
869 {
870         return bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC);
871 }
872
873 /*
874  * Schedule a run of the queue if there are requests pending and no one in
875  * the driver that will restart queueing.
876  */
877 static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
878 {
879         if (cfqd->busy_queues) {
880                 cfq_log(cfqd, "schedule dispatch");
881                 kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
882         }
883 }
884
885 /*
886  * Scale schedule slice based on io priority. Use the sync time slice only
887  * if a queue is marked sync and has sync io queued. A sync queue with async
888  * io only should not get the full sync slice length.
889  */
890 static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync,
891                                  unsigned short prio)
892 {
893         const int base_slice = cfqd->cfq_slice[sync];
894
895         WARN_ON(prio >= IOPRIO_BE_NR);
896
897         return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
898 }
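
/*
 * Illustrative user-space sketch (not part of the kernel file): evaluates
 * the priority scaling formula of cfq_prio_slice() above for every BE
 * priority level.  HZ=1000 and the sync base slice of HZ/10 from the
 * tunables at the top are assumed for the example.
 */
#include <stdio.h>

#define EX_SLICE_SCALE  5       /* CFQ_SLICE_SCALE */
#define EX_IOPRIO_BE_NR 8       /* IOPRIO_BE_NR */

int main(void)
{
        const int base_slice = 1000 / 10;       /* cfq_slice_sync with HZ=1000 */
        int prio;

        /*
         * prio 0 (highest) => 180 jiffies, prio 4 (default) => 100,
         * prio 7 (lowest) => 40: each priority step is worth base_slice/5.
         */
        for (prio = 0; prio < EX_IOPRIO_BE_NR; prio++)
                printf("prio %d: slice %d\n", prio,
                       base_slice + (base_slice / EX_SLICE_SCALE * (4 - prio)));
        return 0;
}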
899
900 static inline int
901 cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
902 {
903         return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
904 }
905
906 /**
907  * cfqg_scale_charge - scale disk time charge according to cfqg weight
908  * @charge: disk time being charged
909  * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT
910  *
911  * Scale @charge according to @vfraction, which is in range (0, 1].  The
912  * scaling is inversely proportional.
913  *
914  * scaled = charge / vfraction
915  *
916  * The result is also in fixed point w/ CFQ_SERVICE_SHIFT.
917  */
918 static inline u64 cfqg_scale_charge(unsigned long charge,
919                                     unsigned int vfraction)
920 {
921         u64 c = charge << CFQ_SERVICE_SHIFT;    /* make it fixed point */
922
923         /* charge / vfraction */
924         c <<= CFQ_SERVICE_SHIFT;
925         do_div(c, vfraction);
926         return c;
927 }
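
/*
 * Illustrative user-space sketch (not part of the kernel file): mirrors the
 * fixed-point math of cfqg_scale_charge() above with plain 64-bit division
 * so the scaling can be checked with concrete numbers.  The constant and
 * the sample values below are assumptions chosen for the example.
 */
#include <stdio.h>
#include <stdint.h>

#define EX_SERVICE_SHIFT 12     /* same shift as CFQ_SERVICE_SHIFT */

static uint64_t ex_scale_charge(uint64_t charge, unsigned int vfraction)
{
        uint64_t c = charge << EX_SERVICE_SHIFT;        /* charge in fixed point */

        c <<= EX_SERVICE_SHIFT;         /* cancel the fixed point in vfraction */
        return c / vfraction;           /* stands in for do_div() */
}

int main(void)
{
        /* a group entitled to half the device: vfraction = 0.5 * 2^12 */
        unsigned int vfraction = 1 << (EX_SERVICE_SHIFT - 1);
        uint64_t charge = 10;           /* 10 jiffies of disk time used */

        /* prints 81920 (20 << 12): owning half the device doubles the charge */
        printf("scaled charge = %llu\n",
               (unsigned long long)ex_scale_charge(charge, vfraction));
        return 0;
}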
928
929 static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
930 {
931         s64 delta = (s64)(vdisktime - min_vdisktime);
932         if (delta > 0)
933                 min_vdisktime = vdisktime;
934
935         return min_vdisktime;
936 }
937
938 static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
939 {
940         s64 delta = (s64)(vdisktime - min_vdisktime);
941         if (delta < 0)
942                 min_vdisktime = vdisktime;
943
944         return min_vdisktime;
945 }
946
947 static void update_min_vdisktime(struct cfq_rb_root *st)
948 {
949         struct cfq_group *cfqg;
950
951         if (st->left) {
952                 cfqg = rb_entry_cfqg(st->left);
953                 st->min_vdisktime = max_vdisktime(st->min_vdisktime,
954                                                   cfqg->vdisktime);
955         }
956 }
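
/*
 * Illustrative user-space sketch (not part of the kernel file): shows why
 * max_vdisktime()/min_vdisktime() above compare through a signed delta
 * instead of a plain '>'.  The sample values are assumptions picked to sit
 * on either side of a u64 wraparound.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t ex_max_vdisktime(uint64_t min_vdisktime, uint64_t vdisktime)
{
        int64_t delta = (int64_t)(vdisktime - min_vdisktime);

        if (delta > 0)
                min_vdisktime = vdisktime;
        return min_vdisktime;
}

int main(void)
{
        uint64_t before_wrap = UINT64_MAX - 100;        /* just below wraparound */
        uint64_t after_wrap = 50;                       /* just past wraparound */

        /* a plain '>' picks before_wrap; the signed delta picks after_wrap */
        printf("naive max: %llu\n",
               (unsigned long long)(after_wrap > before_wrap ? after_wrap : before_wrap));
        printf("wrap-safe max: %llu\n",
               (unsigned long long)ex_max_vdisktime(before_wrap, after_wrap));
        return 0;
}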
957
958 /*
959  * Get the averaged number of queues of RT/BE priority.
960  * The average is updated with a formula that gives more weight to higher
961  * numbers, so that it quickly follows sudden increases and decreases slowly.
962  */
963
964 static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
965                                         struct cfq_group *cfqg, bool rt)
966 {
967         unsigned min_q, max_q;
968         unsigned mult  = cfq_hist_divisor - 1;
969         unsigned round = cfq_hist_divisor / 2;
970         unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
971
972         min_q = min(cfqg->busy_queues_avg[rt], busy);
973         max_q = max(cfqg->busy_queues_avg[rt], busy);
974         cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
975                 cfq_hist_divisor;
976         return cfqg->busy_queues_avg[rt];
977 }
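
/*
 * Illustrative user-space sketch (not part of the kernel file): runs the
 * asymmetric average used in cfq_group_get_avg_queues() above against a
 * made-up busy-queue trace.  cfq_hist_divisor = 4 as in the tunables at the
 * top of the file; the trace values are assumptions for the example.
 */
#include <stdio.h>

#define EX_HIST_DIVISOR 4

static unsigned ex_update_avg(unsigned avg, unsigned busy)
{
        unsigned mult = EX_HIST_DIVISOR - 1;
        unsigned round = EX_HIST_DIVISOR / 2;
        unsigned min_q = busy < avg ? busy : avg;
        unsigned max_q = busy > avg ? busy : avg;

        return (mult * max_q + min_q + round) / EX_HIST_DIVISOR;
}

int main(void)
{
        unsigned trace[] = { 2, 10, 10, 2, 2, 2 };      /* assumed busy counts */
        unsigned avg = 0;
        unsigned i;

        /*
         * The estimate climbs to 10 within a couple of samples of the spike,
         * but decays only one step at a time afterwards (2, 8, 10, 8, 7, 6).
         */
        for (i = 0; i < sizeof(trace) / sizeof(trace[0]); i++) {
                avg = ex_update_avg(avg, trace[i]);
                printf("busy=%u -> avg=%u\n", trace[i], avg);
        }
        return 0;
}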
978
979 static inline unsigned
980 cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
981 {
982         struct cfq_rb_root *st = &cfqd->grp_service_tree;
983
984         return cfqd->cfq_target_latency * cfqg->weight / st->total_weight;
985 }
986
987 static inline unsigned
988 cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
989 {
990         unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
991         if (cfqd->cfq_latency) {
992                 /*
993                  * interested queues (we consider only the ones with the same
994                  * priority class in the cfq group)
995                  */
996                 unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
997                                                 cfq_class_rt(cfqq));
998                 unsigned sync_slice = cfqd->cfq_slice[1];
999                 unsigned expect_latency = sync_slice * iq;
1000                 unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
1001
1002                 if (expect_latency > group_slice) {
1003                         unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
1004                         /* scale low_slice according to IO priority
1005                          * and sync vs async */
1006                         unsigned low_slice =
1007                                 min(slice, base_low_slice * slice / sync_slice);
1008                         /* the adapted slice value is scaled to fit all iqs
1009                          * into the target latency */
1010                         slice = max(slice * group_slice / expect_latency,
1011                                     low_slice);
1012                 }
1013         }
1014         return slice;
1015 }
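
/*
 * Illustrative user-space sketch (not part of the kernel file): plugs
 * concrete numbers into the latency shaping done by cfq_group_slice() and
 * cfq_scaled_cfqq_slice() above.  HZ=1000, the default tunables from the
 * top of the file, a single group owning all of the weight and five busy
 * sync queues are assumed.
 */
#include <stdio.h>

int main(void)
{
        const unsigned hz = 1000;
        const unsigned sync_slice = hz / 10;            /* cfq_slice_sync */
        const unsigned slice_idle = hz / 125;           /* cfq_slice_idle */
        const unsigned target_latency = hz * 3 / 10;    /* cfq_target_latency */

        unsigned slice = sync_slice;    /* prio 4 sync queue: unscaled slice */
        unsigned iq = 5;                /* assumed busy sync queues in the group */
        unsigned group_slice = target_latency;  /* weight / total_weight == 1 */
        unsigned expect_latency = sync_slice * iq;

        if (expect_latency > group_slice) {
                unsigned base_low_slice = 2 * slice_idle;
                unsigned low_slice = base_low_slice * slice / sync_slice;

                if (low_slice > slice)
                        low_slice = slice;
                slice = slice * group_slice / expect_latency;
                if (slice < low_slice)
                        slice = low_slice;
        }
        /* prints 60: five queues are squeezed into the 300-jiffy target round */
        printf("scaled slice = %u jiffies\n", slice);
        return 0;
}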
1016
1017 static inline void
1018 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1019 {
1020         unsigned slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
1021
1022         cfqq->slice_start = jiffies;
1023         cfqq->slice_end = jiffies + slice;
1024         cfqq->allocated_slice = slice;
1025         cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
1026 }
1027
1028 /*
1029  * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
1030  * isn't valid until the first request from the dispatch is activated
1031  * and the slice time set.
1032  */
1033 static inline bool cfq_slice_used(struct cfq_queue *cfqq)
1034 {
1035         if (cfq_cfqq_slice_new(cfqq))
1036                 return false;
1037         if (time_before(jiffies, cfqq->slice_end))
1038                 return false;
1039
1040         return true;
1041 }
1042
1043 /*
1044  * Lifted from AS - choose which of rq1 and rq2 is best served now.
1045  * We choose the request that is closest to the head right now. Distance
1046  * behind the head is penalized and only allowed to a certain extent.
1047  */
1048 static struct request *
1049 cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
1050 {
1051         sector_t s1, s2, d1 = 0, d2 = 0;
1052         unsigned long back_max;
1053 #define CFQ_RQ1_WRAP    0x01 /* request 1 wraps */
1054 #define CFQ_RQ2_WRAP    0x02 /* request 2 wraps */
1055         unsigned wrap = 0; /* bit mask: requests behind the disk head? */
1056
1057         if (rq1 == NULL || rq1 == rq2)
1058                 return rq2;
1059         if (rq2 == NULL)
1060                 return rq1;
1061
1062         if (rq_is_sync(rq1) != rq_is_sync(rq2))
1063                 return rq_is_sync(rq1) ? rq1 : rq2;
1064
1065         if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO)
1066                 return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2;
1067
1068         s1 = blk_rq_pos(rq1);
1069         s2 = blk_rq_pos(rq2);
1070
1071         /*
1072          * by definition, 1KiB is 2 sectors
1073          */
1074         back_max = cfqd->cfq_back_max * 2;
1075
1076         /*
1077          * Strict one way elevator _except_ in the case where we allow
1078          * short backward seeks which are biased as twice the cost of a
1079          * similar forward seek.
1080          */
1081         if (s1 >= last)
1082                 d1 = s1 - last;
1083         else if (s1 + back_max >= last)
1084                 d1 = (last - s1) * cfqd->cfq_back_penalty;
1085         else
1086                 wrap |= CFQ_RQ1_WRAP;
1087
1088         if (s2 >= last)
1089                 d2 = s2 - last;
1090         else if (s2 + back_max >= last)
1091                 d2 = (last - s2) * cfqd->cfq_back_penalty;
1092         else
1093                 wrap |= CFQ_RQ2_WRAP;
1094
1095         /* Found required data */
1096
1097         /*
1098          * By doing switch() on the bit mask "wrap" we avoid having to
1099          * check two variables for all permutations: --> faster!
1100          */
1101         switch (wrap) {
1102         case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1103                 if (d1 < d2)
1104                         return rq1;
1105                 else if (d2 < d1)
1106                         return rq2;
1107                 else {
1108                         if (s1 >= s2)
1109                                 return rq1;
1110                         else
1111                                 return rq2;
1112                 }
1113
1114         case CFQ_RQ2_WRAP:
1115                 return rq1;
1116         case CFQ_RQ1_WRAP:
1117                 return rq2;
1118         case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
1119         default:
1120                 /*
1121                  * Since both rqs are wrapped,
1122                  * start with the one that's further behind head
1123                  * (--> only *one* back seek required),
1124                  * since back seek takes more time than forward.
1125                  */
1126                 if (s1 <= s2)
1127                         return rq1;
1128                 else
1129                         return rq2;
1130         }
1131 }
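
/*
 * Illustrative user-space sketch (not part of the kernel file): distills the
 * distance computation of cfq_choose_req() above for two non-wrapping
 * requests.  The head position and sector numbers are assumptions; the
 * default cfq_back_penalty of 2 is used.  Requests more than cfq_back_max
 * KiB behind the head go through the wrap logic instead and are ignored here.
 */
#include <stdio.h>

static unsigned long ex_distance(unsigned long pos, unsigned long last,
                                 unsigned int back_penalty)
{
        /* forward seeks cost their distance, backward seeks twice as much */
        if (pos >= last)
                return pos - last;
        return (last - pos) * back_penalty;
}

int main(void)
{
        unsigned long last = 1000;      /* assumed current head position */

        /* 200 sectors ahead vs 50 behind: the short back seek wins */
        printf("d(1200)=%lu d(950)=%lu\n",
               ex_distance(1200, last, 2), ex_distance(950, last, 2));

        /* 200 sectors ahead vs 200 behind: the forward request wins */
        printf("d(1200)=%lu d(800)=%lu\n",
               ex_distance(1200, last, 2), ex_distance(800, last, 2));
        return 0;
}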
1132
1133 /*
1134  * The below is the leftmost-cache addon for the rbtree
1135  */
1136 static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
1137 {
1138         /* Service tree is empty */
1139         if (!root->count)
1140                 return NULL;
1141
1142         if (!root->left)
1143                 root->left = rb_first(&root->rb);
1144
1145         if (root->left)
1146                 return rb_entry(root->left, struct cfq_queue, rb_node);
1147
1148         return NULL;
1149 }
1150
1151 static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
1152 {
1153         if (!root->left)
1154                 root->left = rb_first(&root->rb);
1155
1156         if (root->left)
1157                 return rb_entry_cfqg(root->left);
1158
1159         return NULL;
1160 }
1161
1162 static void rb_erase_init(struct rb_node *n, struct rb_root *root)
1163 {
1164         rb_erase(n, root);
1165         RB_CLEAR_NODE(n);
1166 }
1167
1168 static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
1169 {
1170         if (root->left == n)
1171                 root->left = NULL;
1172         rb_erase_init(n, &root->rb);
1173         --root->count;
1174 }
1175
1176 /*
1177  * would be nice to take fifo expire time into account as well
1178  */
1179 static struct request *
1180 cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1181                   struct request *last)
1182 {
1183         struct rb_node *rbnext = rb_next(&last->rb_node);
1184         struct rb_node *rbprev = rb_prev(&last->rb_node);
1185         struct request *next = NULL, *prev = NULL;
1186
1187         BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1188
1189         if (rbprev)
1190                 prev = rb_entry_rq(rbprev);
1191
1192         if (rbnext)
1193                 next = rb_entry_rq(rbnext);
1194         else {
1195                 rbnext = rb_first(&cfqq->sort_list);
1196                 if (rbnext && rbnext != &last->rb_node)
1197                         next = rb_entry_rq(rbnext);
1198         }
1199
1200         return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
1201 }
1202
1203 static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
1204                                       struct cfq_queue *cfqq)
1205 {
1206         /*
1207          * just an approximation, should be ok.
1208          */
1209         return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
1210                        cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
1211 }
1212
1213 static inline s64
1214 cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
1215 {
1216         return cfqg->vdisktime - st->min_vdisktime;
1217 }
1218
1219 static void
1220 __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1221 {
1222         struct rb_node **node = &st->rb.rb_node;
1223         struct rb_node *parent = NULL;
1224         struct cfq_group *__cfqg;
1225         s64 key = cfqg_key(st, cfqg);
1226         int left = 1;
1227
1228         while (*node != NULL) {
1229                 parent = *node;
1230                 __cfqg = rb_entry_cfqg(parent);
1231
1232                 if (key < cfqg_key(st, __cfqg))
1233                         node = &parent->rb_left;
1234                 else {
1235                         node = &parent->rb_right;
1236                         left = 0;
1237                 }
1238         }
1239
1240         if (left)
1241                 st->left = &cfqg->rb_node;
1242
1243         rb_link_node(&cfqg->rb_node, parent, node);
1244         rb_insert_color(&cfqg->rb_node, &st->rb);
1245 }
1246
1247 static void
1248 cfq_update_group_weight(struct cfq_group *cfqg)
1249 {
1250         BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1251
1252         if (cfqg->new_weight) {
1253                 cfqg->weight = cfqg->new_weight;
1254                 cfqg->new_weight = 0;
1255         }
1256
1257         if (cfqg->new_leaf_weight) {
1258                 cfqg->leaf_weight = cfqg->new_leaf_weight;
1259                 cfqg->new_leaf_weight = 0;
1260         }
1261 }
1262
1263 static void
1264 cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1265 {
1266         unsigned int vfr = 1 << CFQ_SERVICE_SHIFT;      /* start with 1 */
1267         struct cfq_group *pos = cfqg;
1268         struct cfq_group *parent;
1269         bool propagate;
1270
1271         /* add to the service tree */
1272         BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1273
1274         cfq_update_group_weight(cfqg);
1275         __cfq_group_service_tree_add(st, cfqg);
1276         st->total_weight += cfqg->weight;
1277
1278         /*
1279          * Activate @cfqg and calculate the portion of vfraction @cfqg is
1280          * entitled to.  vfraction is calculated by walking the tree
1281          * towards the root calculating the fraction it has at each level.
1282          * The compounded ratio is how much vfraction @cfqg owns.
1283          *
1284          * Start with the proportion tasks in this cfqg has against active
1285          * children cfqgs - its leaf_weight against children_weight.
1286          */
1287         propagate = !pos->nr_active++;
1288         pos->children_weight += pos->leaf_weight;
1289         vfr = vfr * pos->leaf_weight / pos->children_weight;
1290
1291         /*
1292          * Compound ->weight walking up the tree.  Both activation and
1293          * vfraction calculation are done in the same loop.  Propagation
1294          * stops once an already activated node is met.  vfraction
1295          * calculation should always continue to the root.
1296          */
1297         while ((parent = cfqg_flat_parent(pos))) {
1298                 if (propagate) {
1299                         propagate = !parent->nr_active++;
1300                         parent->children_weight += pos->weight;
1301                 }
1302                 vfr = vfr * pos->weight / parent->children_weight;
1303                 pos = parent;
1304         }
1305
1306         cfqg->vfraction = max_t(unsigned, vfr, 1);
1307 }
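
/*
 * Illustrative user-space sketch (not part of the kernel file): compounds
 * weight ratios the way cfq_group_service_tree_add() above does, for an
 * assumed configuration of one cfqg with weight 100 under a root whose
 * active children_weight is 300 (e.g. a sibling of weight 200 is also
 * active).  With cfqg_flat_parent() the root is the only ancestor, so
 * there is exactly one compounding step above the leaf level.
 */
#include <stdio.h>

#define EX_SERVICE_SHIFT 12

int main(void)
{
        unsigned int vfr = 1 << EX_SERVICE_SHIFT;       /* start with 1.0 */

        /* leaf level: this cfqg's tasks vs its active children (none here) */
        vfr = vfr * 100 / 100;

        /* parent level: this cfqg's weight vs the root's children_weight */
        vfr = vfr * 100 / 300;

        /* prints 1365, i.e. roughly 1/3 in CFQ_SERVICE_SHIFT fixed point */
        printf("vfraction = %u/4096\n", vfr);
        return 0;
}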
1308
1309 static void
1310 cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
1311 {
1312         struct cfq_rb_root *st = &cfqd->grp_service_tree;
1313         struct cfq_group *__cfqg;
1314         struct rb_node *n;
1315
1316         cfqg->nr_cfqq++;
1317         if (!RB_EMPTY_NODE(&cfqg->rb_node))
1318                 return;
1319
1320         /*
1321          * Currently put the group at the end. Later implement something
1322          * so that groups get lesser vtime based on their weights, so that
1323          * a group does not lose everything if it was not continuously backlogged.
1324          */
1325         n = rb_last(&st->rb);
1326         if (n) {
1327                 __cfqg = rb_entry_cfqg(n);
1328                 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
1329         } else
1330                 cfqg->vdisktime = st->min_vdisktime;
1331         cfq_group_service_tree_add(st, cfqg);
1332 }
1333
1334 static void
1335 cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
1336 {
1337         struct cfq_group *pos = cfqg;
1338         bool propagate;
1339
1340         /*
1341          * Undo activation from cfq_group_service_tree_add().  Deactivate
1342          * @cfqg and propagate deactivation upwards.
1343          */
1344         propagate = !--pos->nr_active;
1345         pos->children_weight -= pos->leaf_weight;
1346
1347         while (propagate) {
1348                 struct cfq_group *parent = cfqg_flat_parent(pos);
1349
1350                 /* @pos has 0 nr_active at this point */
1351                 WARN_ON_ONCE(pos->children_weight);
1352                 pos->vfraction = 0;
1353
1354                 if (!parent)
1355                         break;
1356
1357                 propagate = !--parent->nr_active;
1358                 parent->children_weight -= pos->weight;
1359                 pos = parent;
1360         }
1361
1362         /* remove from the service tree */
1363         st->total_weight -= cfqg->weight;
1364         if (!RB_EMPTY_NODE(&cfqg->rb_node))
1365                 cfq_rb_erase(&cfqg->rb_node, st);
1366 }
1367
1368 static void
1369 cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
1370 {
1371         struct cfq_rb_root *st = &cfqd->grp_service_tree;
1372
1373         BUG_ON(cfqg->nr_cfqq < 1);
1374         cfqg->nr_cfqq--;
1375
1376         /* If there are other cfq queues under this group, don't delete it */
1377         if (cfqg->nr_cfqq)
1378                 return;
1379
1380         cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
1381         cfq_group_service_tree_del(st, cfqg);
1382         cfqg->saved_wl_slice = 0;
1383         cfqg_stats_update_dequeue(cfqg);
1384 }
1385
1386 static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
1387                                                 unsigned int *unaccounted_time)
1388 {
1389         unsigned int slice_used;
1390
1391         /*
1392          * Queue got expired before even a single request completed or
1393          * got expired immediately after first request completion.
1394          */
1395         if (!cfqq->slice_start || cfqq->slice_start == jiffies) {
1396                 /*
1397                  * Also charge the seek time incurred to the group; otherwise,
1398                  * if there are multiple queues in the group, each can dispatch
1399                  * a single request on seeky media and cause lots of seek time
1400                  * and the group will never know it.
1401                  */
1402                 slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start),
1403                                         1);
1404         } else {
1405                 slice_used = jiffies - cfqq->slice_start;
1406                 if (slice_used > cfqq->allocated_slice) {
1407                         *unaccounted_time = slice_used - cfqq->allocated_slice;
1408                         slice_used = cfqq->allocated_slice;
1409                 }
1410                 if (time_after(cfqq->slice_start, cfqq->dispatch_start))
1411                         *unaccounted_time += cfqq->slice_start -
1412                                         cfqq->dispatch_start;
1413         }
1414
1415         return slice_used;
1416 }
1417
1418 static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
1419                                 struct cfq_queue *cfqq)
1420 {
1421         struct cfq_rb_root *st = &cfqd->grp_service_tree;
1422         unsigned int used_sl, charge, unaccounted_sl = 0;
1423         int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
1424                         - cfqg->service_tree_idle.count;
1425         unsigned int vfr;
1426
1427         BUG_ON(nr_sync < 0);
1428         used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
1429
1430         if (iops_mode(cfqd))
1431                 charge = cfqq->slice_dispatch;
1432         else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
1433                 charge = cfqq->allocated_slice;
1434
1435         /*
1436          * Can't update vdisktime while on service tree and cfqg->vfraction
1437          * is valid only while on it.  Cache vfr, leave the service tree,
1438          * update vdisktime and go back on.  The re-addition to the tree
1439          * will also update the weights as necessary.
1440          */
1441         vfr = cfqg->vfraction;
1442         cfq_group_service_tree_del(st, cfqg);
1443         cfqg->vdisktime += cfqg_scale_charge(charge, vfr);
1444         cfq_group_service_tree_add(st, cfqg);
1445
1446         /* This group is being expired. Save the context */
1447         if (time_after(cfqd->workload_expires, jiffies)) {
1448                 cfqg->saved_wl_slice = cfqd->workload_expires
1449                                                 - jiffies;
1450                 cfqg->saved_wl_type = cfqd->serving_wl_type;
1451                 cfqg->saved_wl_class = cfqd->serving_wl_class;
1452         } else
1453                 cfqg->saved_wl_slice = 0;
1454
1455         cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
1456                                         st->min_vdisktime);
1457         cfq_log_cfqq(cfqq->cfqd, cfqq,
1458                      "sl_used=%u disp=%u charge=%u iops=%u sect=%lu",
1459                      used_sl, cfqq->slice_dispatch, charge,
1460                      iops_mode(cfqd), cfqq->nr_sectors);
1461         cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl);
1462         cfqg_stats_set_start_empty_time(cfqg);
1463 }
1464
1465 /**
1466  * cfq_init_cfqg_base - initialize base part of a cfq_group
1467  * @cfqg: cfq_group to initialize
1468  *
1469  * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED
1470  * is enabled or not.
1471  */
1472 static void cfq_init_cfqg_base(struct cfq_group *cfqg)
1473 {
1474         struct cfq_rb_root *st;
1475         int i, j;
1476
1477         for_each_cfqg_st(cfqg, i, j, st)
1478                 *st = CFQ_RB_ROOT;
1479         RB_CLEAR_NODE(&cfqg->rb_node);
1480
1481         cfqg->ttime.last_end_request = jiffies;
1482 }
1483
1484 #ifdef CONFIG_CFQ_GROUP_IOSCHED
1485 static void cfq_pd_init(struct blkcg_gq *blkg)
1486 {
1487         struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1488
1489         cfq_init_cfqg_base(cfqg);
1490         cfqg->weight = blkg->blkcg->cfq_weight;
1491         cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight;
1492 }
1493
1494 /*
1495  * Search for the cfq group the current task belongs to. The request_queue
1496  * lock must be held.
1497  */
1498 static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
1499                                                 struct blkcg *blkcg)
1500 {
1501         struct request_queue *q = cfqd->queue;
1502         struct cfq_group *cfqg = NULL;
1503
1504         /* avoid lookup for the common case where there's no blkcg */
1505         if (blkcg == &blkcg_root) {
1506                 cfqg = cfqd->root_group;
1507         } else {
1508                 struct blkcg_gq *blkg;
1509
1510                 blkg = blkg_lookup_create(blkcg, q);
1511                 if (!IS_ERR(blkg))
1512                         cfqg = blkg_to_cfqg(blkg);
1513         }
1514
1515         return cfqg;
1516 }
1517
1518 static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1519 {
1520         /* Currently, all async queues are mapped to the root group */
1521         if (!cfq_cfqq_sync(cfqq))
1522                 cfqg = cfqq->cfqd->root_group;
1523
1524         cfqq->cfqg = cfqg;
1525         /* cfqq reference on cfqg */
1526         cfqg_get(cfqg);
1527 }
1528
1529 static u64 cfqg_prfill_weight_device(struct seq_file *sf,
1530                                      struct blkg_policy_data *pd, int off)
1531 {
1532         struct cfq_group *cfqg = pd_to_cfqg(pd);
1533
1534         if (!cfqg->dev_weight)
1535                 return 0;
1536         return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
1537 }
1538
1539 static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
1540                                     struct seq_file *sf)
1541 {
1542         blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp),
1543                           cfqg_prfill_weight_device, &blkcg_policy_cfq, 0,
1544                           false);
1545         return 0;
1546 }
1547
1548 static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
1549                                           struct blkg_policy_data *pd, int off)
1550 {
1551         struct cfq_group *cfqg = pd_to_cfqg(pd);
1552
1553         if (!cfqg->dev_leaf_weight)
1554                 return 0;
1555         return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
1556 }
1557
1558 static int cfqg_print_leaf_weight_device(struct cgroup *cgrp,
1559                                          struct cftype *cft,
1560                                          struct seq_file *sf)
1561 {
1562         blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp),
1563                           cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, 0,
1564                           false);
1565         return 0;
1566 }
1567
1568 static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft,
1569                             struct seq_file *sf)
1570 {
1571         seq_printf(sf, "%u\n", cgroup_to_blkcg(cgrp)->cfq_weight);
1572         return 0;
1573 }
1574
1575 static int cfq_print_leaf_weight(struct cgroup *cgrp, struct cftype *cft,
1576                                  struct seq_file *sf)
1577 {
1578         seq_printf(sf, "%u\n",
1579                    cgroup_to_blkcg(cgrp)->cfq_leaf_weight);
1580         return 0;
1581 }
1582
1583 static int __cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
1584                                     const char *buf, bool is_leaf_weight)
1585 {
1586         struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1587         struct blkg_conf_ctx ctx;
1588         struct cfq_group *cfqg;
1589         int ret;
1590
1591         ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
1592         if (ret)
1593                 return ret;
1594
1595         ret = -EINVAL;
1596         cfqg = blkg_to_cfqg(ctx.blkg);
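             /*
              * A weight of 0 clears the per-device override; the ?: below then
              * makes the group fall back to the blkcg-wide default weight.
              */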
1597         if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
1598                 if (!is_leaf_weight) {
1599                         cfqg->dev_weight = ctx.v;
1600                         cfqg->new_weight = ctx.v ?: blkcg->cfq_weight;
1601                 } else {
1602                         cfqg->dev_leaf_weight = ctx.v;
1603                         cfqg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight;
1604                 }
1605                 ret = 0;
1606         }
1607
1608         blkg_conf_finish(&ctx);
1609         return ret;
1610 }
1611
1612 static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
1613                                   const char *buf)
1614 {
1615         return __cfqg_set_weight_device(cgrp, cft, buf, false);
1616 }
1617
1618 static int cfqg_set_leaf_weight_device(struct cgroup *cgrp, struct cftype *cft,
1619                                        const char *buf)
1620 {
1621         return __cfqg_set_weight_device(cgrp, cft, buf, true);
1622 }
1623
1624 static int __cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val,
1625                             bool is_leaf_weight)
1626 {
1627         struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1628         struct blkcg_gq *blkg;
1629         struct hlist_node *n;
1630
1631         if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
1632                 return -EINVAL;
1633
1634         spin_lock_irq(&blkcg->lock);
1635
1636         if (!is_leaf_weight)
1637                 blkcg->cfq_weight = val;
1638         else
1639                 blkcg->cfq_leaf_weight = val;
1640
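             /*
              * Propagate the new default to every group in this blkcg that has
              * no per-device weight configured.  The updated value is picked up
              * lazily, when the group is next (re)added to the service tree.
              */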
1641         hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1642                 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1643
1644                 if (!cfqg)
1645                         continue;
1646
1647                 if (!is_leaf_weight) {
1648                         if (!cfqg->dev_weight)
1649                                 cfqg->new_weight = blkcg->cfq_weight;
1650                 } else {
1651                         if (!cfqg->dev_leaf_weight)
1652                                 cfqg->new_leaf_weight = blkcg->cfq_leaf_weight;
1653                 }
1654         }
1655
1656         spin_unlock_irq(&blkcg->lock);
1657         return 0;
1658 }
1659
1660 static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
1661 {
1662         return __cfq_set_weight(cgrp, cft, val, false);
1663 }
1664
1665 static int cfq_set_leaf_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
1666 {
1667         return __cfq_set_weight(cgrp, cft, val, true);
1668 }
1669
1670 static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft,
1671                            struct seq_file *sf)
1672 {
1673         struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1674
1675         blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq,
1676                           cft->private, false);
1677         return 0;
1678 }
1679
1680 static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
1681                              struct seq_file *sf)
1682 {
1683         struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1684
1685         blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq,
1686                           cft->private, true);
1687         return 0;
1688 }
1689
1690 #ifdef CONFIG_DEBUG_BLK_CGROUP
1691 static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
1692                                       struct blkg_policy_data *pd, int off)
1693 {
1694         struct cfq_group *cfqg = pd_to_cfqg(pd);
1695         u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples);
1696         u64 v = 0;
1697
1698         if (samples) {
1699                 v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum);
1700                 do_div(v, samples);
1701         }
1702         __blkg_prfill_u64(sf, pd, v);
1703         return 0;
1704 }
1705
1706 /* print avg_queue_size */
1707 static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
1708                                      struct seq_file *sf)
1709 {
1710         struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1711
1712         blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size,
1713                           &blkcg_policy_cfq, 0, false);
1714         return 0;
1715 }
1716 #endif  /* CONFIG_DEBUG_BLK_CGROUP */
1717
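     /*
      * cgroup interface files for the blkio controller.  As an illustrative
      * example, a per-device weight could be set from userspace with
      * something like:
      *
      *     echo "8:16 500" > /sys/fs/cgroup/blkio/<grp>/blkio.weight_device
      *
      * where "8:16" is the device's major:minor number.
      */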
1718 static struct cftype cfq_blkcg_files[] = {
1719         /* on root, weight is mapped to leaf_weight */
1720         {
1721                 .name = "weight_device",
1722                 .flags = CFTYPE_ONLY_ON_ROOT,
1723                 .read_seq_string = cfqg_print_leaf_weight_device,
1724                 .write_string = cfqg_set_leaf_weight_device,
1725                 .max_write_len = 256,
1726         },
1727         {
1728                 .name = "weight",
1729                 .flags = CFTYPE_ONLY_ON_ROOT,
1730                 .read_seq_string = cfq_print_leaf_weight,
1731                 .write_u64 = cfq_set_leaf_weight,
1732         },
1733
1734         /* no such mapping necessary for !roots */
1735         {
1736                 .name = "weight_device",
1737                 .flags = CFTYPE_NOT_ON_ROOT,
1738                 .read_seq_string = cfqg_print_weight_device,
1739                 .write_string = cfqg_set_weight_device,
1740                 .max_write_len = 256,
1741         },
1742         {
1743                 .name = "weight",
1744                 .flags = CFTYPE_NOT_ON_ROOT,
1745                 .read_seq_string = cfq_print_weight,
1746                 .write_u64 = cfq_set_weight,
1747         },
1748
1749         {
1750                 .name = "leaf_weight_device",
1751                 .read_seq_string = cfqg_print_leaf_weight_device,
1752                 .write_string = cfqg_set_leaf_weight_device,
1753                 .max_write_len = 256,
1754         },
1755         {
1756                 .name = "leaf_weight",
1757                 .read_seq_string = cfq_print_leaf_weight,
1758                 .write_u64 = cfq_set_leaf_weight,
1759         },
1760
1761         {
1762                 .name = "time",
1763                 .private = offsetof(struct cfq_group, stats.time),
1764                 .read_seq_string = cfqg_print_stat,
1765         },
1766         {
1767                 .name = "sectors",
1768                 .private = offsetof(struct cfq_group, stats.sectors),
1769                 .read_seq_string = cfqg_print_stat,
1770         },
1771         {
1772                 .name = "io_service_bytes",
1773                 .private = offsetof(struct cfq_group, stats.service_bytes),
1774                 .read_seq_string = cfqg_print_rwstat,
1775         },
1776         {
1777                 .name = "io_serviced",
1778                 .private = offsetof(struct cfq_group, stats.serviced),
1779                 .read_seq_string = cfqg_print_rwstat,
1780         },
1781         {
1782                 .name = "io_service_time",
1783                 .private = offsetof(struct cfq_group, stats.service_time),
1784                 .read_seq_string = cfqg_print_rwstat,
1785         },
1786         {
1787                 .name = "io_wait_time",
1788                 .private = offsetof(struct cfq_group, stats.wait_time),
1789                 .read_seq_string = cfqg_print_rwstat,
1790         },
1791         {
1792                 .name = "io_merged",
1793                 .private = offsetof(struct cfq_group, stats.merged),
1794                 .read_seq_string = cfqg_print_rwstat,
1795         },
1796         {
1797                 .name = "io_queued",
1798                 .private = offsetof(struct cfq_group, stats.queued),
1799                 .read_seq_string = cfqg_print_rwstat,
1800         },
1801 #ifdef CONFIG_DEBUG_BLK_CGROUP
1802         {
1803                 .name = "avg_queue_size",
1804                 .read_seq_string = cfqg_print_avg_queue_size,
1805         },
1806         {
1807                 .name = "group_wait_time",
1808                 .private = offsetof(struct cfq_group, stats.group_wait_time),
1809                 .read_seq_string = cfqg_print_stat,
1810         },
1811         {
1812                 .name = "idle_time",
1813                 .private = offsetof(struct cfq_group, stats.idle_time),
1814                 .read_seq_string = cfqg_print_stat,
1815         },
1816         {
1817                 .name = "empty_time",
1818                 .private = offsetof(struct cfq_group, stats.empty_time),
1819                 .read_seq_string = cfqg_print_stat,
1820         },
1821         {
1822                 .name = "dequeue",
1823                 .private = offsetof(struct cfq_group, stats.dequeue),
1824                 .read_seq_string = cfqg_print_stat,
1825         },
1826         {
1827                 .name = "unaccounted_time",
1828                 .private = offsetof(struct cfq_group, stats.unaccounted_time),
1829                 .read_seq_string = cfqg_print_stat,
1830         },
1831 #endif  /* CONFIG_DEBUG_BLK_CGROUP */
1832         { }     /* terminate */
1833 };
1834 #else /* GROUP_IOSCHED */
1835 static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
1836                                                 struct blkcg *blkcg)
1837 {
1838         return cfqd->root_group;
1839 }
1840
1841 static inline void
1842 cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
1843         cfqq->cfqg = cfqg;
1844 }
1845
1846 #endif /* GROUP_IOSCHED */
1847
1848 /*
1849  * The cfqd->service_trees hold all pending cfq_queues that have
1850  * requests waiting to be processed. They are sorted in the order in
1851  * which we will service the queues.
1852  */
1853 static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1854                                  bool add_front)
1855 {
1856         struct rb_node **p, *parent;
1857         struct cfq_queue *__cfqq;
1858         unsigned long rb_key;
1859         struct cfq_rb_root *st;
1860         int left;
1861         int new_cfqq = 1;
1862
1863         st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq));
1864         if (cfq_class_idle(cfqq)) {
1865                 rb_key = CFQ_IDLE_DELAY;
1866                 parent = rb_last(&st->rb);
1867                 if (parent && parent != &cfqq->rb_node) {
1868                         __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1869                         rb_key += __cfqq->rb_key;
1870                 } else
1871                         rb_key += jiffies;
1872         } else if (!add_front) {
1873                 /*
1874                  * Get our rb key offset. Subtract any residual slice
1875                  * value carried from last service. A negative resid
1876                  * count indicates slice overrun, and this should position
1877                  * the next service time further away in the tree.
1878                  */
1879                 rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
1880                 rb_key -= cfqq->slice_resid;
1881                 cfqq->slice_resid = 0;
1882         } else {
1883                 rb_key = -HZ;
1884                 __cfqq = cfq_rb_first(st);
1885                 rb_key += __cfqq ? __cfqq->rb_key : jiffies;
1886         }
1887
1888         if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
1889                 new_cfqq = 0;
1890                 /*
1891                  * same position, nothing more to do
1892                  */
1893                 if (rb_key == cfqq->rb_key && cfqq->service_tree == st)
1894                         return;
1895
1896                 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
1897                 cfqq->service_tree = NULL;
1898         }
1899
1900         left = 1;
1901         parent = NULL;
1902         cfqq->service_tree = st;
1903         p = &st->rb.rb_node;
1904         while (*p) {
1905                 parent = *p;
1906                 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1907
1908                 /*
1909                  * sort by key, which represents service time.
1910                  */
1911                 if (time_before(rb_key, __cfqq->rb_key))
1912                         p = &parent->rb_left;
1913                 else {
1914                         p = &parent->rb_right;
1915                         left = 0;
1916                 }
1917         }
1918
1919         if (left)
1920                 st->left = &cfqq->rb_node;
1921
1922         cfqq->rb_key = rb_key;
1923         rb_link_node(&cfqq->rb_node, parent, p);
1924         rb_insert_color(&cfqq->rb_node, &st->rb);
1925         st->count++;
1926         if (add_front || !new_cfqq)
1927                 return;
1928         cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
1929 }
1930
1931 static struct cfq_queue *
1932 cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
1933                      sector_t sector, struct rb_node **ret_parent,
1934                      struct rb_node ***rb_link)
1935 {
1936         struct rb_node **p, *parent;
1937         struct cfq_queue *cfqq = NULL;
1938
1939         parent = NULL;
1940         p = &root->rb_node;
1941         while (*p) {
1942                 struct rb_node **n;
1943
1944                 parent = *p;
1945                 cfqq = rb_entry(parent, struct cfq_queue, p_node);
1946
1947                 /*
1948                  * Sort strictly based on sector.  Smallest to the left,
1949                  * largest to the right.
1950                  */
1951                 if (sector > blk_rq_pos(cfqq->next_rq))
1952                         n = &(*p)->rb_right;
1953                 else if (sector < blk_rq_pos(cfqq->next_rq))
1954                         n = &(*p)->rb_left;
1955                 else
1956                         break;
1957                 p = n;
1958                 cfqq = NULL;
1959         }
1960
1961         *ret_parent = parent;
1962         if (rb_link)
1963                 *rb_link = p;
1964         return cfqq;
1965 }
1966
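     /*
      * (Re)insert cfqq into the per-ioprio rbtree keyed by the sector of its
      * next request.  cfqq_close() walks this tree to find another queue
      * issuing I/O near cfqd->last_position.  If a queue with the same key
      * already exists, cfqq is simply left out of the tree.
      */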
1967 static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1968 {
1969         struct rb_node **p, *parent;
1970         struct cfq_queue *__cfqq;
1971
1972         if (cfqq->p_root) {
1973                 rb_erase(&cfqq->p_node, cfqq->p_root);
1974                 cfqq->p_root = NULL;
1975         }
1976
1977         if (cfq_class_idle(cfqq))
1978                 return;
1979         if (!cfqq->next_rq)
1980                 return;
1981
1982         cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
1983         __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
1984                                       blk_rq_pos(cfqq->next_rq), &parent, &p);
1985         if (!__cfqq) {
1986                 rb_link_node(&cfqq->p_node, parent, p);
1987                 rb_insert_color(&cfqq->p_node, cfqq->p_root);
1988         } else
1989                 cfqq->p_root = NULL;
1990 }
1991
1992 /*
1993  * Update cfqq's position in the service tree.
1994  */
1995 static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1996 {
1997         /*
1998          * Resorting requires the cfqq to be on the RR list already.
1999          */
2000         if (cfq_cfqq_on_rr(cfqq)) {
2001                 cfq_service_tree_add(cfqd, cfqq, 0);
2002                 cfq_prio_tree_add(cfqd, cfqq);
2003         }
2004 }
2005
2006 /*
2007  * add to busy list of queues for service, trying to be fair in ordering
2008  * the pending list according to last request service
2009  */
2010 static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2011 {
2012         cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
2013         BUG_ON(cfq_cfqq_on_rr(cfqq));
2014         cfq_mark_cfqq_on_rr(cfqq);
2015         cfqd->busy_queues++;
2016         if (cfq_cfqq_sync(cfqq))
2017                 cfqd->busy_sync_queues++;
2018
2019         cfq_resort_rr_list(cfqd, cfqq);
2020 }
2021
2022 /*
2023  * Called when the cfqq no longer has requests pending, remove it from
2024  * the service tree.
2025  */
2026 static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2027 {
2028         cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
2029         BUG_ON(!cfq_cfqq_on_rr(cfqq));
2030         cfq_clear_cfqq_on_rr(cfqq);
2031
2032         if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
2033                 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
2034                 cfqq->service_tree = NULL;
2035         }
2036         if (cfqq->p_root) {
2037                 rb_erase(&cfqq->p_node, cfqq->p_root);
2038                 cfqq->p_root = NULL;
2039         }
2040
2041         cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
2042         BUG_ON(!cfqd->busy_queues);
2043         cfqd->busy_queues--;
2044         if (cfq_cfqq_sync(cfqq))
2045                 cfqd->busy_sync_queues--;
2046 }
2047
2048 /*
2049  * rb tree support functions
2050  */
2051 static void cfq_del_rq_rb(struct request *rq)
2052 {
2053         struct cfq_queue *cfqq = RQ_CFQQ(rq);
2054         const int sync = rq_is_sync(rq);
2055
2056         BUG_ON(!cfqq->queued[sync]);
2057         cfqq->queued[sync]--;
2058
2059         elv_rb_del(&cfqq->sort_list, rq);
2060
2061         if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
2062                 /*
2063                  * Queue will be deleted from service tree when we actually
2064                  * expire it later. Right now just remove it from prio tree
2065                  * as it is empty.
2066                  */
2067                 if (cfqq->p_root) {
2068                         rb_erase(&cfqq->p_node, cfqq->p_root);
2069                         cfqq->p_root = NULL;
2070                 }
2071         }
2072 }
2073
2074 static void cfq_add_rq_rb(struct request *rq)
2075 {
2076         struct cfq_queue *cfqq = RQ_CFQQ(rq);
2077         struct cfq_data *cfqd = cfqq->cfqd;
2078         struct request *prev;
2079
2080         cfqq->queued[rq_is_sync(rq)]++;
2081
2082         elv_rb_add(&cfqq->sort_list, rq);
2083
2084         if (!cfq_cfqq_on_rr(cfqq))
2085                 cfq_add_cfqq_rr(cfqd, cfqq);
2086
2087         /*
2088          * check if this request is a better next-serve candidate
2089          */
2090         prev = cfqq->next_rq;
2091         cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
2092
2093         /*
2094          * adjust priority tree position, if ->next_rq changes
2095          */
2096         if (prev != cfqq->next_rq)
2097                 cfq_prio_tree_add(cfqd, cfqq);
2098
2099         BUG_ON(!cfqq->next_rq);
2100 }
2101
2102 static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
2103 {
2104         elv_rb_del(&cfqq->sort_list, rq);
2105         cfqq->queued[rq_is_sync(rq)]--;
2106         cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
2107         cfq_add_rq_rb(rq);
2108         cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group,
2109                                  rq->cmd_flags);
2110 }
2111
2112 static struct request *
2113 cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
2114 {
2115         struct task_struct *tsk = current;
2116         struct cfq_io_cq *cic;
2117         struct cfq_queue *cfqq;
2118
2119         cic = cfq_cic_lookup(cfqd, tsk->io_context);
2120         if (!cic)
2121                 return NULL;
2122
2123         cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
2124         if (cfqq) {
2125                 sector_t sector = bio->bi_sector + bio_sectors(bio);
2126
2127                 return elv_rb_find(&cfqq->sort_list, sector);
2128         }
2129
2130         return NULL;
2131 }
2132
2133 static void cfq_activate_request(struct request_queue *q, struct request *rq)
2134 {
2135         struct cfq_data *cfqd = q->elevator->elevator_data;
2136
2137         cfqd->rq_in_driver++;
2138         cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
2139                                                 cfqd->rq_in_driver);
2140
2141         cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
2142 }
2143
2144 static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
2145 {
2146         struct cfq_data *cfqd = q->elevator->elevator_data;
2147
2148         WARN_ON(!cfqd->rq_in_driver);
2149         cfqd->rq_in_driver--;
2150         cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
2151                                                 cfqd->rq_in_driver);
2152 }
2153
2154 static void cfq_remove_request(struct request *rq)
2155 {
2156         struct cfq_queue *cfqq = RQ_CFQQ(rq);
2157
2158         if (cfqq->next_rq == rq)
2159                 cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
2160
2161         list_del_init(&rq->queuelist);
2162         cfq_del_rq_rb(rq);
2163
2164         cfqq->cfqd->rq_queued--;
2165         cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
2166         if (rq->cmd_flags & REQ_PRIO) {
2167                 WARN_ON(!cfqq->prio_pending);
2168                 cfqq->prio_pending--;
2169         }
2170 }
2171
2172 static int cfq_merge(struct request_queue *q, struct request **req,
2173                      struct bio *bio)
2174 {
2175         struct cfq_data *cfqd = q->elevator->elevator_data;
2176         struct request *__rq;
2177
2178         __rq = cfq_find_rq_fmerge(cfqd, bio);
2179         if (__rq && elv_rq_merge_ok(__rq, bio)) {
2180                 *req = __rq;
2181                 return ELEVATOR_FRONT_MERGE;
2182         }
2183
2184         return ELEVATOR_NO_MERGE;
2185 }
2186
2187 static void cfq_merged_request(struct request_queue *q, struct request *req,
2188                                int type)
2189 {
2190         if (type == ELEVATOR_FRONT_MERGE) {
2191                 struct cfq_queue *cfqq = RQ_CFQQ(req);
2192
2193                 cfq_reposition_rq_rb(cfqq, req);
2194         }
2195 }
2196
2197 static void cfq_bio_merged(struct request_queue *q, struct request *req,
2198                                 struct bio *bio)
2199 {
2200         cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_rw);
2201 }
2202
2203 static void
2204 cfq_merged_requests(struct request_queue *q, struct request *rq,
2205                     struct request *next)
2206 {
2207         struct cfq_queue *cfqq = RQ_CFQQ(rq);
2208         struct cfq_data *cfqd = q->elevator->elevator_data;
2209
2210         /*
2211          * reposition in fifo if next is older than rq
2212          */
2213         if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
2214             time_before(rq_fifo_time(next), rq_fifo_time(rq)) &&
2215             cfqq == RQ_CFQQ(next)) {
2216                 list_move(&rq->queuelist, &next->queuelist);
2217                 rq_set_fifo_time(rq, rq_fifo_time(next));
2218         }
2219
2220         if (cfqq->next_rq == next)
2221                 cfqq->next_rq = rq;
2222         cfq_remove_request(next);
2223         cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags);
2224
2225         cfqq = RQ_CFQQ(next);
2226         /*
2227          * all requests of this queue were merged into other queues, so delete it
2228          * from the service tree. If it's the active_queue,
2229          * cfq_dispatch_requests() will choose to expire it or to idle.
2230          */
2231         if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&
2232             cfqq != cfqd->active_queue)
2233                 cfq_del_cfqq_rr(cfqd, cfqq);
2234 }
2235
2236 static int cfq_allow_merge(struct request_queue *q, struct request *rq,
2237                            struct bio *bio)
2238 {
2239         struct cfq_data *cfqd = q->elevator->elevator_data;
2240         struct cfq_io_cq *cic;
2241         struct cfq_queue *cfqq;
2242
2243         /*
2244          * Disallow merge of a sync bio into an async request.
2245          */
2246         if (cfq_bio_sync(bio) && !rq_is_sync(rq))
2247                 return false;
2248
2249         /*
2250          * Lookup the cfqq that this bio will be queued with and allow
2251          * merge only if rq is queued there.
2252          */
2253         cic = cfq_cic_lookup(cfqd, current->io_context);
2254         if (!cic)
2255                 return false;
2256
2257         cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
2258         return cfqq == RQ_CFQQ(rq);
2259 }
2260
2261 static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2262 {
2263         del_timer(&cfqd->idle_slice_timer);
2264         cfqg_stats_update_idle_time(cfqq->cfqg);
2265 }
2266
2267 static void __cfq_set_active_queue(struct cfq_data *cfqd,
2268                                    struct cfq_queue *cfqq)
2269 {
2270         if (cfqq) {
2271                 cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d",
2272                                 cfqd->serving_wl_class, cfqd->serving_wl_type);
2273                 cfqg_stats_update_avg_queue_size(cfqq->cfqg);
2274                 cfqq->slice_start = 0;
2275                 cfqq->dispatch_start = jiffies;
2276                 cfqq->allocated_slice = 0;
2277                 cfqq->slice_end = 0;
2278                 cfqq->slice_dispatch = 0;
2279                 cfqq->nr_sectors = 0;
2280
2281                 cfq_clear_cfqq_wait_request(cfqq);
2282                 cfq_clear_cfqq_must_dispatch(cfqq);
2283                 cfq_clear_cfqq_must_alloc_slice(cfqq);
2284                 cfq_clear_cfqq_fifo_expire(cfqq);
2285                 cfq_mark_cfqq_slice_new(cfqq);
2286
2287                 cfq_del_timer(cfqd, cfqq);
2288         }
2289
2290         cfqd->active_queue = cfqq;
2291 }
2292
2293 /*
2294  * current cfqq expired its slice (or was too idle), select new one
2295  */
2296 static void
2297 __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2298                     bool timed_out)
2299 {
2300         cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
2301
2302         if (cfq_cfqq_wait_request(cfqq))
2303                 cfq_del_timer(cfqd, cfqq);
2304
2305         cfq_clear_cfqq_wait_request(cfqq);
2306         cfq_clear_cfqq_wait_busy(cfqq);
2307
2308         /*
2309          * If this cfqq is shared between multiple processes, check to
2310          * make sure that those processes are still issuing I/Os within
2311          * the mean seek distance.  If not, it may be time to break the
2312          * queues apart again.
2313          */
2314         if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
2315                 cfq_mark_cfqq_split_coop(cfqq);
2316
2317         /*
2318          * store what was left of this slice, if the queue idled/timed out
2319          */
2320         if (timed_out) {
2321                 if (cfq_cfqq_slice_new(cfqq))
2322                         cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
2323                 else
2324                         cfqq->slice_resid = cfqq->slice_end - jiffies;
2325                 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
2326         }
2327
2328         cfq_group_served(cfqd, cfqq->cfqg, cfqq);
2329
2330         if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
2331                 cfq_del_cfqq_rr(cfqd, cfqq);
2332
2333         cfq_resort_rr_list(cfqd, cfqq);
2334
2335         if (cfqq == cfqd->active_queue)
2336                 cfqd->active_queue = NULL;
2337
2338         if (cfqd->active_cic) {
2339                 put_io_context(cfqd->active_cic->icq.ioc);
2340                 cfqd->active_cic = NULL;
2341         }
2342 }
2343
2344 static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
2345 {
2346         struct cfq_queue *cfqq = cfqd->active_queue;
2347
2348         if (cfqq)
2349                 __cfq_slice_expired(cfqd, cfqq, timed_out);
2350 }
2351
2352 /*
2353  * Get next queue for service. Unless we have a queue preemption,
2354  * we'll simply select the first cfqq in the service tree.
2355  */
2356 static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
2357 {
2358         struct cfq_rb_root *st = st_for(cfqd->serving_group,
2359                         cfqd->serving_wl_class, cfqd->serving_wl_type);
2360
2361         if (!cfqd->rq_queued)
2362                 return NULL;
2363
2364         /* There is nothing to dispatch */
2365         if (!st)
2366                 return NULL;
2367         if (RB_EMPTY_ROOT(&st->rb))
2368                 return NULL;
2369         return cfq_rb_first(st);
2370 }
2371
2372 static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
2373 {
2374         struct cfq_group *cfqg;
2375         struct cfq_queue *cfqq;
2376         int i, j;
2377         struct cfq_rb_root *st;
2378
2379         if (!cfqd->rq_queued)
2380                 return NULL;
2381
2382         cfqg = cfq_get_next_cfqg(cfqd);
2383         if (!cfqg)
2384                 return NULL;
2385
2386         for_each_cfqg_st(cfqg, i, j, st)
2387                 if ((cfqq = cfq_rb_first(st)) != NULL)
2388                         return cfqq;
2389         return NULL;
2390 }
2391
2392 /*
2393  * Get and set a new active queue for service.
2394  */
2395 static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
2396                                               struct cfq_queue *cfqq)
2397 {
2398         if (!cfqq)
2399                 cfqq = cfq_get_next_queue(cfqd);
2400
2401         __cfq_set_active_queue(cfqd, cfqq);
2402         return cfqq;
2403 }
2404
2405 static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
2406                                           struct request *rq)
2407 {
2408         if (blk_rq_pos(rq) >= cfqd->last_position)
2409                 return blk_rq_pos(rq) - cfqd->last_position;
2410         else
2411                 return cfqd->last_position - blk_rq_pos(rq);
2412 }
2413
2414 static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2415                                struct request *rq)
2416 {
2417         return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
2418 }
2419
2420 static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
2421                                     struct cfq_queue *cur_cfqq)
2422 {
2423         struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
2424         struct rb_node *parent, *node;
2425         struct cfq_queue *__cfqq;
2426         sector_t sector = cfqd->last_position;
2427
2428         if (RB_EMPTY_ROOT(root))
2429                 return NULL;
2430
2431         /*
2432          * First, if we find a request starting at the end of the last
2433          * request, choose it.
2434          */
2435         __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
2436         if (__cfqq)
2437                 return __cfqq;
2438
2439         /*
2440          * If the exact sector wasn't found, the parent of the NULL leaf
2441          * will contain the closest sector.
2442          */
2443         __cfqq = rb_entry(parent, struct cfq_queue, p_node);
2444         if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
2445                 return __cfqq;
2446
2447         if (blk_rq_pos(__cfqq->next_rq) < sector)
2448                 node = rb_next(&__cfqq->p_node);
2449         else
2450                 node = rb_prev(&__cfqq->p_node);
2451         if (!node)
2452                 return NULL;
2453
2454         __cfqq = rb_entry(node, struct cfq_queue, p_node);
2455         if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
2456                 return __cfqq;
2457
2458         return NULL;
2459 }
2460
2461 /*
2462  * cfqd - obvious
2463  * cur_cfqq - passed in so that we don't decide that the current queue is
2464  *            closely cooperating with itself.
2465  *
2466  * So, basically we're assuming that cur_cfqq has dispatched at least
2467  * one request, and that cfqd->last_position reflects a position on the disk
2468  * associated with the I/O issued by cur_cfqq.  I'm not sure this is a valid
2469  * assumption.
2470  */
2471 static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
2472                                               struct cfq_queue *cur_cfqq)
2473 {
2474         struct cfq_queue *cfqq;
2475
2476         if (cfq_class_idle(cur_cfqq))
2477                 return NULL;
2478         if (!cfq_cfqq_sync(cur_cfqq))
2479                 return NULL;
2480         if (CFQQ_SEEKY(cur_cfqq))
2481                 return NULL;
2482
2483         /*
2484          * Don't search priority tree if it's the only queue in the group.
2485          */
2486         if (cur_cfqq->cfqg->nr_cfqq == 1)
2487                 return NULL;
2488
2489         /*
2490          * We should notice if some of the queues are cooperating, e.g.
2491          * working closely on the same area of the disk. In that case,
2492          * we can group them together and not waste time idling.
2493          */
2494         cfqq = cfqq_close(cfqd, cur_cfqq);
2495         if (!cfqq)
2496                 return NULL;
2497
2498         /* If new queue belongs to different cfq_group, don't choose it */
2499         if (cur_cfqq->cfqg != cfqq->cfqg)
2500                 return NULL;
2501
2502         /*
2503          * It only makes sense to merge sync queues.
2504          */
2505         if (!cfq_cfqq_sync(cfqq))
2506                 return NULL;
2507         if (CFQQ_SEEKY(cfqq))
2508                 return NULL;
2509
2510         /*
2511          * Do not merge queues of different priority classes
2512          */
2513         if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
2514                 return NULL;
2515
2516         return cfqq;
2517 }
2518
2519 /*
2520  * Determine whether we should enforce idle window for this queue.
2521  */
2522
2523 static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2524 {
2525         enum wl_class_t wl_class = cfqq_class(cfqq);
2526         struct cfq_rb_root *st = cfqq->service_tree;
2527
2528         BUG_ON(!st);
2529         BUG_ON(!st->count);
2530
2531         if (!cfqd->cfq_slice_idle)
2532                 return false;
2533
2534         /* We never do for idle class queues. */
2535         if (wl_class == IDLE_WORKLOAD)
2536                 return false;
2537
2538         /* We do for queues that were marked with idle window flag. */
2539         if (cfq_cfqq_idle_window(cfqq) &&
2540            !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
2541                 return true;
2542
2543         /*
2544          * Otherwise, we do only if they are the last ones
2545          * in their service tree.
2546          */
2547         if (st->count == 1 && cfq_cfqq_sync(cfqq) &&
2548            !cfq_io_thinktime_big(cfqd, &st->ttime, false))
2549                 return true;
2550         cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count);
2551         return false;
2552 }
2553
2554 static void cfq_arm_slice_timer(struct cfq_data *cfqd)
2555 {
2556         struct cfq_queue *cfqq = cfqd->active_queue;
2557         struct cfq_io_cq *cic;
2558         unsigned long sl, group_idle = 0;
2559
2560         /*
2561          * For an SSD without seek penalty, disable idling. But only do so
2562          * for devices that support queuing; otherwise we still have a problem
2563          * with sync vs async workloads.
2564          */
2565         if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)
2566                 return;
2567
2568         WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
2569         WARN_ON(cfq_cfqq_slice_new(cfqq));
2570
2571         /*
2572          * idle is disabled, either manually or by past process history
2573          */
2574         if (!cfq_should_idle(cfqd, cfqq)) {
2575                 /* no queue idling. Check for group idling */
2576                 if (cfqd->cfq_group_idle)
2577                         group_idle = cfqd->cfq_group_idle;
2578                 else
2579                         return;
2580         }
2581
2582         /*
2583          * still active requests from this queue, don't idle
2584          */
2585         if (cfqq->dispatched)
2586                 return;
2587
2588         /*
2589          * task has exited, don't wait
2590          */
2591         cic = cfqd->active_cic;
2592         if (!cic || !atomic_read(&cic->icq.ioc->active_ref))
2593                 return;
2594
2595         /*
2596          * If our average think time is larger than the remaining time
2597          * slice, then don't idle. This avoids overrunning the allotted
2598          * time slice.
2599          */
2600         if (sample_valid(cic->ttime.ttime_samples) &&
2601             (cfqq->slice_end - jiffies < cic->ttime.ttime_mean)) {
2602                 cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu",
2603                              cic->ttime.ttime_mean);
2604                 return;
2605         }
2606
2607         /* There are other queues in the group, don't do group idle */
2608         if (group_idle && cfqq->cfqg->nr_cfqq > 1)
2609                 return;
2610
2611         cfq_mark_cfqq_wait_request(cfqq);
2612
2613         if (group_idle)
2614                 sl = cfqd->cfq_group_idle;
2615         else
2616                 sl = cfqd->cfq_slice_idle;
2617
2618         mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
2619         cfqg_stats_set_start_idle_time(cfqq->cfqg);
2620         cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
2621                         group_idle ? 1 : 0);
2622 }
2623
2624 /*
2625  * Move request from internal lists to the request queue dispatch list.
2626  */
2627 static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
2628 {
2629         struct cfq_data *cfqd = q->elevator->elevator_data;
2630         struct cfq_queue *cfqq = RQ_CFQQ(rq);
2631
2632         cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
2633
2634         cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
2635         cfq_remove_request(rq);
2636         cfqq->dispatched++;
2637         (RQ_CFQG(rq))->dispatched++;
2638         elv_dispatch_sort(q, rq);
2639
2640         cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
2641         cfqq->nr_sectors += blk_rq_sectors(rq);
2642         cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags);
2643 }
2644
2645 /*
2646  * return expired entry, or NULL to just start from scratch in rbtree
2647  */
2648 static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
2649 {
2650         struct request *rq = NULL;
2651
2652         if (cfq_cfqq_fifo_expire(cfqq))
2653                 return NULL;
2654
2655         cfq_mark_cfqq_fifo_expire(cfqq);
2656
2657         if (list_empty(&cfqq->fifo))
2658                 return NULL;
2659
2660         rq = rq_entry_fifo(cfqq->fifo.next);
2661         if (time_before(jiffies, rq_fifo_time(rq)))
2662                 rq = NULL;
2663
2664         cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
2665         return rq;
2666 }
2667
2668 static inline int
2669 cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2670 {
2671         const int base_rq = cfqd->cfq_slice_async_rq;
2672
2673         WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
2674
2675         return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);
2676 }
2677
2678 /*
2679  * Must be called with the queue_lock held.
2680  */
2681 static int cfqq_process_refs(struct cfq_queue *cfqq)
2682 {
2683         int process_refs, io_refs;
2684
2685         io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
2686         process_refs = cfqq->ref - io_refs;
2687         BUG_ON(process_refs < 0);
2688         return process_refs;
2689 }
2690
2691 static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
2692 {
2693         int process_refs, new_process_refs;
2694         struct cfq_queue *__cfqq;
2695
2696         /*
2697          * If there are no process references on the new_cfqq, then it is
2698          * unsafe to follow the ->new_cfqq chain as other cfqq's in the
2699          * chain may have dropped their last reference (not just their
2700          * last process reference).
2701          */
2702         if (!cfqq_process_refs(new_cfqq))
2703                 return;
2704
2705         /* Avoid a circular list and skip interim queue merges */
2706         while ((__cfqq = new_cfqq->new_cfqq)) {
2707                 if (__cfqq == cfqq)
2708                         return;
2709                 new_cfqq = __cfqq;
2710         }
2711
2712         process_refs = cfqq_process_refs(cfqq);
2713         new_process_refs = cfqq_process_refs(new_cfqq);
2714         /*
2715          * If the process for the cfqq has gone away, there is no
2716          * sense in merging the queues.
2717          */
2718         if (process_refs == 0 || new_process_refs == 0)
2719                 return;
2720
2721         /*
2722          * Merge in the direction of the lesser amount of work.
2723          */
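             /*
              * Whichever queue has fewer process references is redirected at
              * the other one, and the target inherits those references so it
              * cannot be freed while the merge is pending.
              */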
2724         if (new_process_refs >= process_refs) {
2725                 cfqq->new_cfqq = new_cfqq;
2726                 new_cfqq->ref += process_refs;
2727         } else {
2728                 new_cfqq->new_cfqq = cfqq;
2729                 cfqq->ref += new_process_refs;
2730         }
2731 }
2732
2733 static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd,
2734                         struct cfq_group *cfqg, enum wl_class_t wl_class)
2735 {
2736         struct cfq_queue *queue;
2737         int i;
2738         bool key_valid = false;
2739         unsigned long lowest_key = 0;
2740         enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
2741
2742         for (i = 0; i <= SYNC_WORKLOAD; ++i) {
2743                 /* select the one with lowest rb_key */
2744                 queue = cfq_rb_first(st_for(cfqg, wl_class, i));
2745                 if (queue &&
2746                     (!key_valid || time_before(queue->rb_key, lowest_key))) {
2747                         lowest_key = queue->rb_key;
2748                         cur_best = i;
2749                         key_valid = true;
2750                 }
2751         }
2752
2753         return cur_best;
2754 }
2755
2756 static void
2757 choose_wl_class_and_type(struct cfq_data *cfqd, struct cfq_group *cfqg)
2758 {
2759         unsigned slice;
2760         unsigned count;
2761         struct cfq_rb_root *st;
2762         unsigned group_slice;
2763         enum wl_class_t original_class = cfqd->serving_wl_class;
2764
2765         /* Choose next priority. RT > BE > IDLE */
2766         if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
2767                 cfqd->serving_wl_class = RT_WORKLOAD;
2768         else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
2769                 cfqd->serving_wl_class = BE_WORKLOAD;
2770         else {
2771                 cfqd->serving_wl_class = IDLE_WORKLOAD;
2772                 cfqd->workload_expires = jiffies + 1;
2773                 return;
2774         }
2775
2776         if (original_class != cfqd->serving_wl_class)
2777                 goto new_workload;
2778
2779         /*
2780          * For RT and BE, we also have to choose the type
2781          * (SYNC, SYNC_NOIDLE, ASYNC) and compute a workload
2782          * expiration time.
2783          */
2784         st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
2785         count = st->count;
2786
2787         /*
2788          * check workload expiration, and that we still have other queues ready
2789          */
2790         if (count && !time_after(jiffies, cfqd->workload_expires))
2791                 return;
2792
2793 new_workload:
2794         /* otherwise select new workload type */
2795         cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg,
2796                                         cfqd->serving_wl_class);
2797         st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
2798         count = st->count;
2799
2800         /*
2801          * the workload slice is computed as a fraction of target latency
2802          * proportional to the number of queues in that workload, over
2803          * all the queues in the same priority class
2804          */
2805         group_slice = cfq_group_slice(cfqd, cfqg);
2806
2807         slice = group_slice * count /
2808                 max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class],
2809                       cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd,
2810                                         cfqg));
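             /*
              * Illustrative numbers: with a group slice of 300ms and 2 of the
              * 4 busy queues in the serving class belonging to the chosen
              * workload type, the workload gets roughly 300 * 2 / 4 = 150ms
              * before the next re-evaluation.
              */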
2811
2812         if (cfqd->serving_wl_type == ASYNC_WORKLOAD) {
2813                 unsigned int tmp;
2814
2815                 /*
2816                  * Async queues are currently system wide. Just taking the
2817                  * proportion of queues within the same group will lead to a
2818                  * higher async ratio system wide, as the root group generally
2819                  * has a higher weight. A more accurate approach would be to
2820                  * calculate a system wide async/sync ratio.
2821                  */
2822                 tmp = cfqd->cfq_target_latency *
2823                         cfqg_busy_async_queues(cfqd, cfqg);
2824                 tmp = tmp/cfqd->busy_queues;
2825                 slice = min_t(unsigned, slice, tmp);
2826
2827                 /* async workload slice is scaled down according to
2828                  * the sync/async slice ratio. */
2829                 slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];
2830         } else
2831                 /* sync workload slice is at least 2 * cfq_slice_idle */
2832                 slice = max(slice, 2 * cfqd->cfq_slice_idle);
2833
2834         slice = max_t(unsigned, slice, CFQ_MIN_TT);
2835         cfq_log(cfqd, "workload slice:%d", slice);
2836         cfqd->workload_expires = jiffies + slice;
2837 }
2838
2839 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
2840 {
2841         struct cfq_rb_root *st = &cfqd->grp_service_tree;
2842         struct cfq_group *cfqg;
2843
2844         if (RB_EMPTY_ROOT(&st->rb))
2845                 return NULL;
2846         cfqg = cfq_rb_first_group(st);
2847         update_min_vdisktime(st);
2848         return cfqg;
2849 }
2850
2851 static void cfq_choose_cfqg(struct cfq_data *cfqd)
2852 {
2853         struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
2854
2855         cfqd->serving_group = cfqg;
2856
2857         /* Restore the workload type data */
2858         if (cfqg->saved_wl_slice) {
2859                 cfqd->workload_expires = jiffies + cfqg->saved_wl_slice;
2860                 cfqd->serving_wl_type = cfqg->saved_wl_type;
2861                 cfqd->serving_wl_class = cfqg->saved_wl_class;
2862         } else
2863                 cfqd->workload_expires = jiffies - 1;
2864
2865         choose_wl_class_and_type(cfqd, cfqg);
2866 }
2867
2868 /*
2869  * Select a queue for service. If we have a current active queue,
2870  * check whether to continue servicing it, or retrieve and set a new one.
2871  */
2872 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
2873 {
2874         struct cfq_queue *cfqq, *new_cfqq = NULL;
2875
2876         cfqq = cfqd->active_queue;
2877         if (!cfqq)
2878                 goto new_queue;
2879
2880         if (!cfqd->rq_queued)
2881                 return NULL;
2882
2883         /*
2884          * We were waiting for the group to get backlogged. Expire the queue
2885          */
2886         if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
2887                 goto expire;
2888
2889         /*
2890          * The active queue has run out of time, expire it and select a new one.
2891          */
2892         if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
2893                 /*
2894                  * If slice had not expired at the completion of last request
2895                  * we might not have turned on wait_busy flag. Don't expire
2896                  * the queue yet. Allow the group to get backlogged.
2897                  * The very fact that we have used the slice means we
2898                  * have been idling all along on this queue and it should be
2899                  * have been idling all along on this queue and it should be
2900                  * ok to wait for this request to complete.
2901                  */
2902                 if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
2903                     && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
2904                         cfqq = NULL;
2905                         goto keep_queue;
2906                 } else
2907                         goto check_group_idle;
2908         }
2909
2910         /*
2911          * The active queue has requests and isn't expired, allow it to
2912          * dispatch.
2913          */
2914         if (!RB_EMPTY_ROOT(&cfqq->sort_list))
2915                 goto keep_queue;
2916
2917         /*
2918          * If another queue has a request waiting within our mean seek
2919          * distance, let it run.  The expire code will check for close
2920          * cooperators and put the close queue at the front of the service
2921          * tree.  If possible, merge the expiring queue with the new cfqq.
2922          */
2923         new_cfqq = cfq_close_cooperator(cfqd, cfqq);
2924         if (new_cfqq) {
2925                 if (!cfqq->new_cfqq)
2926                         cfq_setup_merge(cfqq, new_cfqq);
2927                 goto expire;
2928         }
2929
2930         /*
2931          * No requests pending. If the active queue still has requests in
2932          * flight or is idling for a new request, allow either of these
2933          * conditions to happen (or time out) before selecting a new queue.
2934          */
2935         if (timer_pending(&cfqd->idle_slice_timer)) {
2936                 cfqq = NULL;
2937                 goto keep_queue;
2938         }
2939
2940         /*
2941          * This is a deep seek queue, but the device is much faster than
2942          * the queue can deliver, so don't idle.
2943          */
2944         if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
2945             (cfq_cfqq_slice_new(cfqq) ||
2946             (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) {
2947                 cfq_clear_cfqq_deep(cfqq);
2948                 cfq_clear_cfqq_idle_window(cfqq);
2949         }
2950
2951         if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
2952                 cfqq = NULL;
2953                 goto keep_queue;
2954         }
2955
2956         /*
2957          * If group idle is enabled and there are requests dispatched from
2958          * this group, wait for requests to complete.
2959          */
2960 check_group_idle:
2961         if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 &&
2962             cfqq->cfqg->dispatched &&
2963             !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) {
2964                 cfqq = NULL;
2965                 goto keep_queue;
2966         }
2967
2968 expire:
2969         cfq_slice_expired(cfqd, 0);
2970 new_queue:
2971         /*
2972          * Current queue expired. Check if we have to switch to a new
2973          * service tree
2974          */
2975         if (!new_cfqq)
2976                 cfq_choose_cfqg(cfqd);
2977
2978         cfqq = cfq_set_active_queue(cfqd, new_cfqq);
2979 keep_queue:
2980         return cfqq;
2981 }
2982
2983 static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
2984 {
2985         int dispatched = 0;
2986
2987         while (cfqq->next_rq) {
2988                 cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
2989                 dispatched++;
2990         }
2991
2992         BUG_ON(!list_empty(&cfqq->fifo));
2993
2994         /* By default cfqq is not expired if it is empty. Do it explicitly */
2995         __cfq_slice_expired(cfqq->cfqd, cfqq, 0);
2996         return dispatched;
2997 }
2998
2999 /*
3000  * Drain our current requests. Used for barriers and when switching
3001  * io schedulers on-the-fly.
3002  */
3003 static int cfq_forced_dispatch(struct cfq_data *cfqd)
3004 {
3005         struct cfq_queue *cfqq;
3006         int dispatched = 0;
3007
3008         /* Expire the timeslice of the current active queue first */
3009         cfq_slice_expired(cfqd, 0);
3010         while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
3011                 __cfq_set_active_queue(cfqd, cfqq);
3012                 dispatched += __cfq_forced_dispatch_cfqq(cfqq);
3013         }
3014
3015         BUG_ON(cfqd->busy_queues);
3016
3017         cfq_log(cfqd, "forced_dispatch=%d", dispatched);
3018         return dispatched;
3019 }
3020
3021 static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
3022         struct cfq_queue *cfqq)
3023 {
3024         /* the queue hasn't finished any request, can't estimate */
3025         if (cfq_cfqq_slice_new(cfqq))
3026                 return true;
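             /*
              * If, charging one slice_idle per dispatched request, we would
              * run past slice_end, consider the slice about to be used up.
              */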
3027         if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
3028                 cfqq->slice_end))
3029                 return true;
3030
3031         return false;
3032 }
3033
3034 static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3035 {
3036         unsigned int max_dispatch;
3037
3038         /*
3039          * Drain async requests before we start sync IO
3040          */
3041         if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
3042                 return false;
3043
3044         /*
3045          * If this is an async queue and we have sync IO in flight, let it wait
3046          */
3047         if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
3048                 return false;
3049
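             /* by default, allow up to half of cfq_quantum requests to be dispatched */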
3050         max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
3051         if (cfq_class_idle(cfqq))
3052                 max_dispatch = 1;
3053
3054         /*
3055          * Does this cfqq already have too much IO in flight?
3056          */
3057         if (cfqq->dispatched >= max_dispatch) {
3058                 bool promote_sync = false;
3059                 /*
3060                  * idle queue must always only have a single IO in flight
3061                  */
3062                 if (cfq_class_idle(cfqq))
3063                         return false;
3064
3065                 /*
3066                  * If there is only one sync queue,
3067                  * we can ignore the async queues and give the sync
3068                  * queue no dispatch limit. Since a sync queue can
3069                  * preempt an async queue, limiting the sync queue doesn't
3070                  * make sense. This is useful for the aiostress test.
3071                  */
3072                 if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
3073                         promote_sync = true;
3074
3075                 /*
3076                  * We have other queues, don't allow more IO from this one
3077                  */
3078                 if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&
3079                                 !promote_sync)
3080                         return false;
3081
3082                 /*
3083                  * Sole queue user, no limit
3084                  */
3085                 if (cfqd->busy_queues == 1 || promote_sync)
3086                         max_dispatch = -1;
3087                 else
3088                         /*
3089                          * Normally we start throttling cfqq when cfq_quantum/2
3090                          * requests have been dispatched. But we can drive
3091                          * deeper queue depths at the beginning of the slice,
3092                          * subject to an upper limit of cfq_quantum.
3093                          */
3094                         max_dispatch = cfqd->cfq_quantum;
3095         }
3096
3097         /*
3098          * Async queues must wait a bit before being allowed to dispatch.
3099          * We also ramp up the dispatch depth gradually for async IO,
3100          * based on the last sync IO we serviced
3101          */
3102         if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
3103                 unsigned long last_sync = jiffies - cfqd->last_delayed_sync;
3104                 unsigned int depth;
3105
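                     /*
                      * grow the depth by one for each sync slice elapsed since
                      * the last delayed sync
                      */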
3106                 depth = last_sync / cfqd->cfq_slice[1];
3107                 if (!depth && !cfqq->dispatched)
3108                         depth = 1;
3109                 if (depth < max_dispatch)
3110                         max_dispatch = depth;
3111         }
3112
3113         /*
3114          * If we're below the current max, allow a dispatch
3115          */
3116         return cfqq->dispatched < max_dispatch;
3117 }
3118
3119 /*
3120  * Dispatch a request from cfqq, moving it to the request queue's
3121  * dispatch list.
3122  */
3123 static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3124 {
3125         struct request *rq;
3126
3127         BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
3128
3129         if (!cfq_may_dispatch(cfqd, cfqq))
3130                 return false;
3131
3132         /*
3133          * prefer an expired fifo request, else take the next available one
3134          */
3135         rq = cfq_check_fifo(cfqq);
3136         if (!rq)
3137                 rq = cfqq->next_rq;
3138
3139         /*
3140          * insert request into driver dispatch list
3141          */
3142         cfq_dispatch_insert(cfqd->queue, rq);
3143
3144         if (!cfqd->active_cic) {
3145                 struct cfq_io_cq *cic = RQ_CIC(rq);
3146
3147                 atomic_long_inc(&cic->icq.ioc->refcount);
3148                 cfqd->active_cic = cic;
3149         }
3150
3151         return true;
3152 }
3153
3154 /*
3155  * Find the cfqq that we need to service and move a request from that to the
3156  * dispatch list
3157  */
3158 static int cfq_dispatch_requests(struct request_queue *q, int force)
3159 {
3160         struct cfq_data *cfqd = q->elevator->elevator_data;
3161         struct cfq_queue *cfqq;
3162
3163         if (!cfqd->busy_queues)
3164                 return 0;
3165
3166         if (unlikely(force))
3167                 return cfq_forced_dispatch(cfqd);
3168
3169         cfqq = cfq_select_queue(cfqd);
3170         if (!cfqq)
3171                 return 0;
3172
3173         /*
3174          * Dispatch a request from this cfqq, if it is allowed
3175          */
3176         if (!cfq_dispatch_request(cfqd, cfqq))
3177                 return 0;
3178
3179         cfqq->slice_dispatch++;
3180         cfq_clear_cfqq_must_dispatch(cfqq);
3181
3182         /*
3183          * expire an async queue immediately if it has used up its slice. An
3184          * idle-class queue always expires after 1 dispatch round.
3185          */
3186         if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
3187             cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
3188             cfq_class_idle(cfqq))) {
3189                 cfqq->slice_end = jiffies + 1;
3190                 cfq_slice_expired(cfqd, 0);
3191         }
3192
3193         cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
3194         return 1;
3195 }
3196
3197 /*
3198  * task holds one reference to the queue, dropped when task exits. each rq
3199  * in-flight on this queue also holds a reference, dropped when rq is freed.
3200  *
3201  * Each cfq queue took a reference on the parent group. Drop it now.
3202  * queue lock must be held here.
3203  */
3204 static void cfq_put_queue(struct cfq_queue *cfqq)
3205 {
3206         struct cfq_data *cfqd = cfqq->cfqd;
3207         struct cfq_group *cfqg;
3208
3209         BUG_ON(cfqq->ref <= 0);
3210
3211         cfqq->ref--;
3212         if (cfqq->ref)
3213                 return;
3214
3215         cfq_log_cfqq(cfqd, cfqq, "put_queue");
3216         BUG_ON(rb_first(&cfqq->sort_list));
3217         BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
3218         cfqg = cfqq->cfqg;
3219
3220         if (unlikely(cfqd->active_queue == cfqq)) {
3221                 __cfq_slice_expired(cfqd, cfqq, 0);
3222                 cfq_schedule_dispatch(cfqd);
3223         }
3224
3225         BUG_ON(cfq_cfqq_on_rr(cfqq));
3226         kmem_cache_free(cfq_pool, cfqq);
3227         cfqg_put(cfqg);
3228 }
3229
3230 static void cfq_put_cooperator(struct cfq_queue *cfqq)
3231 {
3232         struct cfq_queue *__cfqq, *next;
3233
3234         /*
3235          * If this queue was scheduled to merge with another queue, be
3236          * sure to drop the reference taken on that queue (and others in
3237          * the merge chain).  See cfq_setup_merge and cfq_merge_cfqqs.
3238          */
3239         __cfqq = cfqq->new_cfqq;
3240         while (__cfqq) {
3241                 if (__cfqq == cfqq) {
3242                         WARN(1, "cfqq->new_cfqq loop detected\n");
3243                         break;
3244                 }
3245                 next = __cfqq->new_cfqq;
3246                 cfq_put_queue(__cfqq);
3247                 __cfqq = next;
3248         }
3249 }
3250
3251 static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3252 {
3253         if (unlikely(cfqq == cfqd->active_queue)) {
3254                 __cfq_slice_expired(cfqd, cfqq, 0);
3255                 cfq_schedule_dispatch(cfqd);
3256         }
3257
3258         cfq_put_cooperator(cfqq);
3259
3260         cfq_put_queue(cfqq);
3261 }
3262
3263 static void cfq_init_icq(struct io_cq *icq)
3264 {
3265         struct cfq_io_cq *cic = icq_to_cic(icq);
3266
3267         cic->ttime.last_end_request = jiffies;
3268 }
3269
3270 static void cfq_exit_icq(struct io_cq *icq)
3271 {
3272         struct cfq_io_cq *cic = icq_to_cic(icq);
3273         struct cfq_data *cfqd = cic_to_cfqd(cic);
3274
3275         if (cic->cfqq[BLK_RW_ASYNC]) {
3276                 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
3277                 cic->cfqq[BLK_RW_ASYNC] = NULL;
3278         }
3279
3280         if (cic->cfqq[BLK_RW_SYNC]) {
3281                 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);
3282                 cic->cfqq[BLK_RW_SYNC] = NULL;
3283         }
3284 }
3285
3286 static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
3287 {
3288         struct task_struct *tsk = current;
3289         int ioprio_class;
3290
3291         if (!cfq_cfqq_prio_changed(cfqq))
3292                 return;
3293
3294         ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
3295         switch (ioprio_class) {
3296         default:
3297                 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
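                     /* fall through */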
3298         case IOPRIO_CLASS_NONE:
3299                 /*
3300                  * no prio set, inherit CPU scheduling settings
3301                  */
3302                 cfqq->ioprio = task_nice_ioprio(tsk);
3303                 cfqq->ioprio_class = task_nice_ioclass(tsk);
3304                 break;
3305         case IOPRIO_CLASS_RT:
3306                 cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3307                 cfqq->ioprio_class = IOPRIO_CLASS_RT;
3308                 break;
3309         case IOPRIO_CLASS_BE:
3310                 cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3311                 cfqq->ioprio_class = IOPRIO_CLASS_BE;
3312                 break;
3313         case IOPRIO_CLASS_IDLE:
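                     /* idle class gets the lowest priority and no idle window */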
3314                 cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
3315                 cfqq->ioprio = 7;
3316                 cfq_clear_cfqq_idle_window(cfqq);
3317                 break;
3318         }
3319
3320         /*
3321          * keep track of original prio settings in case we have to temporarily
3322          * elevate the priority of this queue
3323          */
3324         cfqq->org_ioprio = cfqq->ioprio;
3325         cfq_clear_cfqq_prio_changed(cfqq);
3326 }
3327
3328 static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
3329 {
3330         int ioprio = cic->icq.ioc->ioprio;
3331         struct cfq_data *cfqd = cic_to_cfqd(cic);
3332         struct cfq_queue *cfqq;
3333
3334         /*
3335          * Check whether ioprio has changed.  The condition may trigger
3336          * spuriously on a newly created cic but there's no harm.
3337          */
3338         if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
3339                 return;
3340
3341         cfqq = cic->cfqq[BLK_RW_ASYNC];
3342         if (cfqq) {
3343                 struct cfq_queue *new_cfqq;
3344                 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio,
3345                                          GFP_ATOMIC);
3346                 if (new_cfqq) {
3347                         cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
3348                         cfq_put_queue(cfqq);
3349                 }
3350         }
3351
3352         cfqq = cic->cfqq[BLK_RW_SYNC];
3353         if (cfqq)
3354                 cfq_mark_cfqq_prio_changed(cfqq);
3355
3356         cic->ioprio = ioprio;
3357 }
3358
3359 static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3360                           pid_t pid, bool is_sync)
3361 {
3362         RB_CLEAR_NODE(&cfqq->rb_node);
3363         RB_CLEAR_NODE(&cfqq->p_node);
3364         INIT_LIST_HEAD(&cfqq->fifo);
3365
3366         cfqq->ref = 0;
3367         cfqq->cfqd = cfqd;
3368
3369         cfq_mark_cfqq_prio_changed(cfqq);
3370
3371         if (is_sync) {
3372                 if (!cfq_class_idle(cfqq))
3373                         cfq_mark_cfqq_idle_window(cfqq);
3374                 cfq_mark_cfqq_sync(cfqq);
3375         }
3376         cfqq->pid = pid;
3377 }
3378
3379 #ifdef CONFIG_CFQ_GROUP_IOSCHED
3380 static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
3381 {
3382         struct cfq_data *cfqd = cic_to_cfqd(cic);
3383         struct cfq_queue *sync_cfqq;
3384         uint64_t id;
3385
3386         rcu_read_lock();
3387         id = bio_blkcg(bio)->id;
3388         rcu_read_unlock();
3389
3390         /*
3391          * Check whether blkcg has changed.  The condition may trigger
3392          * spuriously on a newly created cic but there's no harm.
3393          */
3394         if (unlikely(!cfqd) || likely(cic->blkcg_id == id))
3395                 return;
3396
3397         sync_cfqq = cic_to_cfqq(cic, 1);
3398         if (sync_cfqq) {
3399                 /*
3400                  * Drop reference to sync queue. A new sync queue will be
3401                  * assigned in new group upon arrival of a fresh request.
3402                  */
3403                 cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup");
3404                 cic_set_cfqq(cic, NULL, 1);
3405                 cfq_put_queue(sync_cfqq);
3406         }
3407
3408         cic->blkcg_id = id;
3409 }
3410 #else
3411 static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { }
3412 #endif  /* CONFIG_CFQ_GROUP_IOSCHED */
3413
3414 static struct cfq_queue *
3415 cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
3416                      struct bio *bio, gfp_t gfp_mask)
3417 {
3418         struct blkcg *blkcg;
3419         struct cfq_queue *cfqq, *new_cfqq = NULL;
3420         struct cfq_group *cfqg;
3421
3422 retry:
3423         rcu_read_lock();
3424
3425         blkcg = bio_blkcg(bio);
3426         cfqg = cfq_lookup_create_cfqg(cfqd, blkcg);
3427         cfqq = cic_to_cfqq(cic, is_sync);
3428
3429         /*
3430          * Always try a new alloc if we fell back to the OOM cfqq
3431          * originally, since it should just be a temporary situation.
3432          */
3433         if (!cfqq || cfqq == &cfqd->oom_cfqq) {
3434                 cfqq = NULL;
3435                 if (new_cfqq) {
3436                         cfqq = new_cfqq;
3437                         new_cfqq = NULL;
3438                 } else if (gfp_mask & __GFP_WAIT) {
3439                         rcu_read_unlock();
3440                         spin_unlock_irq(cfqd->queue->queue_lock);
3441                         new_cfqq = kmem_cache_alloc_node(cfq_pool,
3442                                         gfp_mask | __GFP_ZERO,
3443                                         cfqd->queue->node);
3444                         spin_lock_irq(cfqd->queue->queue_lock);
3445                         if (new_cfqq)
3446                                 goto retry;
3447                 } else {
3448                         cfqq = kmem_cache_alloc_node(cfq_pool,
3449                                         gfp_mask | __GFP_ZERO,
3450                                         cfqd->queue->node);
3451                 }
3452
3453                 if (cfqq) {
3454                         cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
3455                         cfq_init_prio_data(cfqq, cic);
3456                         cfq_link_cfqq_cfqg(cfqq, cfqg);
3457                         cfq_log_cfqq(cfqd, cfqq, "alloced");
3458                 } else
3459                         cfqq = &cfqd->oom_cfqq;
3460         }
3461
3462         if (new_cfqq)
3463                 kmem_cache_free(cfq_pool, new_cfqq);
3464
3465         rcu_read_unlock();
3466         return cfqq;
3467 }
3468
3469 static struct cfq_queue **
3470 cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
3471 {
3472         switch (ioprio_class) {
3473         case IOPRIO_CLASS_RT:
3474                 return &cfqd->async_cfqq[0][ioprio];
3475         case IOPRIO_CLASS_NONE:
3476                 ioprio = IOPRIO_NORM;
3477                 /* fall through */
3478         case IOPRIO_CLASS_BE:
3479                 return &cfqd->async_cfqq[1][ioprio];
3480         case IOPRIO_CLASS_IDLE:
3481                 return &cfqd->async_idle_cfqq;
3482         default:
3483                 BUG();
3484         }
3485 }
3486
3487 static struct cfq_queue *
3488 cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
3489               struct bio *bio, gfp_t gfp_mask)
3490 {
3491         const int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
3492         const int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3493         struct cfq_queue **async_cfqq = NULL;
3494         struct cfq_queue *cfqq = NULL;
3495
3496         if (!is_sync) {
3497                 async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
3498                 cfqq = *async_cfqq;
3499         }
3500
3501         if (!cfqq)
3502                 cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask);
3503
3504         /*
3505          * pin the queue now that it's allocated, scheduler exit will prune it
3506          */
3507         if (!is_sync && !(*async_cfqq)) {
3508                 cfqq->ref++;
3509                 *async_cfqq = cfqq;
3510         }
3511
3512         cfqq->ref++;
3513         return cfqq;
3514 }
3515
3516 static void
3517 __cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)
3518 {
3519         unsigned long elapsed = jiffies - ttime->last_end_request;
3520         elapsed = min(elapsed, 2UL * slice_idle);
3521
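             /*
              * Decay the previous samples by 7/8 and mix in the new one;
              * values are kept in fixed point, scaled by 256.
              */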
3522         ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
3523         ttime->ttime_total = (7*ttime->ttime_total + 256*elapsed) / 8;
3524         ttime->ttime_mean = (ttime->ttime_total + 128) / ttime->ttime_samples;
3525 }
3526
3527 static void
3528 cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3529                         struct cfq_io_cq *cic)
3530 {
3531         if (cfq_cfqq_sync(cfqq)) {
3532                 __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
3533                 __cfq_update_io_thinktime(&cfqq->service_tree->ttime,
3534                         cfqd->cfq_slice_idle);
3535         }
3536 #ifdef CONFIG_CFQ_GROUP_IOSCHED
3537         __cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle);
3538 #endif
3539 }
3540
3541 static void
3542 cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3543                        struct request *rq)
3544 {
3545         sector_t sdist = 0;
3546         sector_t n_sec = blk_rq_sectors(rq);
3547         if (cfqq->last_request_pos) {
3548                 if (cfqq->last_request_pos < blk_rq_pos(rq))
3549                         sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
3550                 else
3551                         sdist = cfqq->last_request_pos - blk_rq_pos(rq);
3552         }
3553
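             /*
              * seek_history keeps one bit per recent request; a set bit marks
              * a seeky request (or a small one on non-rotational devices).
              */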
3554         cfqq->seek_history <<= 1;
3555         if (blk_queue_nonrot(cfqd->queue))
3556                 cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);
3557         else
3558                 cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
3559 }
3560
3561 /*
3562  * Disable idle window if the process thinks too long or seeks so much that
3563  * it doesn't matter
3564  */
3565 static void
3566 cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3567                        struct cfq_io_cq *cic)
3568 {
3569         int old_idle, enable_idle;
3570
3571         /*
3572          * Don't idle for async or idle io prio class
3573          */
3574         if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
3575                 return;
3576
3577         enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
3578
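             /* a queue keeping 4 or more requests queued is considered deep */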
3579         if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3580                 cfq_mark_cfqq_deep(cfqq);
3581
3582         if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
3583                 enable_idle = 0;
3584         else if (!atomic_read(&cic->icq.ioc->active_ref) ||
3585                  !cfqd->cfq_slice_idle ||
3586                  (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3587                 enable_idle = 0;
3588         else if (sample_valid(cic->ttime.ttime_samples)) {
3589                 if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
3590                         enable_idle = 0;
3591                 else
3592                         enable_idle = 1;
3593         }
3594
3595         if (old_idle != enable_idle) {
3596                 cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
3597                 if (enable_idle)
3598                         cfq_mark_cfqq_idle_window(cfqq);
3599                 else
3600                         cfq_clear_cfqq_idle_window(cfqq);
3601         }
3602 }
3603
3604 /*
3605  * Check if new_cfqq should preempt the currently active queue. Return false
3606  * for no or if we aren't sure; returning true will cause a preempt.
3607  */
3608 static bool
3609 cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3610                    struct request *rq)
3611 {
3612         struct cfq_queue *cfqq;
3613
3614         cfqq = cfqd->active_queue;
3615         if (!cfqq)
3616                 return false;
3617
3618         if (cfq_class_idle(new_cfqq))
3619                 return false;
3620
3621         if (cfq_class_idle(cfqq))
3622                 return true;
3623
3624         /*
3625          * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
3626          */
3627         if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
3628                 return false;
3629
3630         /*
3631          * if the new request is sync, but the currently running queue is
3632          * not, let the sync request have priority.
3633          */
3634         if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
3635                 return true;
3636
3637         if (new_cfqq->cfqg != cfqq->cfqg)
3638                 return false;
3639
3640         if (cfq_slice_used(cfqq))
3641                 return true;
3642
3643         /* Allow preemption only if we are idling on sync-noidle tree */
3644         if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD &&
3645             cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
3646             new_cfqq->service_tree->count == 2 &&
3647             RB_EMPTY_ROOT(&cfqq->sort_list))
3648                 return true;
3649
3650         /*
3651          * So both queues are sync. Let the new request get disk time if
3652          * it's a metadata request and the current queue is doing regular IO.
3653          */
3654         if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)
3655                 return true;
3656
3657         /*
3658          * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
3659          */
3660         if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
3661                 return true;
3662
3663         /* The active queue is empty and has no reason to idle; allow preemption */
3664         if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
3665                 return true;
3666
3667         if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
3668                 return false;
3669
3670         /*
3671          * if this request is as-good as one we would expect from the
3672          * current cfqq, let it preempt
3673          */
3674         if (cfq_rq_close(cfqd, cfqq, rq))
3675                 return true;
3676
3677         return false;
3678 }
3679
3680 /*
3681  * cfqq preempts the active queue. if we allowed preempt with no slice left,
3682  * let it have half of its nominal slice.
3683  */
3684 static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3685 {
3686         enum wl_type_t old_type = cfqq_type(cfqd->active_queue);
3687
3688         cfq_log_cfqq(cfqd, cfqq, "preempt");
3689         cfq_slice_expired(cfqd, 1);
3690
3691         /*
3692          * The workload type changed; don't save the slice, otherwise the
3693          * preemption won't take effect.
3694          */
3695         if (old_type != cfqq_type(cfqq))
3696                 cfqq->cfqg->saved_wl_slice = 0;
3697
3698         /*
3699          * Put the new queue at the front of the current list,
3700          * so we know that it will be selected next.
3701          */
3702         BUG_ON(!cfq_cfqq_on_rr(cfqq));
3703
3704         cfq_service_tree_add(cfqd, cfqq, 1);
3705
3706         cfqq->slice_end = 0;
3707         cfq_mark_cfqq_slice_new(cfqq);
3708 }
3709
3710 /*
3711  * Called when a new fs request (rq) is added (to cfqq). Check if there's
3712  * something we should do about it
3713  */
3714 static void
3715 cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3716                 struct request *rq)
3717 {
3718         struct cfq_io_cq *cic = RQ_CIC(rq);
3719
3720         cfqd->rq_queued++;
3721         if (rq->cmd_flags & REQ_PRIO)
3722                 cfqq->prio_pending++;
3723
3724         cfq_update_io_thinktime(cfqd, cfqq, cic);
3725         cfq_update_io_seektime(cfqd, cfqq, rq);
3726         cfq_update_idle_window(cfqd, cfqq, cic);
3727
3728         cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
3729
3730         if (cfqq == cfqd->active_queue) {
3731                 /*
3732                  * Remember that we saw a request from this process, but
3733                  * don't start queuing just yet. Otherwise we risk seeing lots
3734                  * of tiny requests, because we disrupt the normal plugging
3735                  * and merging. If the request is already larger than a single
3736                  * page, let it rip immediately. For that case we assume that
3737                  * merging is already done. Ditto for a busy system that
3738                  * has other work pending, don't risk delaying until the
3739                  * idle timer unplug to continue working.
3740                  */
3741                 if (cfq_cfqq_wait_request(cfqq)) {
3742                         if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
3743                             cfqd->busy_queues > 1) {
3744                                 cfq_del_timer(cfqd, cfqq);
3745                                 cfq_clear_cfqq_wait_request(cfqq);
3746                                 __blk_run_queue(cfqd->queue);
3747                         } else {
3748                                 cfqg_stats_update_idle_time(cfqq->cfqg);
3749                                 cfq_mark_cfqq_must_dispatch(cfqq);
3750                         }
3751                 }
3752         } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
3753                 /*
3754                  * not the active queue - expire current slice if it is
3755                  * idle and has expired its mean thinktime, or this new queue
3756                  * has some old slice time left and is of higher priority or
3757                  * this new queue is RT and the current one is BE
3758                  */
3759                 cfq_preempt_queue(cfqd, cfqq);
3760                 __blk_run_queue(cfqd->queue);
3761         }
3762 }
3763
3764 static void cfq_insert_request(struct request_queue *q, struct request *rq)
3765 {
3766         struct cfq_data *cfqd = q->elevator->elevator_data;
3767         struct cfq_queue *cfqq = RQ_CFQQ(rq);
3768
3769         cfq_log_cfqq(cfqd, cfqq, "insert_request");
3770         cfq_init_prio_data(cfqq, RQ_CIC(rq));
3771
3772         rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
3773         list_add_tail(&rq->queuelist, &cfqq->fifo);
3774         cfq_add_rq_rb(rq);
3775         cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group,
3776                                  rq->cmd_flags);
3777         cfq_rq_enqueued(cfqd, cfqq, rq);
3778 }
3779
3780 /*
3781  * Update hw_tag based on peak queue depth over 50 samples under
3782  * sufficient load.
3783  */
3784 static void cfq_update_hw_tag(struct cfq_data *cfqd)
3785 {
3786         struct cfq_queue *cfqq = cfqd->active_queue;
3787
3788         if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
3789                 cfqd->hw_tag_est_depth = cfqd->rq_in_driver;
3790
3791         if (cfqd->hw_tag == 1)
3792                 return;
3793
3794         if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
3795             cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
3796                 return;
3797
3798         /*
3799          * If the active queue doesn't have enough requests and can idle, cfq
3800          * might not dispatch sufficient requests to the hardware. Don't zero
3801          * hw_tag in this case.
3802          */
3803         if (cfqq && cfq_cfqq_idle_window(cfqq) &&
3804             cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
3805             CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
3806                 return;
3807
3808         if (cfqd->hw_tag_samples++ < 50)
3809                 return;
3810
3811         if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
3812                 cfqd->hw_tag = 1;
3813         else
3814                 cfqd->hw_tag = 0;
3815 }
3816
3817 static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3818 {
3819         struct cfq_io_cq *cic = cfqd->active_cic;
3820
3821         /* If the queue already has requests, don't wait */
3822         if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3823                 return false;
3824
3825         /* If there are other queues in the group, don't wait */
3826         if (cfqq->cfqg->nr_cfqq > 1)
3827                 return false;
3828
3829         /* the only queue in the group, but think time is big */
3830         if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true))
3831                 return false;
3832
3833         if (cfq_slice_used(cfqq))
3834                 return true;
3835
3836         /* if slice left is less than think time, wait busy */
3837         if (cic && sample_valid(cic->ttime.ttime_samples)
3838             && (cfqq->slice_end - jiffies < cic->ttime.ttime_mean))
3839                 return true;
3840
3841         /*
3842          * If the think time is less than a jiffy, then ttime_mean = 0 and the
3843          * check above will not be true. It might happen that the slice has not
3844          * expired yet but will expire soon (4-5 ns) during select_queue(). To
3845          * cover the case where the think time is less than a jiffy, mark the
3846          * queue wait busy if only 1 jiffy is left in the slice.
3847          */
3848         if (cfqq->slice_end - jiffies == 1)
3849                 return true;
3850
3851         return false;
3852 }
3853
3854 static void cfq_completed_request(struct request_queue *q, struct request *rq)
3855 {
3856         struct cfq_queue *cfqq = RQ_CFQQ(rq);
3857         struct cfq_data *cfqd = cfqq->cfqd;
3858         const int sync = rq_is_sync(rq);
3859         unsigned long now;
3860
3861         now = jiffies;
3862         cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d",
3863                      !!(rq->cmd_flags & REQ_NOIDLE));
3864
3865         cfq_update_hw_tag(cfqd);
3866
3867         WARN_ON(!cfqd->rq_in_driver);
3868         WARN_ON(!cfqq->dispatched);
3869         cfqd->rq_in_driver--;
3870         cfqq->dispatched--;
3871         (RQ_CFQG(rq))->dispatched--;
3872         cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq),
3873                                      rq_io_start_time_ns(rq), rq->cmd_flags);
3874
3875         cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
3876
3877         if (sync) {
3878                 struct cfq_rb_root *st;
3879
3880                 RQ_CIC(rq)->ttime.last_end_request = now;
3881
3882                 if (cfq_cfqq_on_rr(cfqq))
3883                         st = cfqq->service_tree;
3884                 else
3885                         st = st_for(cfqq->cfqg, cfqq_class(cfqq),
3886                                         cfqq_type(cfqq));
3887
3888                 st->ttime.last_end_request = now;
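                     /*
                      * If this sync request took at least fifo_expire_sync to
                      * complete, remember that sync IO was recently delayed.
                      */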
3889                 if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
3890                         cfqd->last_delayed_sync = now;
3891         }
3892
3893 #ifdef CONFIG_CFQ_GROUP_IOSCHED
3894         cfqq->cfqg->ttime.last_end_request = now;
3895 #endif
3896
3897         /*
3898          * If this is the active queue, check if it needs to be expired,
3899          * or if we want to idle in case it has no pending requests.
3900          */
3901         if (cfqd->active_queue == cfqq) {
3902                 const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
3903
3904                 if (cfq_cfqq_slice_new(cfqq)) {
3905                         cfq_set_prio_slice(cfqd, cfqq);
3906                         cfq_clear_cfqq_slice_new(cfqq);
3907                 }
3908
3909                 /*
3910                  * Should we wait for the next request to come in before we expire
3911                  * the queue?
3912                  */
3913                 if (cfq_should_wait_busy(cfqd, cfqq)) {
3914                         unsigned long extend_sl = cfqd->cfq_slice_idle;
3915                         if (!cfqd->cfq_slice_idle)
3916                                 extend_sl = cfqd->cfq_group_idle;
3917                         cfqq->slice_end = jiffies + extend_sl;
3918                         cfq_mark_cfqq_wait_busy(cfqq);
3919                         cfq_log_cfqq(cfqd, cfqq, "will busy wait");
3920                 }
3921
3922                 /*
3923                  * Idling is not enabled on:
3924                  * - expired queues
3925                  * - idle-priority queues
3926                  * - async queues
3927                  * - queues with still some requests queued
3928                  * - when there is a close cooperator
3929                  */
3930                 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
3931                         cfq_slice_expired(cfqd, 1);
3932                 else if (sync && cfqq_empty &&
3933                          !cfq_close_cooperator(cfqd, cfqq)) {
3934                         cfq_arm_slice_timer(cfqd);
3935                 }
3936         }
3937
3938         if (!cfqd->rq_in_driver)
3939                 cfq_schedule_dispatch(cfqd);
3940 }
3941
3942 static inline int __cfq_may_queue(struct cfq_queue *cfqq)
3943 {
3944         if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
3945                 cfq_mark_cfqq_must_alloc_slice(cfqq);
3946                 return ELV_MQUEUE_MUST;
3947         }
3948
3949         return ELV_MQUEUE_MAY;
3950 }
3951
3952 static int cfq_may_queue(struct request_queue *q, int rw)
3953 {
3954         struct cfq_data *cfqd = q->elevator->elevator_data;
3955         struct task_struct *tsk = current;
3956         struct cfq_io_cq *cic;
3957         struct cfq_queue *cfqq;
3958
3959         /*
3960          * don't force setup of a queue from here, as a call to may_queue
3961          * does not necessarily imply that a request actually will be queued.
3962          * So just look up a possibly existing queue, or return 'may queue'
3963          * if that fails
3964          */
3965         cic = cfq_cic_lookup(cfqd, tsk->io_context);
3966         if (!cic)
3967                 return ELV_MQUEUE_MAY;
3968
3969         cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
3970         if (cfqq) {
3971                 cfq_init_prio_data(cfqq, cic);
3972
3973                 return __cfq_may_queue(cfqq);
3974         }
3975
3976         return ELV_MQUEUE_MAY;
3977 }
3978
3979 /*
3980  * queue lock held here
3981  */
3982 static void cfq_put_request(struct request *rq)
3983 {
3984         struct cfq_queue *cfqq = RQ_CFQQ(rq);
3985
3986         if (cfqq) {
3987                 const int rw = rq_data_dir(rq);
3988
3989                 BUG_ON(!cfqq->allocated[rw]);
3990                 cfqq->allocated[rw]--;
3991
3992                 /* Put down rq reference on cfqg */
3993                 cfqg_put(RQ_CFQG(rq));
3994                 rq->elv.priv[0] = NULL;
3995                 rq->elv.priv[1] = NULL;
3996
3997                 cfq_put_queue(cfqq);
3998         }
3999 }
4000
4001 static struct cfq_queue *
4002 cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic,
4003                 struct cfq_queue *cfqq)
4004 {
4005         cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
4006         cic_set_cfqq(cic, cfqq->new_cfqq, 1);
4007         cfq_mark_cfqq_coop(cfqq->new_cfqq);
4008         cfq_put_queue(cfqq);
4009         return cic_to_cfqq(cic, 1);
4010 }
4011
4012 /*
4013  * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
4014  * was the last process referring to said cfqq.
4015  */
4016 static struct cfq_queue *
4017 split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
4018 {
4019         if (cfqq_process_refs(cfqq) == 1) {
4020                 cfqq->pid = current->pid;
4021                 cfq_clear_cfqq_coop(cfqq);
4022                 cfq_clear_cfqq_split_coop(cfqq);
4023                 return cfqq;
4024         }
4025
4026         cic_set_cfqq(cic, NULL, 1);
4027
4028         cfq_put_cooperator(cfqq);
4029
4030         cfq_put_queue(cfqq);
4031         return NULL;
4032 }
4033 /*
4034  * Allocate cfq data structures associated with this request.
4035  */
4036 static int
4037 cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
4038                 gfp_t gfp_mask)
4039 {
4040         struct cfq_data *cfqd = q->elevator->elevator_data;
4041         struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
4042         const int rw = rq_data_dir(rq);
4043         const bool is_sync = rq_is_sync(rq);
4044         struct cfq_queue *cfqq;
4045
4046         might_sleep_if(gfp_mask & __GFP_WAIT);
4047
4048         spin_lock_irq(q->queue_lock);
4049
4050         check_ioprio_changed(cic, bio);
4051         check_blkcg_changed(cic, bio);
4052 new_queue:
4053         cfqq = cic_to_cfqq(cic, is_sync);
4054         if (!cfqq || cfqq == &cfqd->oom_cfqq) {
4055                 cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask);
4056                 cic_set_cfqq(cic, cfqq, is_sync);
4057         } else {
4058                 /*
4059                  * If the queue was seeky for too long, break it apart.
4060                  */
4061                 if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
4062                         cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
4063                         cfqq = split_cfqq(cic, cfqq);
4064                         if (!cfqq)
4065                                 goto new_queue;
4066                 }
4067
4068                 /*
4069                  * Check to see if this queue is scheduled to merge with
4070                  * another, closely cooperating queue.  The merging of
4071                  * queues happens here as it must be done in process context.
4072                  * The reference on new_cfqq was taken in merge_cfqqs.
4073                  */
4074                 if (cfqq->new_cfqq)
4075                         cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
4076         }
4077
4078         cfqq->allocated[rw]++;
4079
4080         cfqq->ref++;
4081         cfqg_get(cfqq->cfqg);
4082         rq->elv.priv[0] = cfqq;
4083         rq->elv.priv[1] = cfqq->cfqg;
4084         spin_unlock_irq(q->queue_lock);
4085         return 0;
4086 }
4087
4088 static void cfq_kick_queue(struct work_struct *work)
4089 {
4090         struct cfq_data *cfqd =
4091                 container_of(work, struct cfq_data, unplug_work);
4092         struct request_queue *q = cfqd->queue;
4093
4094         spin_lock_irq(q->queue_lock);
4095         __blk_run_queue(cfqd->queue);
4096         spin_unlock_irq(q->queue_lock);
4097 }
4098
4099 /*
4100  * Timer running if the active_queue is currently idling inside its time slice
4101  */
4102 static void cfq_idle_slice_timer(unsigned long data)
4103 {
4104         struct cfq_data *cfqd = (struct cfq_data *) data;
4105         struct cfq_queue *cfqq;
4106         unsigned long flags;
4107         int timed_out = 1;
4108
4109         cfq_log(cfqd, "idle timer fired");
4110
4111         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
4112
4113         cfqq = cfqd->active_queue;
4114         if (cfqq) {
4115                 timed_out = 0;
4116
4117                 /*
4118                  * We saw a request before the queue expired, let it through
4119                  */
4120                 if (cfq_cfqq_must_dispatch(cfqq))
4121                         goto out_kick;
4122
4123                 /*
4124                  * expired
4125                  */
4126                 if (cfq_slice_used(cfqq))
4127                         goto expire;
4128
4129                 /*
4130                  * only expire and reinvoke the request handler if there are
4131                  * other queues with pending requests
4132                  */
4133                 if (!cfqd->busy_queues)
4134                         goto out_cont;
4135
4136                 /*
4137                  * not expired and it has a request pending, let it dispatch
4138                  */
4139                 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
4140                         goto out_kick;
4141
4142                 /*
4143                  * The queue-depth (deep) flag is reset only when idling didn't succeed
4144                  */
4145                 cfq_clear_cfqq_deep(cfqq);
4146         }
4147 expire:
4148         cfq_slice_expired(cfqd, timed_out);
4149 out_kick:
4150         cfq_schedule_dispatch(cfqd);
4151 out_cont:
4152         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
4153 }
4154
4155 static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
4156 {
4157         del_timer_sync(&cfqd->idle_slice_timer);
4158         cancel_work_sync(&cfqd->unplug_work);
4159 }
4160
4161 static void cfq_put_async_queues(struct cfq_data *cfqd)
4162 {
4163         int i;
4164
4165         for (i = 0; i < IOPRIO_BE_NR; i++) {
4166                 if (cfqd->async_cfqq[0][i])
4167                         cfq_put_queue(cfqd->async_cfqq[0][i]);
4168                 if (cfqd->async_cfqq[1][i])
4169                         cfq_put_queue(cfqd->async_cfqq[1][i]);
4170         }
4171
4172         if (cfqd->async_idle_cfqq)
4173                 cfq_put_queue(cfqd->async_idle_cfqq);
4174 }
4175
4176 static void cfq_exit_queue(struct elevator_queue *e)
4177 {
4178         struct cfq_data *cfqd = e->elevator_data;
4179         struct request_queue *q = cfqd->queue;
4180
4181         cfq_shutdown_timer_wq(cfqd);
4182
4183         spin_lock_irq(q->queue_lock);
4184
4185         if (cfqd->active_queue)
4186                 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
4187
4188         cfq_put_async_queues(cfqd);
4189
4190         spin_unlock_irq(q->queue_lock);
4191
4192         cfq_shutdown_timer_wq(cfqd);
4193
4194 #ifdef CONFIG_CFQ_GROUP_IOSCHED
4195         blkcg_deactivate_policy(q, &blkcg_policy_cfq);
4196 #else
4197         kfree(cfqd->root_group);
4198 #endif
4199         kfree(cfqd);
4200 }
4201
4202 static int cfq_init_queue(struct request_queue *q)
4203 {
4204         struct cfq_data *cfqd;
4205         struct blkcg_gq *blkg __maybe_unused;
4206         int i, ret;
4207
4208         cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
4209         if (!cfqd)
4210                 return -ENOMEM;
4211
4212         cfqd->queue = q;
4213         q->elevator->elevator_data = cfqd;
4214
4215         /* Init root service tree */
4216         cfqd->grp_service_tree = CFQ_RB_ROOT;
4217
4218         /* Init root group and prefer root group over other groups by default */
4219 #ifdef CONFIG_CFQ_GROUP_IOSCHED
4220         ret = blkcg_activate_policy(q, &blkcg_policy_cfq);
4221         if (ret)
4222                 goto out_free;
4223
4224         cfqd->root_group = blkg_to_cfqg(q->root_blkg);
4225 #else
4226         ret = -ENOMEM;
4227         cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group),
4228                                         GFP_KERNEL, cfqd->queue->node);
4229         if (!cfqd->root_group)
4230                 goto out_free;
4231
4232         cfq_init_cfqg_base(cfqd->root_group);
4233 #endif
4234         cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT;
4235         cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
4236
4237         /*
4238          * Not strictly needed (since RB_ROOT just clears the node and we
4239          * zeroed cfqd on alloc), but better be safe in case someone decides
4240          * to add magic to the rb code
4241          */
4242         for (i = 0; i < CFQ_PRIO_LISTS; i++)
4243                 cfqd->prio_trees[i] = RB_ROOT;
4244
4245         /*
4246          * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
4247          * Grab a permanent reference to it, so that the normal code flow
4248          * will not attempt to free it.  oom_cfqq is linked to root_group
4249          * but shouldn't hold a reference as it'll never be unlinked.  Lose
4250          * the reference from linking right away.
4251          */
4252         cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
4253         cfqd->oom_cfqq.ref++;
4254
4255         spin_lock_irq(q->queue_lock);
4256         cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group);
4257         cfqg_put(cfqd->root_group);
4258         spin_unlock_irq(q->queue_lock);
4259
4260         init_timer(&cfqd->idle_slice_timer);
4261         cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
4262         cfqd->idle_slice_timer.data = (unsigned long) cfqd;
4263
4264         INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
4265
4266         cfqd->cfq_quantum = cfq_quantum;
4267         cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
4268         cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
4269         cfqd->cfq_back_max = cfq_back_max;
4270         cfqd->cfq_back_penalty = cfq_back_penalty;
4271         cfqd->cfq_slice[0] = cfq_slice_async;
4272         cfqd->cfq_slice[1] = cfq_slice_sync;
4273         cfqd->cfq_target_latency = cfq_target_latency;
4274         cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
4275         cfqd->cfq_slice_idle = cfq_slice_idle;
4276         cfqd->cfq_group_idle = cfq_group_idle;
4277         cfqd->cfq_latency = 1;
4278         cfqd->hw_tag = -1;
4279         /*
4280          * we optimistically start assuming sync ops weren't delayed in the
4281          * last second, in order to have a larger depth for async operations.
4282          */
4283         cfqd->last_delayed_sync = jiffies - HZ;
4284         return 0;
4285
4286 out_free:
4287         kfree(cfqd);
4288         return ret;
4289 }
4290
4291 /*
4292  * sysfs parts below -->
4293  */
4294 static ssize_t
4295 cfq_var_show(unsigned int var, char *page)
4296 {
4297         return sprintf(page, "%d\n", var);
4298 }
4299
4300 static ssize_t
4301 cfq_var_store(unsigned int *var, const char *page, size_t count)
4302 {
4303         char *p = (char *) page;
4304
4305         *var = simple_strtoul(p, &p, 10);
4306         return count;
4307 }
4308
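     /*
      * The SHOW/STORE helpers below back the per-tunable sysfs files exposed
      * under /sys/block/<dev>/queue/iosched/ (e.g. quantum, slice_idle).
      */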
4309 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                            \
4310 static ssize_t __FUNC(struct elevator_queue *e, char *page)             \
4311 {                                                                       \
4312         struct cfq_data *cfqd = e->elevator_data;                       \
4313         unsigned int __data = __VAR;                                    \
4314         if (__CONV)                                                     \
4315                 __data = jiffies_to_msecs(__data);                      \
4316         return cfq_var_show(__data, (page));                            \
4317 }
4318 SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
4319 SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
4320 SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
4321 SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
4322 SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
4323 SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
4324 SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
4325 SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
4326 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
4327 SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
4328 SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
4329 SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1);
4330 #undef SHOW_FUNCTION
4331
4332 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                 \
4333 static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
4334 {                                                                       \
4335         struct cfq_data *cfqd = e->elevator_data;                       \
4336         unsigned int __data;                                            \
4337         int ret = cfq_var_store(&__data, (page), count);                \
4338         if (__data < (MIN))                                             \
4339                 __data = (MIN);                                         \
4340         else if (__data > (MAX))                                        \
4341                 __data = (MAX);                                         \
4342         if (__CONV)                                                     \
4343                 *(__PTR) = msecs_to_jiffies(__data);                    \
4344         else                                                            \
4345                 *(__PTR) = __data;                                      \
4346         return ret;                                                     \
4347 }
4348 STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
4349 STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
4350                 UINT_MAX, 1);
4351 STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
4352                 UINT_MAX, 1);
4353 STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
4354 STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
4355                 UINT_MAX, 0);
4356 STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
4357 STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
4358 STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
4359 STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
4360 STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
4361                 UINT_MAX, 0);
4362 STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
4363 STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1);
4364 #undef STORE_FUNCTION
4365
4366 #define CFQ_ATTR(name) \
4367         __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
4368
4369 static struct elv_fs_entry cfq_attrs[] = {
4370         CFQ_ATTR(quantum),
4371         CFQ_ATTR(fifo_expire_sync),
4372         CFQ_ATTR(fifo_expire_async),
4373         CFQ_ATTR(back_seek_max),
4374         CFQ_ATTR(back_seek_penalty),
4375         CFQ_ATTR(slice_sync),
4376         CFQ_ATTR(slice_async),
4377         CFQ_ATTR(slice_async_rq),
4378         CFQ_ATTR(slice_idle),
4379         CFQ_ATTR(group_idle),
4380         CFQ_ATTR(low_latency),
4381         CFQ_ATTR(target_latency),
4382         __ATTR_NULL
4383 };
4384
4385 static struct elevator_type iosched_cfq = {
4386         .ops = {
4387                 .elevator_merge_fn =            cfq_merge,
4388                 .elevator_merged_fn =           cfq_merged_request,
4389                 .elevator_merge_req_fn =        cfq_merged_requests,
4390                 .elevator_allow_merge_fn =      cfq_allow_merge,
4391                 .elevator_bio_merged_fn =       cfq_bio_merged,
4392                 .elevator_dispatch_fn =         cfq_dispatch_requests,
4393                 .elevator_add_req_fn =          cfq_insert_request,
4394                 .elevator_activate_req_fn =     cfq_activate_request,
4395                 .elevator_deactivate_req_fn =   cfq_deactivate_request,
4396                 .elevator_completed_req_fn =    cfq_completed_request,
4397                 .elevator_former_req_fn =       elv_rb_former_request,
4398                 .elevator_latter_req_fn =       elv_rb_latter_request,
4399                 .elevator_init_icq_fn =         cfq_init_icq,
4400                 .elevator_exit_icq_fn =         cfq_exit_icq,
4401                 .elevator_set_req_fn =          cfq_set_request,
4402                 .elevator_put_req_fn =          cfq_put_request,
4403                 .elevator_may_queue_fn =        cfq_may_queue,
4404                 .elevator_init_fn =             cfq_init_queue,
4405                 .elevator_exit_fn =             cfq_exit_queue,
4406         },
4407         .icq_size       =       sizeof(struct cfq_io_cq),
4408         .icq_align      =       __alignof__(struct cfq_io_cq),
4409         .elevator_attrs =       cfq_attrs,
4410         .elevator_name  =       "cfq",
4411         .elevator_owner =       THIS_MODULE,
4412 };
4413
4414 #ifdef CONFIG_CFQ_GROUP_IOSCHED
4415 static struct blkcg_policy blkcg_policy_cfq = {
4416         .pd_size                = sizeof(struct cfq_group),
4417         .cftypes                = cfq_blkcg_files,
4418
4419         .pd_init_fn             = cfq_pd_init,
4420         .pd_reset_stats_fn      = cfq_pd_reset_stats,
4421 };
4422 #endif
4423
4424 static int __init cfq_init(void)
4425 {
4426         int ret;
4427
4428         /*
4429          * could be 0 on HZ < 1000 setups
4430          */
4431         if (!cfq_slice_async)
4432                 cfq_slice_async = 1;
4433         if (!cfq_slice_idle)
4434                 cfq_slice_idle = 1;
4435
4436 #ifdef CONFIG_CFQ_GROUP_IOSCHED
4437         if (!cfq_group_idle)
4438                 cfq_group_idle = 1;
4439
4440         ret = blkcg_policy_register(&blkcg_policy_cfq);
4441         if (ret)
4442                 return ret;
4443 #else
4444         cfq_group_idle = 0;
4445 #endif
4446
4447         ret = -ENOMEM;
4448         cfq_pool = KMEM_CACHE(cfq_queue, 0);
4449         if (!cfq_pool)
4450                 goto err_pol_unreg;
4451
4452         ret = elv_register(&iosched_cfq);
4453         if (ret)
4454                 goto err_free_pool;
4455
4456         return 0;
4457
4458 err_free_pool:
4459         kmem_cache_destroy(cfq_pool);
4460 err_pol_unreg:
4461 #ifdef CONFIG_CFQ_GROUP_IOSCHED
4462         blkcg_policy_unregister(&blkcg_policy_cfq);
4463 #endif
4464         return ret;
4465 }
4466
4467 static void __exit cfq_exit(void)
4468 {
4469 #ifdef CONFIG_CFQ_GROUP_IOSCHED
4470         blkcg_policy_unregister(&blkcg_policy_cfq);
4471 #endif
4472         elv_unregister(&iosched_cfq);
4473         kmem_cache_destroy(cfq_pool);
4474 }
4475
4476 module_init(cfq_init);
4477 module_exit(cfq_exit);
4478
4479 MODULE_AUTHOR("Jens Axboe");
4480 MODULE_LICENSE("GPL");
4481 MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");