cfq-iosched: Do group share accounting in IOPS when slice_idle=0
block/cfq-iosched.c [linux-2.6-block.git]
1 /*
2  *  CFQ, or complete fairness queueing, disk scheduler.
3  *
4  *  Based on ideas from a previously unfinished io
5  *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
6  *
7  *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
8  */
9 #include <linux/module.h>
10 #include <linux/slab.h>
11 #include <linux/blkdev.h>
12 #include <linux/elevator.h>
13 #include <linux/jiffies.h>
14 #include <linux/rbtree.h>
15 #include <linux/ioprio.h>
16 #include <linux/blktrace_api.h>
17 #include "cfq.h"
18
19 /*
20  * tunables
21  */
22 /* max queue in one round of service */
23 static const int cfq_quantum = 8;
24 static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
25 /* maximum backwards seek, in KiB */
26 static const int cfq_back_max = 16 * 1024;
27 /* penalty of a backwards seek */
28 static const int cfq_back_penalty = 2;
29 static const int cfq_slice_sync = HZ / 10;
30 static int cfq_slice_async = HZ / 25;
31 static const int cfq_slice_async_rq = 2;
32 static int cfq_slice_idle = HZ / 125;
33 static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
34 static const int cfq_hist_divisor = 4;
35
36 /*
37  * offset from end of service tree
38  */
39 #define CFQ_IDLE_DELAY          (HZ / 5)
40
41 /*
42  * below this threshold, we consider thinktime immediate
43  */
44 #define CFQ_MIN_TT              (2)
45
46 #define CFQ_SLICE_SCALE         (5)
47 #define CFQ_HW_QUEUE_MIN        (5)
48 #define CFQ_SERVICE_SHIFT       12
49
50 #define CFQQ_SEEK_THR           (sector_t)(8 * 100)
51 #define CFQQ_CLOSE_THR          (sector_t)(8 * 1024)
52 #define CFQQ_SECT_THR_NONROT    (sector_t)(2 * 32)
53 #define CFQQ_SEEKY(cfqq)        (hweight32(cfqq->seek_history) > 32/8)
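
/*
 * Reading the macros above: seek_history is a 32-bit sliding window with one
 * bit per recently sampled request (set when it landed beyond the seek
 * threshold), so CFQQ_SEEKY() fires once more than 32/8 = 4 of the last 32
 * samples required a long seek, i.e. when over an eighth of recent I/O was
 * seeky.
 */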
54
55 #define RQ_CIC(rq)              \
56         ((struct cfq_io_context *) (rq)->elevator_private)
57 #define RQ_CFQQ(rq)             (struct cfq_queue *) ((rq)->elevator_private2)
58 #define RQ_CFQG(rq)             (struct cfq_group *) ((rq)->elevator_private3)
59
60 static struct kmem_cache *cfq_pool;
61 static struct kmem_cache *cfq_ioc_pool;
62
63 static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
64 static struct completion *ioc_gone;
65 static DEFINE_SPINLOCK(ioc_gone_lock);
66
67 static DEFINE_SPINLOCK(cic_index_lock);
68 static DEFINE_IDA(cic_index_ida);
69
70 #define CFQ_PRIO_LISTS          IOPRIO_BE_NR
71 #define cfq_class_idle(cfqq)    ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
72 #define cfq_class_rt(cfqq)      ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
73
74 #define sample_valid(samples)   ((samples) > 80)
75 #define rb_entry_cfqg(node)     rb_entry((node), struct cfq_group, rb_node)
76
77 /*
78  * Most of our rbtree usage is for sorting with min extraction, so
79  * if we cache the leftmost node we don't have to walk down the tree
80  * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should
81  * move this into the elevator for the rq sorting as well.
82  */
83 struct cfq_rb_root {
84         struct rb_root rb;
85         struct rb_node *left;
86         unsigned count;
87         unsigned total_weight;
88         u64 min_vdisktime;
89         struct rb_node *active;
90 };
91 #define CFQ_RB_ROOT     (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
92                         .count = 0, .min_vdisktime = 0, }
93
94 /*
95  * Per process-grouping structure
96  */
97 struct cfq_queue {
98         /* reference count */
99         atomic_t ref;
100         /* various state flags, see below */
101         unsigned int flags;
102         /* parent cfq_data */
103         struct cfq_data *cfqd;
104         /* service_tree member */
105         struct rb_node rb_node;
106         /* service_tree key */
107         unsigned long rb_key;
108         /* prio tree member */
109         struct rb_node p_node;
110         /* prio tree root we belong to, if any */
111         struct rb_root *p_root;
112         /* sorted list of pending requests */
113         struct rb_root sort_list;
114         /* if fifo isn't expired, next request to serve */
115         struct request *next_rq;
116         /* requests queued in sort_list */
117         int queued[2];
118         /* currently allocated requests */
119         int allocated[2];
120         /* fifo list of requests in sort_list */
121         struct list_head fifo;
122
123         /* time when queue got scheduled in to dispatch first request. */
124         unsigned long dispatch_start;
125         unsigned int allocated_slice;
126         unsigned int slice_dispatch;
127         /* time when first request from queue completed and slice started. */
128         unsigned long slice_start;
129         unsigned long slice_end;
130         long slice_resid;
131
132         /* pending metadata requests */
133         int meta_pending;
134         /* number of requests that are on the dispatch list or inside driver */
135         int dispatched;
136
137         /* io prio of this group */
138         unsigned short ioprio, org_ioprio;
139         unsigned short ioprio_class, org_ioprio_class;
140
141         pid_t pid;
142
143         u32 seek_history;
144         sector_t last_request_pos;
145
146         struct cfq_rb_root *service_tree;
147         struct cfq_queue *new_cfqq;
148         struct cfq_group *cfqg;
149         struct cfq_group *orig_cfqg;
150 };
151
152 /*
153  * First index in the service_trees.
154  * IDLE is handled separately, so it has negative index
155  */
156 enum wl_prio_t {
157         BE_WORKLOAD = 0,
158         RT_WORKLOAD = 1,
159         IDLE_WORKLOAD = 2,
160 };
161
162 /*
163  * Second index in the service_trees.
164  */
165 enum wl_type_t {
166         ASYNC_WORKLOAD = 0,
167         SYNC_NOIDLE_WORKLOAD = 1,
168         SYNC_WORKLOAD = 2
169 };
170
171 /* This is per cgroup per device grouping structure */
172 struct cfq_group {
173         /* group service_tree member */
174         struct rb_node rb_node;
175
176         /* group service_tree key */
177         u64 vdisktime;
178         unsigned int weight;
179         bool on_st;
180
181         /* number of cfqq currently on this group */
182         int nr_cfqq;
183
184         /* Per group busy queues average. Useful for workload slice calc. */
185         unsigned int busy_queues_avg[2];
186         /*
187          * rr lists of queues with requests, one rr for each priority class.
188          * Counts are embedded in the cfq_rb_root
189          */
190         struct cfq_rb_root service_trees[2][3];
191         struct cfq_rb_root service_tree_idle;
192
193         unsigned long saved_workload_slice;
194         enum wl_type_t saved_workload;
195         enum wl_prio_t saved_serving_prio;
196         struct blkio_group blkg;
197 #ifdef CONFIG_CFQ_GROUP_IOSCHED
198         struct hlist_node cfqd_node;
199         atomic_t ref;
200 #endif
201 };
202
203 /*
204  * Per block device queue structure
205  */
206 struct cfq_data {
207         struct request_queue *queue;
208         /* Root service tree for cfq_groups */
209         struct cfq_rb_root grp_service_tree;
210         struct cfq_group root_group;
211
212         /*
213          * The priority currently being served
214          */
215         enum wl_prio_t serving_prio;
216         enum wl_type_t serving_type;
217         unsigned long workload_expires;
218         struct cfq_group *serving_group;
219         bool noidle_tree_requires_idle;
220
221         /*
222          * Each priority tree is sorted by next_request position.  These
223          * trees are used when determining if two or more queues are
224          * interleaving requests (see cfq_close_cooperator).
225          */
226         struct rb_root prio_trees[CFQ_PRIO_LISTS];
227
228         unsigned int busy_queues;
229
230         int rq_in_driver;
231         int rq_in_flight[2];
232
233         /*
234          * queue-depth detection
235          */
236         int rq_queued;
237         int hw_tag;
238         /*
239          * hw_tag can be
240          * -1 => indeterminate (cfq will behave as if NCQ is present, to allow better detection)
241          *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
242          *  0 => no NCQ
243          */
244         int hw_tag_est_depth;
245         unsigned int hw_tag_samples;
246
247         /*
248          * idle window management
249          */
250         struct timer_list idle_slice_timer;
251         struct work_struct unplug_work;
252
253         struct cfq_queue *active_queue;
254         struct cfq_io_context *active_cic;
255
256         /*
257          * async queue for each priority case
258          */
259         struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
260         struct cfq_queue *async_idle_cfqq;
261
262         sector_t last_position;
263
264         /*
265          * tunables, see top of file
266          */
267         unsigned int cfq_quantum;
268         unsigned int cfq_fifo_expire[2];
269         unsigned int cfq_back_penalty;
270         unsigned int cfq_back_max;
271         unsigned int cfq_slice[2];
272         unsigned int cfq_slice_async_rq;
273         unsigned int cfq_slice_idle;
274         unsigned int cfq_latency;
275         unsigned int cfq_group_isolation;
276
277         unsigned int cic_index;
278         struct list_head cic_list;
279
280         /*
281          * Fallback dummy cfqq for extreme OOM conditions
282          */
283         struct cfq_queue oom_cfqq;
284
285         unsigned long last_delayed_sync;
286
287         /* List of cfq groups being managed on this device */
288         struct hlist_head cfqg_list;
289         struct rcu_head rcu;
290 };
291
292 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
293
294 static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
295                                             enum wl_prio_t prio,
296                                             enum wl_type_t type)
297 {
298         if (!cfqg)
299                 return NULL;
300
301         if (prio == IDLE_WORKLOAD)
302                 return &cfqg->service_tree_idle;
303
304         return &cfqg->service_trees[prio][type];
305 }
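
/*
 * Indexing example for the lookup above: a sync queue of a realtime task with
 * idling enabled is kept on cfqg->service_trees[RT_WORKLOAD][SYNC_WORKLOAD],
 * while every idle-class queue, regardless of type, shares the single
 * cfqg->service_tree_idle.
 */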
306
307 enum cfqq_state_flags {
308         CFQ_CFQQ_FLAG_on_rr = 0,        /* on round-robin busy list */
309         CFQ_CFQQ_FLAG_wait_request,     /* waiting for a request */
310         CFQ_CFQQ_FLAG_must_dispatch,    /* must be allowed a dispatch */
311         CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */
312         CFQ_CFQQ_FLAG_fifo_expire,      /* FIFO checked in this slice */
313         CFQ_CFQQ_FLAG_idle_window,      /* slice idling enabled */
314         CFQ_CFQQ_FLAG_prio_changed,     /* task priority has changed */
315         CFQ_CFQQ_FLAG_slice_new,        /* no requests dispatched in slice */
316         CFQ_CFQQ_FLAG_sync,             /* synchronous queue */
317         CFQ_CFQQ_FLAG_coop,             /* cfqq is shared */
318         CFQ_CFQQ_FLAG_split_coop,       /* shared cfqq will be split */
319         CFQ_CFQQ_FLAG_deep,             /* sync cfqq experienced large depth */
320         CFQ_CFQQ_FLAG_wait_busy,        /* Waiting for next request */
321 };
322
323 #define CFQ_CFQQ_FNS(name)                                              \
324 static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)         \
325 {                                                                       \
326         (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);                   \
327 }                                                                       \
328 static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)        \
329 {                                                                       \
330         (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);                  \
331 }                                                                       \
332 static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)         \
333 {                                                                       \
334         return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;      \
335 }
336
337 CFQ_CFQQ_FNS(on_rr);
338 CFQ_CFQQ_FNS(wait_request);
339 CFQ_CFQQ_FNS(must_dispatch);
340 CFQ_CFQQ_FNS(must_alloc_slice);
341 CFQ_CFQQ_FNS(fifo_expire);
342 CFQ_CFQQ_FNS(idle_window);
343 CFQ_CFQQ_FNS(prio_changed);
344 CFQ_CFQQ_FNS(slice_new);
345 CFQ_CFQQ_FNS(sync);
346 CFQ_CFQQ_FNS(coop);
347 CFQ_CFQQ_FNS(split_coop);
348 CFQ_CFQQ_FNS(deep);
349 CFQ_CFQQ_FNS(wait_busy);
350 #undef CFQ_CFQQ_FNS
351
352 #ifdef CONFIG_CFQ_GROUP_IOSCHED
353 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...)  \
354         blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
355                         cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
356                         blkg_path(&(cfqq)->cfqg->blkg), ##args);
357
358 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...)                          \
359         blk_add_trace_msg((cfqd)->queue, "%s " fmt,                     \
360                                 blkg_path(&(cfqg)->blkg), ##args);      \
361
362 #else
363 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...)  \
364         blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
365 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...)          do {} while (0);
366 #endif
367 #define cfq_log(cfqd, fmt, args...)     \
368         blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
369
370 /* Traverses through cfq group service trees */
371 #define for_each_cfqg_st(cfqg, i, j, st) \
372         for (i = 0; i <= IDLE_WORKLOAD; i++) \
373                 for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
374                         : &cfqg->service_tree_idle; \
375                         (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
376                         (i == IDLE_WORKLOAD && j == 0); \
377                         j++, st = i < IDLE_WORKLOAD ? \
378                         &cfqg->service_trees[i][j]: NULL) \
379
380
381 static inline bool iops_mode(struct cfq_data *cfqd)
382 {
383         /*
384          * If we are not idling on queues and it is an NCQ drive, parallel
385          * execution of requests is on and measuring time is not possible
386          * in most of the cases until and unless we drive shallower queue
387          * depths and that becomes a performance bottleneck. In such cases
388          * switch to start providing fairness in terms of number of IOs.
389          */
390         if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
391                 return true;
392         else
393                 return false;
394 }
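
/*
 * Illustrative usage (assuming the standard cfq sysfs tunables and an
 * NCQ-capable disk, so hw_tag is auto-detected as set): disabling idling
 * switches group share accounting from time to IOPS, e.g.
 *
 *     echo 0 > /sys/block/sdb/queue/iosched/slice_idle
 *
 * where "sdb" is just an example device. In this mode cfq_group_served()
 * below charges cfqq->slice_dispatch (requests dispatched) instead of
 * elapsed jiffies.
 */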
395
396 static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
397 {
398         if (cfq_class_idle(cfqq))
399                 return IDLE_WORKLOAD;
400         if (cfq_class_rt(cfqq))
401                 return RT_WORKLOAD;
402         return BE_WORKLOAD;
403 }
404
405
406 static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
407 {
408         if (!cfq_cfqq_sync(cfqq))
409                 return ASYNC_WORKLOAD;
410         if (!cfq_cfqq_idle_window(cfqq))
411                 return SYNC_NOIDLE_WORKLOAD;
412         return SYNC_WORKLOAD;
413 }
414
415 static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl,
416                                         struct cfq_data *cfqd,
417                                         struct cfq_group *cfqg)
418 {
419         if (wl == IDLE_WORKLOAD)
420                 return cfqg->service_tree_idle.count;
421
422         return cfqg->service_trees[wl][ASYNC_WORKLOAD].count
423                 + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count
424                 + cfqg->service_trees[wl][SYNC_WORKLOAD].count;
425 }
426
427 static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
428                                         struct cfq_group *cfqg)
429 {
430         return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count
431                 + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
432 }
433
434 static void cfq_dispatch_insert(struct request_queue *, struct request *);
435 static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
436                                        struct io_context *, gfp_t);
437 static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
438                                                 struct io_context *);
439
440 static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
441                                             bool is_sync)
442 {
443         return cic->cfqq[is_sync];
444 }
445
446 static inline void cic_set_cfqq(struct cfq_io_context *cic,
447                                 struct cfq_queue *cfqq, bool is_sync)
448 {
449         cic->cfqq[is_sync] = cfqq;
450 }
451
452 #define CIC_DEAD_KEY    1ul
453 #define CIC_DEAD_INDEX_SHIFT    1
454
455 static inline void *cfqd_dead_key(struct cfq_data *cfqd)
456 {
457         return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY);
458 }
459
460 static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic)
461 {
462         struct cfq_data *cfqd = cic->key;
463
464         if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY))
465                 return NULL;
466
467         return cfqd;
468 }
469
470 /*
471  * We regard a request as SYNC, if it's either a read or has the SYNC bit
472  * set (in which case it could also be a direct WRITE).
473  */
474 static inline bool cfq_bio_sync(struct bio *bio)
475 {
476         return bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC);
477 }
478
479 /*
480  * scheduler run of queue, if there are requests pending and no one in the
481  * driver that will restart queueing
482  */
483 static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
484 {
485         if (cfqd->busy_queues) {
486                 cfq_log(cfqd, "schedule dispatch");
487                 kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
488         }
489 }
490
491 static int cfq_queue_empty(struct request_queue *q)
492 {
493         struct cfq_data *cfqd = q->elevator->elevator_data;
494
495         return !cfqd->rq_queued;
496 }
497
498 /*
499  * Scale schedule slice based on io priority. Use the sync time slice only
500  * if a queue is marked sync and has sync io queued. A sync queue with async
501  * io only, should not get full sync slice length.
502  */
503 static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync,
504                                  unsigned short prio)
505 {
506         const int base_slice = cfqd->cfq_slice[sync];
507
508         WARN_ON(prio >= IOPRIO_BE_NR);
509
510         return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
511 }
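
/*
 * Worked example, assuming HZ=1000 and the default cfq_slice_sync = HZ/10
 * (100 jiffies): base_slice/CFQ_SLICE_SCALE is 20, so an ioprio 0 sync queue
 * gets 100 + 20 * 4 = 180ms, ioprio 4 gets 100ms and ioprio 7 gets
 * 100 + 20 * (4 - 7) = 40ms.
 */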
512
513 static inline int
514 cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
515 {
516         return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
517 }
518
519 static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
520 {
521         u64 d = delta << CFQ_SERVICE_SHIFT;
522
523         d = d * BLKIO_WEIGHT_DEFAULT;
524         do_div(d, cfqg->weight);
525         return d;
526 }
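
/*
 * The charge above is service scaled inversely by group weight: a group at
 * the default weight is charged delta << CFQ_SERVICE_SHIFT of vdisktime,
 * while a group configured at twice the default weight is charged half as
 * much for the same service, so over time it receives twice the share.
 */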
527
528 static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
529 {
530         s64 delta = (s64)(vdisktime - min_vdisktime);
531         if (delta > 0)
532                 min_vdisktime = vdisktime;
533
534         return min_vdisktime;
535 }
536
537 static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
538 {
539         s64 delta = (s64)(vdisktime - min_vdisktime);
540         if (delta < 0)
541                 min_vdisktime = vdisktime;
542
543         return min_vdisktime;
544 }
545
546 static void update_min_vdisktime(struct cfq_rb_root *st)
547 {
548         u64 vdisktime = st->min_vdisktime;
549         struct cfq_group *cfqg;
550
551         if (st->active) {
552                 cfqg = rb_entry_cfqg(st->active);
553                 vdisktime = cfqg->vdisktime;
554         }
555
556         if (st->left) {
557                 cfqg = rb_entry_cfqg(st->left);
558                 vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
559         }
560
561         st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
562 }
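
/*
 * The signed (s64) delta in the two helpers above makes the comparison safe
 * across u64 wraparound, the same trick CFS uses for vruntime: if vdisktime
 * has just wrapped to a small value while min_vdisktime sits near the top of
 * the range, the delta is still a small positive number, so min_vdisktime
 * follows it forward (to the numerically smaller but logically later value)
 * instead of treating it as far in the past.
 */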
563
564 /*
565  * get averaged number of queues of RT/BE priority.
566  * The average is updated with a formula that gives more weight to higher numbers,
567  * so that it quickly follows sudden increases and decreases slowly.
568  */
569
570 static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
571                                         struct cfq_group *cfqg, bool rt)
572 {
573         unsigned min_q, max_q;
574         unsigned mult  = cfq_hist_divisor - 1;
575         unsigned round = cfq_hist_divisor / 2;
576         unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
577
578         min_q = min(cfqg->busy_queues_avg[rt], busy);
579         max_q = max(cfqg->busy_queues_avg[rt], busy);
580         cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
581                 cfq_hist_divisor;
582         return cfqg->busy_queues_avg[rt];
583 }
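
/*
 * Worked example with cfq_hist_divisor = 4, i.e. avg = (3*max + min + 2)/4:
 * a jump from an average of 1 to 5 busy queues lifts the average to
 * (3*5 + 1 + 2)/4 = 4 in a single update, whereas a drop from an average of
 * 5 down to 1 busy queue only lowers it to (3*5 + 1 + 2)/4 = 4, so decreases
 * are followed one small step at a time.
 */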
584
585 static inline unsigned
586 cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
587 {
588         struct cfq_rb_root *st = &cfqd->grp_service_tree;
589
590         return cfq_target_latency * cfqg->weight / st->total_weight;
591 }
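
/*
 * Worked example, assuming HZ=1000 so cfq_target_latency is 300 jiffies:
 * with two backlogged groups of weights 200 and 100, their group slices are
 * 300 * 200/300 = 200ms and 300 * 100/300 = 100ms of each scheduling round.
 */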
592
593 static inline void
594 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
595 {
596         unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
597         if (cfqd->cfq_latency) {
598                 /*
599                  * interested queues (we consider only the ones with the same
600                  * priority class in the cfq group)
601                  */
602                 unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
603                                                 cfq_class_rt(cfqq));
604                 unsigned sync_slice = cfqd->cfq_slice[1];
605                 unsigned expect_latency = sync_slice * iq;
606                 unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
607
608                 if (expect_latency > group_slice) {
609                         unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
610                         /* scale low_slice according to IO priority
611                          * and sync vs async */
612                         unsigned low_slice =
613                                 min(slice, base_low_slice * slice / sync_slice);
614                         /* the adapted slice value is scaled to fit all iqs
615                          * into the target latency */
616                         slice = max(slice * group_slice / expect_latency,
617                                     low_slice);
618                 }
619         }
620         cfqq->slice_start = jiffies;
621         cfqq->slice_end = jiffies + slice;
622         cfqq->allocated_slice = slice;
623         cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
624 }
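
/*
 * Worked example for the low-latency scaling above, assuming HZ=1000: with a
 * 300ms group slice, a 100ms sync slice and iq = 6 interested queues,
 * expect_latency = 600ms exceeds the group slice, so a 100ms slice is scaled
 * to 100 * 300/600 = 50ms (bounded below by low_slice).
 */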
625
626 /*
627  * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
628  * isn't valid until the first request from the dispatch is activated
629  * and the slice time set.
630  */
631 static inline bool cfq_slice_used(struct cfq_queue *cfqq)
632 {
633         if (cfq_cfqq_slice_new(cfqq))
634                 return 0;
635         if (time_before(jiffies, cfqq->slice_end))
636                 return 0;
637
638         return 1;
639 }
640
641 /*
642  * Lifted from AS - choose which of rq1 and rq2 that is best served now.
643  * We choose the request that is closest to the head right now. Distance
644  * behind the head is penalized and only allowed to a certain extent.
645  */
646 static struct request *
647 cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
648 {
649         sector_t s1, s2, d1 = 0, d2 = 0;
650         unsigned long back_max;
651 #define CFQ_RQ1_WRAP    0x01 /* request 1 wraps */
652 #define CFQ_RQ2_WRAP    0x02 /* request 2 wraps */
653         unsigned wrap = 0; /* bit mask: requests behind the disk head? */
654
655         if (rq1 == NULL || rq1 == rq2)
656                 return rq2;
657         if (rq2 == NULL)
658                 return rq1;
659
660         if (rq_is_sync(rq1) && !rq_is_sync(rq2))
661                 return rq1;
662         else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
663                 return rq2;
664         if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
665                 return rq1;
666         else if ((rq2->cmd_flags & REQ_META) &&
667                  !(rq1->cmd_flags & REQ_META))
668                 return rq2;
669
670         s1 = blk_rq_pos(rq1);
671         s2 = blk_rq_pos(rq2);
672
673         /*
674          * by definition, 1KiB is 2 sectors
675          */
676         back_max = cfqd->cfq_back_max * 2;
677
678         /*
679          * Strict one way elevator _except_ in the case where we allow
680          * short backward seeks which are biased as twice the cost of a
681          * similar forward seek.
682          */
683         if (s1 >= last)
684                 d1 = s1 - last;
685         else if (s1 + back_max >= last)
686                 d1 = (last - s1) * cfqd->cfq_back_penalty;
687         else
688                 wrap |= CFQ_RQ1_WRAP;
689
690         if (s2 >= last)
691                 d2 = s2 - last;
692         else if (s2 + back_max >= last)
693                 d2 = (last - s2) * cfqd->cfq_back_penalty;
694         else
695                 wrap |= CFQ_RQ2_WRAP;
696
697         /* Found required data */
698
699         /*
700          * By doing switch() on the bit mask "wrap" we avoid having to
701          * check two variables for all permutations: --> faster!
702          */
703         switch (wrap) {
704         case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
705                 if (d1 < d2)
706                         return rq1;
707                 else if (d2 < d1)
708                         return rq2;
709                 else {
710                         if (s1 >= s2)
711                                 return rq1;
712                         else
713                                 return rq2;
714                 }
715
716         case CFQ_RQ2_WRAP:
717                 return rq1;
718         case CFQ_RQ1_WRAP:
719                 return rq2;
720         case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
721         default:
722                 /*
723                  * Since both rqs are wrapped,
724                  * start with the one that's further behind head
725                  * (--> only *one* back seek required),
726                  * since back seek takes more time than forward.
727                  */
728                 if (s1 <= s2)
729                         return rq1;
730                 else
731                         return rq2;
732         }
733 }
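
/*
 * Worked example with the default back_max and penalty: if the head is at
 * sector 1000, rq1 at 1100 gives d1 = 100 while rq2 at 980 (a short backward
 * seek) gives d2 = (1000 - 980) * 2 = 40, so rq2 is chosen even though it
 * lies behind the head.
 */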
734
735 /*
736  * The below is leftmost cache rbtree addon
737  */
738 static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
739 {
740         /* Service tree is empty */
741         if (!root->count)
742                 return NULL;
743
744         if (!root->left)
745                 root->left = rb_first(&root->rb);
746
747         if (root->left)
748                 return rb_entry(root->left, struct cfq_queue, rb_node);
749
750         return NULL;
751 }
752
753 static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
754 {
755         if (!root->left)
756                 root->left = rb_first(&root->rb);
757
758         if (root->left)
759                 return rb_entry_cfqg(root->left);
760
761         return NULL;
762 }
763
764 static void rb_erase_init(struct rb_node *n, struct rb_root *root)
765 {
766         rb_erase(n, root);
767         RB_CLEAR_NODE(n);
768 }
769
770 static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
771 {
772         if (root->left == n)
773                 root->left = NULL;
774         rb_erase_init(n, &root->rb);
775         --root->count;
776 }
777
778 /*
779  * would be nice to take fifo expire time into account as well
780  */
781 static struct request *
782 cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
783                   struct request *last)
784 {
785         struct rb_node *rbnext = rb_next(&last->rb_node);
786         struct rb_node *rbprev = rb_prev(&last->rb_node);
787         struct request *next = NULL, *prev = NULL;
788
789         BUG_ON(RB_EMPTY_NODE(&last->rb_node));
790
791         if (rbprev)
792                 prev = rb_entry_rq(rbprev);
793
794         if (rbnext)
795                 next = rb_entry_rq(rbnext);
796         else {
797                 rbnext = rb_first(&cfqq->sort_list);
798                 if (rbnext && rbnext != &last->rb_node)
799                         next = rb_entry_rq(rbnext);
800         }
801
802         return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
803 }
804
805 static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
806                                       struct cfq_queue *cfqq)
807 {
808         /*
809          * just an approximation, should be ok.
810          */
811         return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
812                        cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
813 }
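
/*
 * Worked example, assuming HZ=1000 and default slices: the top-priority sync
 * slice is 180 jiffies and an ioprio 4 sync queue gets 100, so in a group
 * with 5 queues that queue's rb_key is pushed (5 - 1) * (180 - 100) = 320
 * jiffies further out than a top-priority queue would be.
 */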
814
815 static inline s64
816 cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
817 {
818         return cfqg->vdisktime - st->min_vdisktime;
819 }
820
821 static void
822 __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
823 {
824         struct rb_node **node = &st->rb.rb_node;
825         struct rb_node *parent = NULL;
826         struct cfq_group *__cfqg;
827         s64 key = cfqg_key(st, cfqg);
828         int left = 1;
829
830         while (*node != NULL) {
831                 parent = *node;
832                 __cfqg = rb_entry_cfqg(parent);
833
834                 if (key < cfqg_key(st, __cfqg))
835                         node = &parent->rb_left;
836                 else {
837                         node = &parent->rb_right;
838                         left = 0;
839                 }
840         }
841
842         if (left)
843                 st->left = &cfqg->rb_node;
844
845         rb_link_node(&cfqg->rb_node, parent, node);
846         rb_insert_color(&cfqg->rb_node, &st->rb);
847 }
848
849 static void
850 cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
851 {
852         struct cfq_rb_root *st = &cfqd->grp_service_tree;
853         struct cfq_group *__cfqg;
854         struct rb_node *n;
855
856         cfqg->nr_cfqq++;
857         if (cfqg->on_st)
858                 return;
859
860         /*
861          * Currently put the group at the end. Later implement something
862          * so that groups get a lesser vtime based on their weights, so that
863          * a group does not lose everything if it was not continuously backlogged.
864          */
865         n = rb_last(&st->rb);
866         if (n) {
867                 __cfqg = rb_entry_cfqg(n);
868                 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
869         } else
870                 cfqg->vdisktime = st->min_vdisktime;
871
872         __cfq_group_service_tree_add(st, cfqg);
873         cfqg->on_st = true;
874         st->total_weight += cfqg->weight;
875 }
876
877 static void
878 cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
879 {
880         struct cfq_rb_root *st = &cfqd->grp_service_tree;
881
882         if (st->active == &cfqg->rb_node)
883                 st->active = NULL;
884
885         BUG_ON(cfqg->nr_cfqq < 1);
886         cfqg->nr_cfqq--;
887
888         /* If there are other cfq queues under this group, don't delete it */
889         if (cfqg->nr_cfqq)
890                 return;
891
892         cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
893         cfqg->on_st = false;
894         st->total_weight -= cfqg->weight;
895         if (!RB_EMPTY_NODE(&cfqg->rb_node))
896                 cfq_rb_erase(&cfqg->rb_node, st);
897         cfqg->saved_workload_slice = 0;
898         cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
899 }
900
901 static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
902 {
903         unsigned int slice_used;
904
905         /*
906          * Queue got expired before even a single request completed or
907          * got expired immediately after first request completion.
908          */
909         if (!cfqq->slice_start || cfqq->slice_start == jiffies) {
910                 /*
911                  * Also charge the seek time incurred to the group, otherwise
912                  * if there are multiple queues in the group, each can dispatch
913                  * a single request on seeky media and cause lots of seek time
914                  * and group will never know it.
915                  */
916                 slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start),
917                                         1);
918         } else {
919                 slice_used = jiffies - cfqq->slice_start;
920                 if (slice_used > cfqq->allocated_slice)
921                         slice_used = cfqq->allocated_slice;
922         }
923
924         return slice_used;
925 }
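
/*
 * Worked example: a queue scheduled in at jiffies 1000 whose first request
 * still had not completed when it was expired at jiffies 1012 is charged
 * max(1012 - 1000, 1) = 12 jiffies, covering the seek time it caused, while
 * a queue that overran its 100-jiffy allocated_slice is capped at 100.
 */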
926
927 static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
928                                 struct cfq_queue *cfqq)
929 {
930         struct cfq_rb_root *st = &cfqd->grp_service_tree;
931         unsigned int used_sl, charge;
932         int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
933                         - cfqg->service_tree_idle.count;
934
935         BUG_ON(nr_sync < 0);
936         used_sl = charge = cfq_cfqq_slice_usage(cfqq);
937
938         if (iops_mode(cfqd))
939                 charge = cfqq->slice_dispatch;
940         else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
941                 charge = cfqq->allocated_slice;
942
943         /* Can't update vdisktime while group is on service tree */
944         cfq_rb_erase(&cfqg->rb_node, st);
945         cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
946         __cfq_group_service_tree_add(st, cfqg);
947
948         /* This group is being expired. Save the context */
949         if (time_after(cfqd->workload_expires, jiffies)) {
950                 cfqg->saved_workload_slice = cfqd->workload_expires
951                                                 - jiffies;
952                 cfqg->saved_workload = cfqd->serving_type;
953                 cfqg->saved_serving_prio = cfqd->serving_prio;
954         } else
955                 cfqg->saved_workload_slice = 0;
956
957         cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
958                                         st->min_vdisktime);
959         cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u",
960                         used_sl, cfqq->slice_dispatch, charge, iops_mode(cfqd));
961         cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl);
962         cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
963 }
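
/*
 * Accounting example: in iops_mode() the charge is cfqq->slice_dispatch, so
 * of two equal-weight groups the one whose queue dispatched 16 requests has
 * its vdisktime advanced twice as far as one that dispatched 8, and is
 * scheduled correspondingly later; fairness is then expressed in number of
 * IOs rather than in disk time.
 */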
964
965 #ifdef CONFIG_CFQ_GROUP_IOSCHED
966 static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
967 {
968         if (blkg)
969                 return container_of(blkg, struct cfq_group, blkg);
970         return NULL;
971 }
972
973 void
974 cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight)
975 {
976         cfqg_of_blkg(blkg)->weight = weight;
977 }
978
979 static struct cfq_group *
980 cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
981 {
982         struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
983         struct cfq_group *cfqg = NULL;
984         void *key = cfqd;
985         int i, j;
986         struct cfq_rb_root *st;
987         struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
988         unsigned int major, minor;
989
990         cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
991         if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
992                 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
993                 cfqg->blkg.dev = MKDEV(major, minor);
994                 goto done;
995         }
996         if (cfqg || !create)
997                 goto done;
998
999         cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
1000         if (!cfqg)
1001                 goto done;
1002
1003         for_each_cfqg_st(cfqg, i, j, st)
1004                 *st = CFQ_RB_ROOT;
1005         RB_CLEAR_NODE(&cfqg->rb_node);
1006
1007         /*
1008          * Take the initial reference that will be released on destroy.
1009          * This can be thought of as a joint reference by cgroup and
1010          * elevator which will be dropped by either elevator exit
1011          * or cgroup deletion path depending on who is exiting first.
1012          */
1013         atomic_set(&cfqg->ref, 1);
1014
1015         /* Add group onto cgroup list */
1016         sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
1017         cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
1018                                         MKDEV(major, minor));
1019         cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
1020
1021         /* Add group on cfqd list */
1022         hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
1023
1024 done:
1025         return cfqg;
1026 }
1027
1028 /*
1029  * Search for the cfq group current task belongs to. If create = 1, then also
1030  * create the cfq group if it does not exist. request_queue lock must be held.
1031  */
1032 static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
1033 {
1034         struct cgroup *cgroup;
1035         struct cfq_group *cfqg = NULL;
1036
1037         rcu_read_lock();
1038         cgroup = task_cgroup(current, blkio_subsys_id);
1039         cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create);
1040         if (!cfqg && create)
1041                 cfqg = &cfqd->root_group;
1042         rcu_read_unlock();
1043         return cfqg;
1044 }
1045
1046 static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
1047 {
1048         atomic_inc(&cfqg->ref);
1049         return cfqg;
1050 }
1051
1052 static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1053 {
1054         /* Currently, all async queues are mapped to root group */
1055         if (!cfq_cfqq_sync(cfqq))
1056                 cfqg = &cfqq->cfqd->root_group;
1057
1058         cfqq->cfqg = cfqg;
1059         /* cfqq reference on cfqg */
1060         atomic_inc(&cfqq->cfqg->ref);
1061 }
1062
1063 static void cfq_put_cfqg(struct cfq_group *cfqg)
1064 {
1065         struct cfq_rb_root *st;
1066         int i, j;
1067
1068         BUG_ON(atomic_read(&cfqg->ref) <= 0);
1069         if (!atomic_dec_and_test(&cfqg->ref))
1070                 return;
1071         for_each_cfqg_st(cfqg, i, j, st)
1072                 BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
1073         kfree(cfqg);
1074 }
1075
1076 static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
1077 {
1078         /* Something wrong if we are trying to remove same group twice */
1079         BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
1080
1081         hlist_del_init(&cfqg->cfqd_node);
1082
1083         /*
1084          * Put the reference taken at the time of creation so that when all
1085          * queues are gone, group can be destroyed.
1086          */
1087         cfq_put_cfqg(cfqg);
1088 }
1089
1090 static void cfq_release_cfq_groups(struct cfq_data *cfqd)
1091 {
1092         struct hlist_node *pos, *n;
1093         struct cfq_group *cfqg;
1094
1095         hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) {
1096                 /*
1097                  * If cgroup removal path got to blk_group first and removed
1098                  * it from cgroup list, then it will take care of destroying
1099                  * cfqg also.
1100                  */
1101                 if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
1102                         cfq_destroy_cfqg(cfqd, cfqg);
1103         }
1104 }
1105
1106 /*
1107  * Blk cgroup controller notification saying that blkio_group object is being
1108  * delinked as associated cgroup object is going away. That also means that
1109  * no new IO will come in this group. So get rid of this group as soon as
1110  * any pending IO in the group is finished.
1111  *
1112  * This function is called under rcu_read_lock(). key is the rcu protected
1113  * pointer. That means "key" is a valid cfq_data pointer as long as we hold the rcu
1114  * read lock.
1115  *
1116  * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
1117  * it should not be NULL as even if elevator was exiting, cgroup deletion
1118  * path got to it first.
1119  */
1120 void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
1121 {
1122         unsigned long  flags;
1123         struct cfq_data *cfqd = key;
1124
1125         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1126         cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg));
1127         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
1128 }
1129
1130 #else /* GROUP_IOSCHED */
1131 static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
1132 {
1133         return &cfqd->root_group;
1134 }
1135
1136 static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
1137 {
1138         return cfqg;
1139 }
1140
1141 static inline void
1142 cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
1143         cfqq->cfqg = cfqg;
1144 }
1145
1146 static void cfq_release_cfq_groups(struct cfq_data *cfqd) {}
1147 static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
1148
1149 #endif /* GROUP_IOSCHED */
1150
1151 /*
1152  * The cfqd->service_trees holds all pending cfq_queue's that have
1153  * requests waiting to be processed. It is sorted in the order that
1154  * we will service the queues.
1155  */
1156 static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1157                                  bool add_front)
1158 {
1159         struct rb_node **p, *parent;
1160         struct cfq_queue *__cfqq;
1161         unsigned long rb_key;
1162         struct cfq_rb_root *service_tree;
1163         int left;
1164         int new_cfqq = 1;
1165         int group_changed = 0;
1166
1167 #ifdef CONFIG_CFQ_GROUP_IOSCHED
1168         if (!cfqd->cfq_group_isolation
1169             && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD
1170             && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
1171                 /* Move this cfq to root group */
1172                 cfq_log_cfqq(cfqd, cfqq, "moving to root group");
1173                 if (!RB_EMPTY_NODE(&cfqq->rb_node))
1174                         cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1175                 cfqq->orig_cfqg = cfqq->cfqg;
1176                 cfqq->cfqg = &cfqd->root_group;
1177                 atomic_inc(&cfqd->root_group.ref);
1178                 group_changed = 1;
1179         } else if (!cfqd->cfq_group_isolation
1180                    && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
1181                 /* cfqq is sequential now and needs to go back to its original group */
1182                 BUG_ON(cfqq->cfqg != &cfqd->root_group);
1183                 if (!RB_EMPTY_NODE(&cfqq->rb_node))
1184                         cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1185                 cfq_put_cfqg(cfqq->cfqg);
1186                 cfqq->cfqg = cfqq->orig_cfqg;
1187                 cfqq->orig_cfqg = NULL;
1188                 group_changed = 1;
1189                 cfq_log_cfqq(cfqd, cfqq, "moved to origin group");
1190         }
1191 #endif
1192
1193         service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
1194                                                 cfqq_type(cfqq));
1195         if (cfq_class_idle(cfqq)) {
1196                 rb_key = CFQ_IDLE_DELAY;
1197                 parent = rb_last(&service_tree->rb);
1198                 if (parent && parent != &cfqq->rb_node) {
1199                         __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1200                         rb_key += __cfqq->rb_key;
1201                 } else
1202                         rb_key += jiffies;
1203         } else if (!add_front) {
1204                 /*
1205                  * Get our rb key offset. Subtract any residual slice
1206                  * value carried from last service. A negative resid
1207                  * count indicates slice overrun, and this should position
1208                  * the next service time further away in the tree.
1209                  */
1210                 rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
1211                 rb_key -= cfqq->slice_resid;
1212                 cfqq->slice_resid = 0;
1213         } else {
1214                 rb_key = -HZ;
1215                 __cfqq = cfq_rb_first(service_tree);
1216                 rb_key += __cfqq ? __cfqq->rb_key : jiffies;
1217         }
1218
1219         if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
1220                 new_cfqq = 0;
1221                 /*
1222                  * same position, nothing more to do
1223                  */
1224                 if (rb_key == cfqq->rb_key &&
1225                     cfqq->service_tree == service_tree)
1226                         return;
1227
1228                 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
1229                 cfqq->service_tree = NULL;
1230         }
1231
1232         left = 1;
1233         parent = NULL;
1234         cfqq->service_tree = service_tree;
1235         p = &service_tree->rb.rb_node;
1236         while (*p) {
1237                 struct rb_node **n;
1238
1239                 parent = *p;
1240                 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1241
1242                 /*
1243                  * sort by key, that represents service time.
1244                  */
1245                 if (time_before(rb_key, __cfqq->rb_key))
1246                         n = &(*p)->rb_left;
1247                 else {
1248                         n = &(*p)->rb_right;
1249                         left = 0;
1250                 }
1251
1252                 p = n;
1253         }
1254
1255         if (left)
1256                 service_tree->left = &cfqq->rb_node;
1257
1258         cfqq->rb_key = rb_key;
1259         rb_link_node(&cfqq->rb_node, parent, p);
1260         rb_insert_color(&cfqq->rb_node, &service_tree->rb);
1261         service_tree->count++;
1262         if ((add_front || !new_cfqq) && !group_changed)
1263                 return;
1264         cfq_group_service_tree_add(cfqd, cfqq->cfqg);
1265 }
1266
1267 static struct cfq_queue *
1268 cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
1269                      sector_t sector, struct rb_node **ret_parent,
1270                      struct rb_node ***rb_link)
1271 {
1272         struct rb_node **p, *parent;
1273         struct cfq_queue *cfqq = NULL;
1274
1275         parent = NULL;
1276         p = &root->rb_node;
1277         while (*p) {
1278                 struct rb_node **n;
1279
1280                 parent = *p;
1281                 cfqq = rb_entry(parent, struct cfq_queue, p_node);
1282
1283                 /*
1284                  * Sort strictly based on sector.  Smallest to the left,
1285                  * largest to the right.
1286                  */
1287                 if (sector > blk_rq_pos(cfqq->next_rq))
1288                         n = &(*p)->rb_right;
1289                 else if (sector < blk_rq_pos(cfqq->next_rq))
1290                         n = &(*p)->rb_left;
1291                 else
1292                         break;
1293                 p = n;
1294                 cfqq = NULL;
1295         }
1296
1297         *ret_parent = parent;
1298         if (rb_link)
1299                 *rb_link = p;
1300         return cfqq;
1301 }
1302
1303 static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1304 {
1305         struct rb_node **p, *parent;
1306         struct cfq_queue *__cfqq;
1307
1308         if (cfqq->p_root) {
1309                 rb_erase(&cfqq->p_node, cfqq->p_root);
1310                 cfqq->p_root = NULL;
1311         }
1312
1313         if (cfq_class_idle(cfqq))
1314                 return;
1315         if (!cfqq->next_rq)
1316                 return;
1317
1318         cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
1319         __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
1320                                       blk_rq_pos(cfqq->next_rq), &parent, &p);
1321         if (!__cfqq) {
1322                 rb_link_node(&cfqq->p_node, parent, p);
1323                 rb_insert_color(&cfqq->p_node, cfqq->p_root);
1324         } else
1325                 cfqq->p_root = NULL;
1326 }
1327
1328 /*
1329  * Update cfqq's position in the service tree.
1330  */
1331 static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1332 {
1333         /*
1334          * Resorting requires the cfqq to be on the RR list already.
1335          */
1336         if (cfq_cfqq_on_rr(cfqq)) {
1337                 cfq_service_tree_add(cfqd, cfqq, 0);
1338                 cfq_prio_tree_add(cfqd, cfqq);
1339         }
1340 }
1341
1342 /*
1343  * add to busy list of queues for service, trying to be fair in ordering
1344  * the pending list according to last request service
1345  */
1346 static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1347 {
1348         cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
1349         BUG_ON(cfq_cfqq_on_rr(cfqq));
1350         cfq_mark_cfqq_on_rr(cfqq);
1351         cfqd->busy_queues++;
1352
1353         cfq_resort_rr_list(cfqd, cfqq);
1354 }
1355
1356 /*
1357  * Called when the cfqq no longer has requests pending, remove it from
1358  * the service tree.
1359  */
1360 static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1361 {
1362         cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
1363         BUG_ON(!cfq_cfqq_on_rr(cfqq));
1364         cfq_clear_cfqq_on_rr(cfqq);
1365
1366         if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
1367                 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
1368                 cfqq->service_tree = NULL;
1369         }
1370         if (cfqq->p_root) {
1371                 rb_erase(&cfqq->p_node, cfqq->p_root);
1372                 cfqq->p_root = NULL;
1373         }
1374
1375         cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1376         BUG_ON(!cfqd->busy_queues);
1377         cfqd->busy_queues--;
1378 }
1379
1380 /*
1381  * rb tree support functions
1382  */
1383 static void cfq_del_rq_rb(struct request *rq)
1384 {
1385         struct cfq_queue *cfqq = RQ_CFQQ(rq);
1386         const int sync = rq_is_sync(rq);
1387
1388         BUG_ON(!cfqq->queued[sync]);
1389         cfqq->queued[sync]--;
1390
1391         elv_rb_del(&cfqq->sort_list, rq);
1392
1393         if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
1394                 /*
1395                  * Queue will be deleted from service tree when we actually
1396                  * expire it later. Right now just remove it from prio tree
1397                  * as it is empty.
1398                  */
1399                 if (cfqq->p_root) {
1400                         rb_erase(&cfqq->p_node, cfqq->p_root);
1401                         cfqq->p_root = NULL;
1402                 }
1403         }
1404 }
1405
1406 static void cfq_add_rq_rb(struct request *rq)
1407 {
1408         struct cfq_queue *cfqq = RQ_CFQQ(rq);
1409         struct cfq_data *cfqd = cfqq->cfqd;
1410         struct request *__alias, *prev;
1411
1412         cfqq->queued[rq_is_sync(rq)]++;
1413
1414         /*
1415          * looks a little odd, but the first insert might return an alias.
1416          * if that happens, put the alias on the dispatch list
1417          */
1418         while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL)
1419                 cfq_dispatch_insert(cfqd->queue, __alias);
1420
1421         if (!cfq_cfqq_on_rr(cfqq))
1422                 cfq_add_cfqq_rr(cfqd, cfqq);
1423
1424         /*
1425          * check if this request is a better next-serve candidate
1426          */
1427         prev = cfqq->next_rq;
1428         cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
1429
1430         /*
1431          * adjust priority tree position, if ->next_rq changes
1432          */
1433         if (prev != cfqq->next_rq)
1434                 cfq_prio_tree_add(cfqd, cfqq);
1435
1436         BUG_ON(!cfqq->next_rq);
1437 }
1438
1439 static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
1440 {
1441         elv_rb_del(&cfqq->sort_list, rq);
1442         cfqq->queued[rq_is_sync(rq)]--;
1443         cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
1444                                         rq_data_dir(rq), rq_is_sync(rq));
1445         cfq_add_rq_rb(rq);
1446         cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
1447                         &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq),
1448                         rq_is_sync(rq));
1449 }
1450
1451 static struct request *
1452 cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
1453 {
1454         struct task_struct *tsk = current;
1455         struct cfq_io_context *cic;
1456         struct cfq_queue *cfqq;
1457
1458         cic = cfq_cic_lookup(cfqd, tsk->io_context);
1459         if (!cic)
1460                 return NULL;
1461
1462         cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
1463         if (cfqq) {
1464                 sector_t sector = bio->bi_sector + bio_sectors(bio);
1465
1466                 return elv_rb_find(&cfqq->sort_list, sector);
1467         }
1468
1469         return NULL;
1470 }
1471
1472 static void cfq_activate_request(struct request_queue *q, struct request *rq)
1473 {
1474         struct cfq_data *cfqd = q->elevator->elevator_data;
1475
1476         cfqd->rq_in_driver++;
1477         cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
1478                                                 cfqd->rq_in_driver);
1479
1480         cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
1481 }
1482
1483 static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
1484 {
1485         struct cfq_data *cfqd = q->elevator->elevator_data;
1486
1487         WARN_ON(!cfqd->rq_in_driver);
1488         cfqd->rq_in_driver--;
1489         cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
1490                                                 cfqd->rq_in_driver);
1491 }
1492
1493 static void cfq_remove_request(struct request *rq)
1494 {
1495         struct cfq_queue *cfqq = RQ_CFQQ(rq);
1496
1497         if (cfqq->next_rq == rq)
1498                 cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
1499
1500         list_del_init(&rq->queuelist);
1501         cfq_del_rq_rb(rq);
1502
1503         cfqq->cfqd->rq_queued--;
1504         cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
1505                                         rq_data_dir(rq), rq_is_sync(rq));
1506         if (rq->cmd_flags & REQ_META) {
1507                 WARN_ON(!cfqq->meta_pending);
1508                 cfqq->meta_pending--;
1509         }
1510 }
1511
1512 static int cfq_merge(struct request_queue *q, struct request **req,
1513                      struct bio *bio)
1514 {
1515         struct cfq_data *cfqd = q->elevator->elevator_data;
1516         struct request *__rq;
1517
1518         __rq = cfq_find_rq_fmerge(cfqd, bio);
1519         if (__rq && elv_rq_merge_ok(__rq, bio)) {
1520                 *req = __rq;
1521                 return ELEVATOR_FRONT_MERGE;
1522         }
1523
1524         return ELEVATOR_NO_MERGE;
1525 }
1526
1527 static void cfq_merged_request(struct request_queue *q, struct request *req,
1528                                int type)
1529 {
1530         if (type == ELEVATOR_FRONT_MERGE) {
1531                 struct cfq_queue *cfqq = RQ_CFQQ(req);
1532
1533                 cfq_reposition_rq_rb(cfqq, req);
1534         }
1535 }
1536
1537 static void cfq_bio_merged(struct request_queue *q, struct request *req,
1538                                 struct bio *bio)
1539 {
1540         cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg,
1541                                         bio_data_dir(bio), cfq_bio_sync(bio));
1542 }
1543
1544 static void
1545 cfq_merged_requests(struct request_queue *q, struct request *rq,
1546                     struct request *next)
1547 {
1548         struct cfq_queue *cfqq = RQ_CFQQ(rq);
1549         /*
1550          * reposition in fifo if next is older than rq
1551          */
1552         if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
1553             time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
1554                 list_move(&rq->queuelist, &next->queuelist);
1555                 rq_set_fifo_time(rq, rq_fifo_time(next));
1556         }
1557
1558         if (cfqq->next_rq == next)
1559                 cfqq->next_rq = rq;
1560         cfq_remove_request(next);
1561         cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg,
1562                                         rq_data_dir(next), rq_is_sync(next));
1563 }
1564
1565 static int cfq_allow_merge(struct request_queue *q, struct request *rq,
1566                            struct bio *bio)
1567 {
1568         struct cfq_data *cfqd = q->elevator->elevator_data;
1569         struct cfq_io_context *cic;
1570         struct cfq_queue *cfqq;
1571
1572         /*
1573          * Disallow merge of a sync bio into an async request.
1574          */
1575         if (cfq_bio_sync(bio) && !rq_is_sync(rq))
1576                 return false;
1577
1578         /*
1579          * Lookup the cfqq that this bio will be queued with. Allow
1580          * merge only if rq is queued there.
1581          */
1582         cic = cfq_cic_lookup(cfqd, current->io_context);
1583         if (!cic)
1584                 return false;
1585
1586         cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
1587         return cfqq == RQ_CFQQ(rq);
1588 }
1589
1590 static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1591 {
1592         del_timer(&cfqd->idle_slice_timer);
1593         cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
1594 }
1595
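/*
 * Make @cfqq the active queue: reset its slice accounting and clear the
 * per-slice state flags. Passing a NULL @cfqq simply clears the active
 * queue.
 */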
1596 static void __cfq_set_active_queue(struct cfq_data *cfqd,
1597                                    struct cfq_queue *cfqq)
1598 {
1599         if (cfqq) {
1600                 cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
1601                                 cfqd->serving_prio, cfqd->serving_type);
1602                 cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg);
1603                 cfqq->slice_start = 0;
1604                 cfqq->dispatch_start = jiffies;
1605                 cfqq->allocated_slice = 0;
1606                 cfqq->slice_end = 0;
1607                 cfqq->slice_dispatch = 0;
1608
1609                 cfq_clear_cfqq_wait_request(cfqq);
1610                 cfq_clear_cfqq_must_dispatch(cfqq);
1611                 cfq_clear_cfqq_must_alloc_slice(cfqq);
1612                 cfq_clear_cfqq_fifo_expire(cfqq);
1613                 cfq_mark_cfqq_slice_new(cfqq);
1614
1615                 cfq_del_timer(cfqd, cfqq);
1616         }
1617
1618         cfqd->active_queue = cfqq;
1619 }
1620
1621 /*
1622  * current cfqq expired its slice (or was too idle), select new one
1623  */
1624 static void
1625 __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1626                     bool timed_out)
1627 {
1628         cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
1629
1630         if (cfq_cfqq_wait_request(cfqq))
1631                 cfq_del_timer(cfqd, cfqq);
1632
1633         cfq_clear_cfqq_wait_request(cfqq);
1634         cfq_clear_cfqq_wait_busy(cfqq);
1635
1636         /*
1637          * If this cfqq is shared between multiple processes, check to
1638          * make sure that those processes are still issuing I/Os within
1639          * the mean seek distance.  If not, it may be time to break the
1640          * queues apart again.
1641          */
1642         if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
1643                 cfq_mark_cfqq_split_coop(cfqq);
1644
1645         /*
1646          * store what was left of this slice, if the queue idled/timed out
1647          */
1648         if (timed_out && !cfq_cfqq_slice_new(cfqq)) {
1649                 cfqq->slice_resid = cfqq->slice_end - jiffies;
1650                 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
1651         }
1652
1653         cfq_group_served(cfqd, cfqq->cfqg, cfqq);
1654
1655         if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
1656                 cfq_del_cfqq_rr(cfqd, cfqq);
1657
1658         cfq_resort_rr_list(cfqd, cfqq);
1659
1660         if (cfqq == cfqd->active_queue)
1661                 cfqd->active_queue = NULL;
1662
1663         if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
1664                 cfqd->grp_service_tree.active = NULL;
1665
1666         if (cfqd->active_cic) {
1667                 put_io_context(cfqd->active_cic->ioc);
1668                 cfqd->active_cic = NULL;
1669         }
1670 }
1671
1672 static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
1673 {
1674         struct cfq_queue *cfqq = cfqd->active_queue;
1675
1676         if (cfqq)
1677                 __cfq_slice_expired(cfqd, cfqq, timed_out);
1678 }
1679
1680 /*
1681  * Get next queue for service. Unless we have a queue preemption,
1682  * we'll simply select the first cfqq in the service tree.
1683  */
1684 static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
1685 {
1686         struct cfq_rb_root *service_tree =
1687                 service_tree_for(cfqd->serving_group, cfqd->serving_prio,
1688                                         cfqd->serving_type);
1689
1690         if (!cfqd->rq_queued)
1691                 return NULL;
1692
1693         /* There is nothing to dispatch */
1694         if (!service_tree)
1695                 return NULL;
1696         if (RB_EMPTY_ROOT(&service_tree->rb))
1697                 return NULL;
1698         return cfq_rb_first(service_tree);
1699 }
1700
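/*
 * Used by forced dispatch: return any busy queue from the next group's
 * service trees, bypassing the normal workload selection.
 */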
1701 static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
1702 {
1703         struct cfq_group *cfqg;
1704         struct cfq_queue *cfqq;
1705         int i, j;
1706         struct cfq_rb_root *st;
1707
1708         if (!cfqd->rq_queued)
1709                 return NULL;
1710
1711         cfqg = cfq_get_next_cfqg(cfqd);
1712         if (!cfqg)
1713                 return NULL;
1714
1715         for_each_cfqg_st(cfqg, i, j, st)
1716                 if ((cfqq = cfq_rb_first(st)) != NULL)
1717                         return cfqq;
1718         return NULL;
1719 }
1720
1721 /*
1722  * Get and set a new active queue for service.
1723  */
1724 static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
1725                                               struct cfq_queue *cfqq)
1726 {
1727         if (!cfqq)
1728                 cfqq = cfq_get_next_queue(cfqd);
1729
1730         __cfq_set_active_queue(cfqd, cfqq);
1731         return cfqq;
1732 }
1733
1734 static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
1735                                           struct request *rq)
1736 {
1737         if (blk_rq_pos(rq) >= cfqd->last_position)
1738                 return blk_rq_pos(rq) - cfqd->last_position;
1739         else
1740                 return cfqd->last_position - blk_rq_pos(rq);
1741 }
1742
1743 static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1744                                struct request *rq)
1745 {
1746         return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
1747 }
1748
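/*
 * Search the per-priority rbtree (keyed by the sector of each queue's
 * next request) for a queue whose next request lies within CFQQ_CLOSE_THR
 * of the current head position: first an exact match, then the parent of
 * the failed lookup, then its neighbour on the other side of the sector.
 */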
1749 static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
1750                                     struct cfq_queue *cur_cfqq)
1751 {
1752         struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
1753         struct rb_node *parent, *node;
1754         struct cfq_queue *__cfqq;
1755         sector_t sector = cfqd->last_position;
1756
1757         if (RB_EMPTY_ROOT(root))
1758                 return NULL;
1759
1760         /*
1761          * First, if we find a request starting at the end of the last
1762          * request, choose it.
1763          */
1764         __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
1765         if (__cfqq)
1766                 return __cfqq;
1767
1768         /*
1769          * If the exact sector wasn't found, the parent of the NULL leaf
1770          * will contain the closest sector.
1771          */
1772         __cfqq = rb_entry(parent, struct cfq_queue, p_node);
1773         if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
1774                 return __cfqq;
1775
1776         if (blk_rq_pos(__cfqq->next_rq) < sector)
1777                 node = rb_next(&__cfqq->p_node);
1778         else
1779                 node = rb_prev(&__cfqq->p_node);
1780         if (!node)
1781                 return NULL;
1782
1783         __cfqq = rb_entry(node, struct cfq_queue, p_node);
1784         if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
1785                 return __cfqq;
1786
1787         return NULL;
1788 }
1789
1790 /*
1791  * cfqd - obvious
1792  * cur_cfqq - passed in so that we don't decide that the current queue is
1793  *            closely cooperating with itself.
1794  *
1795  * So, basically we're assuming that cur_cfqq has dispatched at least
1796  * one request, and that cfqd->last_position reflects a position on the disk
1797  * associated with the I/O issued by cur_cfqq.  I'm not sure this is a valid
1798  * assumption.
1799  */
1800 static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
1801                                               struct cfq_queue *cur_cfqq)
1802 {
1803         struct cfq_queue *cfqq;
1804
1805         if (cfq_class_idle(cur_cfqq))
1806                 return NULL;
1807         if (!cfq_cfqq_sync(cur_cfqq))
1808                 return NULL;
1809         if (CFQQ_SEEKY(cur_cfqq))
1810                 return NULL;
1811
1812         /*
1813          * Don't search priority tree if it's the only queue in the group.
1814          */
1815         if (cur_cfqq->cfqg->nr_cfqq == 1)
1816                 return NULL;
1817
1818         /*
1819          * We should notice if some of the queues are cooperating, e.g.
1820          * working closely on the same area of the disk. In that case,
1821          * we can group them together so that we don't waste time idling.
1822          */
1823         cfqq = cfqq_close(cfqd, cur_cfqq);
1824         if (!cfqq)
1825                 return NULL;
1826
1827         /* If new queue belongs to different cfq_group, don't choose it */
1828         if (cur_cfqq->cfqg != cfqq->cfqg)
1829                 return NULL;
1830
1831         /*
1832          * It only makes sense to merge sync queues.
1833          */
1834         if (!cfq_cfqq_sync(cfqq))
1835                 return NULL;
1836         if (CFQQ_SEEKY(cfqq))
1837                 return NULL;
1838
1839         /*
1840          * Do not merge queues of different priority classes
1841          */
1842         if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
1843                 return NULL;
1844
1845         return cfqq;
1846 }
1847
1848 /*
1849  * Determine whether we should enforce idle window for this queue.
1850  */
1852 static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1853 {
1854         enum wl_prio_t prio = cfqq_prio(cfqq);
1855         struct cfq_rb_root *service_tree = cfqq->service_tree;
1856
1857         BUG_ON(!service_tree);
1858         BUG_ON(!service_tree->count);
1859
1860         if (!cfqd->cfq_slice_idle)
1861                 return false;
1862
1863         /* We never do for idle class queues. */
1864         if (prio == IDLE_WORKLOAD)
1865                 return false;
1866
1867         /* We do for queues that were marked with idle window flag. */
1868         if (cfq_cfqq_idle_window(cfqq) &&
1869            !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
1870                 return true;
1871
1872         /*
1873          * Otherwise, we do only if they are the last ones
1874          * in their service tree.
1875          */
1876         if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
1877                 return true;
1878         cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
1879                         service_tree->count);
1880         return false;
1881 }
1882
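/*
 * Arm the idle timer for the active queue. The queue has no request
 * pending right now, but based on its think time history we expect one
 * soon, so keep the disk idle for up to cfq_slice_idle rather than
 * switching to another queue.
 */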
1883 static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1884 {
1885         struct cfq_queue *cfqq = cfqd->active_queue;
1886         struct cfq_io_context *cic;
1887         unsigned long sl;
1888
1889         /*
1890          * SSD device without seek penalty, disable idling. But only do so
1891          * for devices that support queuing, otherwise we still have a problem
1892          * with sync vs async workloads.
1893          */
1894         if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)
1895                 return;
1896
1897         WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
1898         WARN_ON(cfq_cfqq_slice_new(cfqq));
1899
1900         /*
1901          * idle is disabled, either manually or by past process history
1902          */
1903         if (!cfq_should_idle(cfqd, cfqq))
1904                 return;
1905
1906         /*
1907          * still active requests from this queue, don't idle
1908          */
1909         if (cfqq->dispatched)
1910                 return;
1911
1912         /*
1913          * task has exited, don't wait
1914          */
1915         cic = cfqd->active_cic;
1916         if (!cic || !atomic_read(&cic->ioc->nr_tasks))
1917                 return;
1918
1919         /*
1920          * If our average think time is larger than the remaining time
1921          * slice, then don't idle. This avoids overrunning the allotted
1922          * time slice.
1923          */
1924         if (sample_valid(cic->ttime_samples) &&
1925             (cfqq->slice_end - jiffies < cic->ttime_mean)) {
1926                 cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d",
1927                                 cic->ttime_mean);
1928                 return;
1929         }
1930
1931         cfq_mark_cfqq_wait_request(cfqq);
1932
1933         sl = cfqd->cfq_slice_idle;
1934
1935         mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
1936         cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
1937         cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
1938 }
1939
1940 /*
1941  * Move request from internal lists to the request queue dispatch list.
1942  */
1943 static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
1944 {
1945         struct cfq_data *cfqd = q->elevator->elevator_data;
1946         struct cfq_queue *cfqq = RQ_CFQQ(rq);
1947
1948         cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
1949
1950         cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
1951         cfq_remove_request(rq);
1952         cfqq->dispatched++;
1953         elv_dispatch_sort(q, rq);
1954
1955         cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
1956         cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq),
1957                                         rq_data_dir(rq), rq_is_sync(rq));
1958 }
1959
1960 /*
1961  * return expired entry, or NULL to just start from scratch in rbtree
1962  */
1963 static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
1964 {
1965         struct request *rq = NULL;
1966
1967         if (cfq_cfqq_fifo_expire(cfqq))
1968                 return NULL;
1969
1970         cfq_mark_cfqq_fifo_expire(cfqq);
1971
1972         if (list_empty(&cfqq->fifo))
1973                 return NULL;
1974
1975         rq = rq_entry_fifo(cfqq->fifo.next);
1976         if (time_before(jiffies, rq_fifo_time(rq)))
1977                 rq = NULL;
1978
1979         cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
1980         return rq;
1981 }
1982
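/*
 * Scale the per-slice dispatch quota (based on cfq_slice_async_rq) by io
 * priority: prio 0 gets the largest number of requests per slice, prio
 * IOPRIO_BE_NR - 1 the smallest.
 */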
1983 static inline int
1984 cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1985 {
1986         const int base_rq = cfqd->cfq_slice_async_rq;
1987
1988         WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
1989
1990         return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
1991 }
1992
1993 /*
1994  * Must be called with the queue_lock held.
1995  */
1996 static int cfqq_process_refs(struct cfq_queue *cfqq)
1997 {
1998         int process_refs, io_refs;
1999
2000         io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
2001         process_refs = atomic_read(&cfqq->ref) - io_refs;
2002         BUG_ON(process_refs < 0);
2003         return process_refs;
2004 }
2005
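/*
 * Schedule a merge of @cfqq into @new_cfqq (or the last queue in its merge
 * chain). The queue with fewer process references points at the one with
 * more, and the target gains one reference per process reference on the
 * source so it cannot go away before the merge actually happens.
 */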
2006 static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
2007 {
2008         int process_refs, new_process_refs;
2009         struct cfq_queue *__cfqq;
2010
2011         /*
2012          * If there are no process references on the new_cfqq, then it is
2013          * unsafe to follow the ->new_cfqq chain as other cfqq's in the
2014          * chain may have dropped their last reference (not just their
2015          * last process reference).
2016          */
2017         if (!cfqq_process_refs(new_cfqq))
2018                 return;
2019
2020         /* Avoid a circular list and skip interim queue merges */
2021         while ((__cfqq = new_cfqq->new_cfqq)) {
2022                 if (__cfqq == cfqq)
2023                         return;
2024                 new_cfqq = __cfqq;
2025         }
2026
2027         process_refs = cfqq_process_refs(cfqq);
2028         new_process_refs = cfqq_process_refs(new_cfqq);
2029         /*
2030          * If the process for the cfqq has gone away, there is no
2031          * sense in merging the queues.
2032          */
2033         if (process_refs == 0 || new_process_refs == 0)
2034                 return;
2035
2036         /*
2037          * Merge in the direction of the lesser amount of work.
2038          */
2039         if (new_process_refs >= process_refs) {
2040                 cfqq->new_cfqq = new_cfqq;
2041                 atomic_add(process_refs, &new_cfqq->ref);
2042         } else {
2043                 new_cfqq->new_cfqq = cfqq;
2044                 atomic_add(new_process_refs, &cfqq->ref);
2045         }
2046 }
2047
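/*
 * Within the given priority class, pick the workload type (ASYNC,
 * SYNC_NOIDLE or SYNC) whose first queued cfqq has the lowest rb_key,
 * i.e. the workload that has been waiting longest for service.
 */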
2048 static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
2049                                 struct cfq_group *cfqg, enum wl_prio_t prio)
2050 {
2051         struct cfq_queue *queue;
2052         int i;
2053         bool key_valid = false;
2054         unsigned long lowest_key = 0;
2055         enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
2056
2057         for (i = 0; i <= SYNC_WORKLOAD; ++i) {
2058                 /* select the one with lowest rb_key */
2059                 queue = cfq_rb_first(service_tree_for(cfqg, prio, i));
2060                 if (queue &&
2061                     (!key_valid || time_before(queue->rb_key, lowest_key))) {
2062                         lowest_key = queue->rb_key;
2063                         cur_best = i;
2064                         key_valid = true;
2065                 }
2066         }
2067
2068         return cur_best;
2069 }
2070
2071 static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
2072 {
2073         unsigned slice;
2074         unsigned count;
2075         struct cfq_rb_root *st;
2076         unsigned group_slice;
2077
2078         if (!cfqg) {
2079                 cfqd->serving_prio = IDLE_WORKLOAD;
2080                 cfqd->workload_expires = jiffies + 1;
2081                 return;
2082         }
2083
2084         /* Choose next priority. RT > BE > IDLE */
2085         if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
2086                 cfqd->serving_prio = RT_WORKLOAD;
2087         else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
2088                 cfqd->serving_prio = BE_WORKLOAD;
2089         else {
2090                 cfqd->serving_prio = IDLE_WORKLOAD;
2091                 cfqd->workload_expires = jiffies + 1;
2092                 return;
2093         }
2094
2095         /*
2096          * For RT and BE, we also have to choose the type
2097          * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
2098          * expiration time
2099          */
2100         st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
2101         count = st->count;
2102
2103         /*
2104          * check workload expiration, and that we still have other queues ready
2105          */
2106         if (count && !time_after(jiffies, cfqd->workload_expires))
2107                 return;
2108
2109         /* otherwise select new workload type */
2110         cfqd->serving_type =
2111                 cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
2112         st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
2113         count = st->count;
2114
2115         /*
2116          * the workload slice is computed as a fraction of target latency
2117          * proportional to the number of queues in that workload, over
2118          * all the queues in the same priority class
2119          */
2120         group_slice = cfq_group_slice(cfqd, cfqg);
2121
2122         slice = group_slice * count /
2123                 max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio],
2124                       cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg));
2125
2126         if (cfqd->serving_type == ASYNC_WORKLOAD) {
2127                 unsigned int tmp;
2128
2129                 /*
2130                  * Async queues are currently system wide. Just taking the
2131                  * proportion of queues within the same group will lead to a
2132                  * higher async ratio system wide, as the root group generally
2133                  * has a higher weight. A more accurate approach would be to
2134                  * calculate a system wide async/sync ratio.
2135                  */
2136                 tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg);
2137                 tmp = tmp/cfqd->busy_queues;
2138                 slice = min_t(unsigned, slice, tmp);
2139
2140                 /* async workload slice is scaled down according to
2141                  * the sync/async slice ratio. */
2142                 slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];
2143         } else
2144                 /* sync workload slice is at least 2 * cfq_slice_idle */
2145                 slice = max(slice, 2 * cfqd->cfq_slice_idle);
2146
2147         slice = max_t(unsigned, slice, CFQ_MIN_TT);
2148         cfq_log(cfqd, "workload slice:%d", slice);
2149         cfqd->workload_expires = jiffies + slice;
2150         cfqd->noidle_tree_requires_idle = false;
2151 }
2152
2153 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
2154 {
2155         struct cfq_rb_root *st = &cfqd->grp_service_tree;
2156         struct cfq_group *cfqg;
2157
2158         if (RB_EMPTY_ROOT(&st->rb))
2159                 return NULL;
2160         cfqg = cfq_rb_first_group(st);
2161         st->active = &cfqg->rb_node;
2162         update_min_vdisktime(st);
2163         return cfqg;
2164 }
2165
2166 static void cfq_choose_cfqg(struct cfq_data *cfqd)
2167 {
2168         struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
2169
2170         cfqd->serving_group = cfqg;
2171
2172         /* Restore the workload type data */
2173         if (cfqg->saved_workload_slice) {
2174                 cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;
2175                 cfqd->serving_type = cfqg->saved_workload;
2176                 cfqd->serving_prio = cfqg->saved_serving_prio;
2177         } else
2178                 cfqd->workload_expires = jiffies - 1;
2179
2180         choose_service_tree(cfqd, cfqg);
2181 }
2182
2183 /*
2184  * Select a queue for service. If we have a current active queue,
2185  * check whether to continue servicing it, or retrieve and set a new one.
2186  */
2187 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
2188 {
2189         struct cfq_queue *cfqq, *new_cfqq = NULL;
2190
2191         cfqq = cfqd->active_queue;
2192         if (!cfqq)
2193                 goto new_queue;
2194
2195         if (!cfqd->rq_queued)
2196                 return NULL;
2197
2198         /*
2199          * We were waiting for group to get backlogged. Expire the queue
2200          */
2201         if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
2202                 goto expire;
2203
2204         /*
2205          * The active queue has run out of time, expire it and select new.
2206          */
2207         if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
2208                 /*
2209                  * If slice had not expired at the completion of last request
2210                  * we might not have turned on wait_busy flag. Don't expire
2211                  * the queue yet. Allow the group to get backlogged.
2212                  *
2213                  * The very fact that we have used the slice means we
2214                  * have been idling all along on this queue and it should be
2215                  * ok to wait for this request to complete.
2216                  */
2217                 if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
2218                     && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
2219                         cfqq = NULL;
2220                         goto keep_queue;
2221                 } else
2222                         goto expire;
2223         }
2224
2225         /*
2226          * The active queue has requests and isn't expired, allow it to
2227          * dispatch.
2228          */
2229         if (!RB_EMPTY_ROOT(&cfqq->sort_list))
2230                 goto keep_queue;
2231
2232         /*
2233          * If another queue has a request waiting within our mean seek
2234          * distance, let it run.  The expire code will check for close
2235          * cooperators and put the close queue at the front of the service
2236          * tree.  If possible, merge the expiring queue with the new cfqq.
2237          */
2238         new_cfqq = cfq_close_cooperator(cfqd, cfqq);
2239         if (new_cfqq) {
2240                 if (!cfqq->new_cfqq)
2241                         cfq_setup_merge(cfqq, new_cfqq);
2242                 goto expire;
2243         }
2244
2245         /*
2246          * No requests pending. If the active queue still has requests in
2247          * flight or is idling for a new request, allow either of these
2248          * conditions to happen (or time out) before selecting a new queue.
2249          */
2250         if (timer_pending(&cfqd->idle_slice_timer) ||
2251             (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) {
2252                 cfqq = NULL;
2253                 goto keep_queue;
2254         }
2255
2256 expire:
2257         cfq_slice_expired(cfqd, 0);
2258 new_queue:
2259         /*
2260          * Current queue expired. Check if we have to switch to a new
2261          * service tree
2262          */
2263         if (!new_cfqq)
2264                 cfq_choose_cfqg(cfqd);
2265
2266         cfqq = cfq_set_active_queue(cfqd, new_cfqq);
2267 keep_queue:
2268         return cfqq;
2269 }
2270
2271 static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
2272 {
2273         int dispatched = 0;
2274
2275         while (cfqq->next_rq) {
2276                 cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
2277                 dispatched++;
2278         }
2279
2280         BUG_ON(!list_empty(&cfqq->fifo));
2281
2282         /* By default cfqq is not expired if it is empty. Do it explicitly */
2283         __cfq_slice_expired(cfqq->cfqd, cfqq, 0);
2284         return dispatched;
2285 }
2286
2287 /*
2288  * Drain our current requests. Used for barriers and when switching
2289  * io schedulers on-the-fly.
2290  */
2291 static int cfq_forced_dispatch(struct cfq_data *cfqd)
2292 {
2293         struct cfq_queue *cfqq;
2294         int dispatched = 0;
2295
2296         /* Expire the timeslice of the current active queue first */
2297         cfq_slice_expired(cfqd, 0);
2298         while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
2299                 __cfq_set_active_queue(cfqd, cfqq);
2300                 dispatched += __cfq_forced_dispatch_cfqq(cfqq);
2301         }
2302
2303         BUG_ON(cfqd->busy_queues);
2304
2305         cfq_log(cfqd, "forced_dispatch=%d", dispatched);
2306         return dispatched;
2307 }
2308
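/*
 * Estimate whether the requests this queue already has in flight will eat
 * up the rest of its slice, assuming each of them costs roughly
 * cfq_slice_idle worth of time.
 */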
2309 static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
2310         struct cfq_queue *cfqq)
2311 {
2312         /* the queue hasn't finished any request, can't estimate */
2313         if (cfq_cfqq_slice_new(cfqq))
2314                 return true;
2315         if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
2316                 cfqq->slice_end))
2317                 return true;
2318
2319         return false;
2320 }
2321
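/*
 * Decide whether cfqq may dispatch another request right now. This is
 * where per-queue depth throttling happens: sync/async interaction, the
 * cfq_quantum based limits and the gradual async ramp-up are all applied
 * here.
 */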
2322 static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2323 {
2324         unsigned int max_dispatch;
2325
2326         /*
2327          * Drain async requests before we start sync IO
2328          */
2329         if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
2330                 return false;
2331
2332         /*
2333          * If this is an async queue and we have sync IO in flight, let it wait
2334          */
2335         if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
2336                 return false;
2337
2338         max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
2339         if (cfq_class_idle(cfqq))
2340                 max_dispatch = 1;
2341
2342         /*
2343          * Does this cfqq already have too much IO in flight?
2344          */
2345         if (cfqq->dispatched >= max_dispatch) {
2346                 /*
2347                  * idle queue must always only have a single IO in flight
2348                  */
2349                 if (cfq_class_idle(cfqq))
2350                         return false;
2351
2352                 /*
2353                  * We have other queues, don't allow more IO from this one
2354                  */
2355                 if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq))
2356                         return false;
2357
2358                 /*
2359                  * Sole queue user, no limit
2360                  */
2361                 if (cfqd->busy_queues == 1)
2362                         max_dispatch = -1;
2363                 else
2364                         /*
2365                          * Normally we start throttling cfqq when cfq_quantum/2
2366                          * requests have been dispatched. But we can drive
2367                          * deeper queue depths at the beginning of the slice,
2368                          * subject to the upper limit of cfq_quantum.
2369                          */
2370                         max_dispatch = cfqd->cfq_quantum;
2371         }
2372
2373         /*
2374          * Async queues must wait a bit before being allowed dispatch.
2375          * We also ramp up the dispatch depth gradually for async IO,
2376          * based on the last sync IO we serviced
2377          */
2378         if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
2379                 unsigned long last_sync = jiffies - cfqd->last_delayed_sync;
2380                 unsigned int depth;
2381
2382                 depth = last_sync / cfqd->cfq_slice[1];
2383                 if (!depth && !cfqq->dispatched)
2384                         depth = 1;
2385                 if (depth < max_dispatch)
2386                         max_dispatch = depth;
2387         }
2388
2389         /*
2390          * If we're below the current max, allow a dispatch
2391          */
2392         return cfqq->dispatched < max_dispatch;
2393 }
2394
2395 /*
2396  * Dispatch a request from cfqq, moving it to the request queue
2397  * dispatch list.
2398  */
2399 static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2400 {
2401         struct request *rq;
2402
2403         BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
2404
2405         if (!cfq_may_dispatch(cfqd, cfqq))
2406                 return false;
2407
2408         /*
2409          * follow expired path, else get first next available
2410          */
2411         rq = cfq_check_fifo(cfqq);
2412         if (!rq)
2413                 rq = cfqq->next_rq;
2414
2415         /*
2416          * insert request into driver dispatch list
2417          */
2418         cfq_dispatch_insert(cfqd->queue, rq);
2419
2420         if (!cfqd->active_cic) {
2421                 struct cfq_io_context *cic = RQ_CIC(rq);
2422
2423                 atomic_long_inc(&cic->ioc->refcount);
2424                 cfqd->active_cic = cic;
2425         }
2426
2427         return true;
2428 }
2429
2430 /*
2431  * Find the cfqq that we need to service and move a request from that to the
2432  * dispatch list
2433  */
2434 static int cfq_dispatch_requests(struct request_queue *q, int force)
2435 {
2436         struct cfq_data *cfqd = q->elevator->elevator_data;
2437         struct cfq_queue *cfqq;
2438
2439         if (!cfqd->busy_queues)
2440                 return 0;
2441
2442         if (unlikely(force))
2443                 return cfq_forced_dispatch(cfqd);
2444
2445         cfqq = cfq_select_queue(cfqd);
2446         if (!cfqq)
2447                 return 0;
2448
2449         /*
2450          * Dispatch a request from this cfqq, if it is allowed
2451          */
2452         if (!cfq_dispatch_request(cfqd, cfqq))
2453                 return 0;
2454
2455         cfqq->slice_dispatch++;
2456         cfq_clear_cfqq_must_dispatch(cfqq);
2457
2458         /*
2459          * expire an async queue immediately if it has used up its slice. idle
2460          * queues always expire after 1 dispatch round.
2461          */
2462         if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
2463             cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
2464             cfq_class_idle(cfqq))) {
2465                 cfqq->slice_end = jiffies + 1;
2466                 cfq_slice_expired(cfqd, 0);
2467         }
2468
2469         cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
2470         return 1;
2471 }
2472
2473 /*
2474  * A task holds one reference to the queue, dropped when the task exits. Each
2475  * in-flight rq on this queue also holds a reference, dropped when the rq is freed.
2476  *
2477  * Each cfq queue took a reference on the parent group. Drop it now.
2478  * The queue lock must be held here.
2479  */
2480 static void cfq_put_queue(struct cfq_queue *cfqq)
2481 {
2482         struct cfq_data *cfqd = cfqq->cfqd;
2483         struct cfq_group *cfqg, *orig_cfqg;
2484
2485         BUG_ON(atomic_read(&cfqq->ref) <= 0);
2486
2487         if (!atomic_dec_and_test(&cfqq->ref))
2488                 return;
2489
2490         cfq_log_cfqq(cfqd, cfqq, "put_queue");
2491         BUG_ON(rb_first(&cfqq->sort_list));
2492         BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
2493         cfqg = cfqq->cfqg;
2494         orig_cfqg = cfqq->orig_cfqg;
2495
2496         if (unlikely(cfqd->active_queue == cfqq)) {
2497                 __cfq_slice_expired(cfqd, cfqq, 0);
2498                 cfq_schedule_dispatch(cfqd);
2499         }
2500
2501         BUG_ON(cfq_cfqq_on_rr(cfqq));
2502         kmem_cache_free(cfq_pool, cfqq);
2503         cfq_put_cfqg(cfqg);
2504         if (orig_cfqg)
2505                 cfq_put_cfqg(orig_cfqg);
2506 }
2507
2508 /*
2509  * Must always be called with the rcu_read_lock() held
2510  */
2511 static void
2512 __call_for_each_cic(struct io_context *ioc,
2513                     void (*func)(struct io_context *, struct cfq_io_context *))
2514 {
2515         struct cfq_io_context *cic;
2516         struct hlist_node *n;
2517
2518         hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list)
2519                 func(ioc, cic);
2520 }
2521
2522 /*
2523  * Call func for each cic attached to this ioc.
2524  */
2525 static void
2526 call_for_each_cic(struct io_context *ioc,
2527                   void (*func)(struct io_context *, struct cfq_io_context *))
2528 {
2529         rcu_read_lock();
2530         __call_for_each_cic(ioc, func);
2531         rcu_read_unlock();
2532 }
2533
2534 static void cfq_cic_free_rcu(struct rcu_head *head)
2535 {
2536         struct cfq_io_context *cic;
2537
2538         cic = container_of(head, struct cfq_io_context, rcu_head);
2539
2540         kmem_cache_free(cfq_ioc_pool, cic);
2541         elv_ioc_count_dec(cfq_ioc_count);
2542
2543         if (ioc_gone) {
2544                 /*
2545                  * CFQ scheduler is exiting, grab exit lock and check
2546                  * the pending io context count. If it hits zero,
2547                  * complete ioc_gone and set it back to NULL
2548                  */
2549                 spin_lock(&ioc_gone_lock);
2550                 if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) {
2551                         complete(ioc_gone);
2552                         ioc_gone = NULL;
2553                 }
2554                 spin_unlock(&ioc_gone_lock);
2555         }
2556 }
2557
2558 static void cfq_cic_free(struct cfq_io_context *cic)
2559 {
2560         call_rcu(&cic->rcu_head, cfq_cic_free_rcu);
2561 }
2562
2563 static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
2564 {
2565         unsigned long flags;
2566         unsigned long dead_key = (unsigned long) cic->key;
2567
2568         BUG_ON(!(dead_key & CIC_DEAD_KEY));
2569
2570         spin_lock_irqsave(&ioc->lock, flags);
2571         radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT);
2572         hlist_del_rcu(&cic->cic_list);
2573         spin_unlock_irqrestore(&ioc->lock, flags);
2574
2575         cfq_cic_free(cic);
2576 }
2577
2578 /*
2579  * Must be called with rcu_read_lock() held or preemption otherwise disabled.
2580  * Only two callers of this - ->dtor() which is called with the rcu_read_lock(),
2581  * There are only two callers of this: ->dtor(), which is called with the
2582  * rcu_read_lock() held, and ->trim(), which is called with the task lock held.
2583 static void cfq_free_io_context(struct io_context *ioc)
2584 {
2585         /*
2586          * ioc->refcount is zero here, or we are called from elv_unregister(),
2587          * so no more cic's are allowed to be linked into this ioc.  So it
2588          * should be ok to iterate over the known list, we will see all cic's
2589          * since no new ones are added.
2590          */
2591         __call_for_each_cic(ioc, cic_free_func);
2592 }
2593
2594 static void cfq_put_cooperator(struct cfq_queue *cfqq)
2595 {
2596         struct cfq_queue *__cfqq, *next;
2597
2598         /*
2599          * If this queue was scheduled to merge with another queue, be
2600          * sure to drop the reference taken on that queue (and others in
2601          * the merge chain).  See cfq_setup_merge and cfq_merge_cfqqs.
2602          */
2603         __cfqq = cfqq->new_cfqq;
2604         while (__cfqq) {
2605                 if (__cfqq == cfqq) {
2606                         WARN(1, "cfqq->new_cfqq loop detected\n");
2607                         break;
2608                 }
2609                 next = __cfqq->new_cfqq;
2610                 cfq_put_queue(__cfqq);
2611                 __cfqq = next;
2612         }
2613 }
2614
2615 static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2616 {
2617         if (unlikely(cfqq == cfqd->active_queue)) {
2618                 __cfq_slice_expired(cfqd, cfqq, 0);
2619                 cfq_schedule_dispatch(cfqd);
2620         }
2621
2622         cfq_put_cooperator(cfqq);
2623
2624         cfq_put_queue(cfqq);
2625 }
2626
2627 static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
2628                                          struct cfq_io_context *cic)
2629 {
2630         struct io_context *ioc = cic->ioc;
2631
2632         list_del_init(&cic->queue_list);
2633
2634         /*
2635          * Make sure dead mark is seen for dead queues
2636          */
2637         smp_wmb();
2638         cic->key = cfqd_dead_key(cfqd);
2639
2640         if (ioc->ioc_data == cic)
2641                 rcu_assign_pointer(ioc->ioc_data, NULL);
2642
2643         if (cic->cfqq[BLK_RW_ASYNC]) {
2644                 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
2645                 cic->cfqq[BLK_RW_ASYNC] = NULL;
2646         }
2647
2648         if (cic->cfqq[BLK_RW_SYNC]) {
2649                 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);
2650                 cic->cfqq[BLK_RW_SYNC] = NULL;
2651         }
2652 }
2653
2654 static void cfq_exit_single_io_context(struct io_context *ioc,
2655                                        struct cfq_io_context *cic)
2656 {
2657         struct cfq_data *cfqd = cic_to_cfqd(cic);
2658
2659         if (cfqd) {
2660                 struct request_queue *q = cfqd->queue;
2661                 unsigned long flags;
2662
2663                 spin_lock_irqsave(q->queue_lock, flags);
2664
2665                 /*
2666                  * Ensure we get a fresh copy of the ->key to prevent
2667                  * race between exiting task and queue
2668                  */
2669                 smp_read_barrier_depends();
2670                 if (cic->key == cfqd)
2671                         __cfq_exit_single_io_context(cfqd, cic);
2672
2673                 spin_unlock_irqrestore(q->queue_lock, flags);
2674         }
2675 }
2676
2677 /*
2678  * The process that ioc belongs to has exited; we need to clean up
2679  * and put the internal structures we have that belong to that process.
2680  */
2681 static void cfq_exit_io_context(struct io_context *ioc)
2682 {
2683         call_for_each_cic(ioc, cfq_exit_single_io_context);
2684 }
2685
2686 static struct cfq_io_context *
2687 cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
2688 {
2689         struct cfq_io_context *cic;
2690
2691         cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO,
2692                                                         cfqd->queue->node);
2693         if (cic) {
2694                 cic->last_end_request = jiffies;
2695                 INIT_LIST_HEAD(&cic->queue_list);
2696                 INIT_HLIST_NODE(&cic->cic_list);
2697                 cic->dtor = cfq_free_io_context;
2698                 cic->exit = cfq_exit_io_context;
2699                 elv_ioc_count_inc(cfq_ioc_count);
2700         }
2701
2702         return cic;
2703 }
2704
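/*
 * (Re)derive the queue's io priority and class from the task's io_context.
 * Only does any work if the prio_changed flag is set on the queue.
 */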
2705 static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
2706 {
2707         struct task_struct *tsk = current;
2708         int ioprio_class;
2709
2710         if (!cfq_cfqq_prio_changed(cfqq))
2711                 return;
2712
2713         ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
2714         switch (ioprio_class) {
2715         default:
2716                 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
2717         case IOPRIO_CLASS_NONE:
2718                 /*
2719                  * no prio set, inherit CPU scheduling settings
2720                  */
2721                 cfqq->ioprio = task_nice_ioprio(tsk);
2722                 cfqq->ioprio_class = task_nice_ioclass(tsk);
2723                 break;
2724         case IOPRIO_CLASS_RT:
2725                 cfqq->ioprio = task_ioprio(ioc);
2726                 cfqq->ioprio_class = IOPRIO_CLASS_RT;
2727                 break;
2728         case IOPRIO_CLASS_BE:
2729                 cfqq->ioprio = task_ioprio(ioc);
2730                 cfqq->ioprio_class = IOPRIO_CLASS_BE;
2731                 break;
2732         case IOPRIO_CLASS_IDLE:
2733                 cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
2734                 cfqq->ioprio = 7;
2735                 cfq_clear_cfqq_idle_window(cfqq);
2736                 break;
2737         }
2738
2739         /*
2740          * keep track of original prio settings in case we have to temporarily
2741          * elevate the priority of this queue
2742          */
2743         cfqq->org_ioprio = cfqq->ioprio;
2744         cfqq->org_ioprio_class = cfqq->ioprio_class;
2745         cfq_clear_cfqq_prio_changed(cfqq);
2746 }
2747
2748 static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
2749 {
2750         struct cfq_data *cfqd = cic_to_cfqd(cic);
2751         struct cfq_queue *cfqq;
2752         unsigned long flags;
2753
2754         if (unlikely(!cfqd))
2755                 return;
2756
2757         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2758
2759         cfqq = cic->cfqq[BLK_RW_ASYNC];
2760         if (cfqq) {
2761                 struct cfq_queue *new_cfqq;
2762                 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc,
2763                                                 GFP_ATOMIC);
2764                 if (new_cfqq) {
2765                         cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
2766                         cfq_put_queue(cfqq);
2767                 }
2768         }
2769
2770         cfqq = cic->cfqq[BLK_RW_SYNC];
2771         if (cfqq)
2772                 cfq_mark_cfqq_prio_changed(cfqq);
2773
2774         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2775 }
2776
2777 static void cfq_ioc_set_ioprio(struct io_context *ioc)
2778 {
2779         call_for_each_cic(ioc, changed_ioprio);
2780         ioc->ioprio_changed = 0;
2781 }
2782
2783 static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2784                           pid_t pid, bool is_sync)
2785 {
2786         RB_CLEAR_NODE(&cfqq->rb_node);
2787         RB_CLEAR_NODE(&cfqq->p_node);
2788         INIT_LIST_HEAD(&cfqq->fifo);
2789
2790         atomic_set(&cfqq->ref, 0);
2791         cfqq->cfqd = cfqd;
2792
2793         cfq_mark_cfqq_prio_changed(cfqq);
2794
2795         if (is_sync) {
2796                 if (!cfq_class_idle(cfqq))
2797                         cfq_mark_cfqq_idle_window(cfqq);
2798                 cfq_mark_cfqq_sync(cfqq);
2799         }
2800         cfqq->pid = pid;
2801 }
2802
2803 #ifdef CONFIG_CFQ_GROUP_IOSCHED
2804 static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
2805 {
2806         struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
2807         struct cfq_data *cfqd = cic_to_cfqd(cic);
2808         unsigned long flags;
2809         struct request_queue *q;
2810
2811         if (unlikely(!cfqd))
2812                 return;
2813
2814         q = cfqd->queue;
2815
2816         spin_lock_irqsave(q->queue_lock, flags);
2817
2818         if (sync_cfqq) {
2819                 /*
2820                  * Drop reference to sync queue. A new sync queue will be
2821                  * assigned in new group upon arrival of a fresh request.
2822                  */
2823                 cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup");
2824                 cic_set_cfqq(cic, NULL, 1);
2825                 cfq_put_queue(sync_cfqq);
2826         }
2827
2828         spin_unlock_irqrestore(q->queue_lock, flags);
2829 }
2830
2831 static void cfq_ioc_set_cgroup(struct io_context *ioc)
2832 {
2833         call_for_each_cic(ioc, changed_cgroup);
2834         ioc->cgroup_changed = 0;
2835 }
2836 #endif  /* CONFIG_CFQ_GROUP_IOSCHED */
2837
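/*
 * Look up, or allocate, the cfqq for this io context. If an allocation is
 * needed and the gfp mask allows sleeping, the queue lock is dropped
 * around the allocation and the lookup is retried afterwards; if
 * allocation fails we fall back to the shared oom_cfqq.
 */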
2838 static struct cfq_queue *
2839 cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
2840                      struct io_context *ioc, gfp_t gfp_mask)
2841 {
2842         struct cfq_queue *cfqq, *new_cfqq = NULL;
2843         struct cfq_io_context *cic;
2844         struct cfq_group *cfqg;
2845
2846 retry:
2847         cfqg = cfq_get_cfqg(cfqd, 1);
2848         cic = cfq_cic_lookup(cfqd, ioc);
2849         /* cic always exists here */
2850         cfqq = cic_to_cfqq(cic, is_sync);
2851
2852         /*
2853          * Always try a new alloc if we fell back to the OOM cfqq
2854          * originally, since it should just be a temporary situation.
2855          */
2856         if (!cfqq || cfqq == &cfqd->oom_cfqq) {
2857                 cfqq = NULL;
2858                 if (new_cfqq) {
2859                         cfqq = new_cfqq;
2860                         new_cfqq = NULL;
2861                 } else if (gfp_mask & __GFP_WAIT) {
2862                         spin_unlock_irq(cfqd->queue->queue_lock);
2863                         new_cfqq = kmem_cache_alloc_node(cfq_pool,
2864                                         gfp_mask | __GFP_ZERO,
2865                                         cfqd->queue->node);
2866                         spin_lock_irq(cfqd->queue->queue_lock);
2867                         if (new_cfqq)
2868                                 goto retry;
2869                 } else {
2870                         cfqq = kmem_cache_alloc_node(cfq_pool,
2871                                         gfp_mask | __GFP_ZERO,
2872                                         cfqd->queue->node);
2873                 }
2874
2875                 if (cfqq) {
2876                         cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
2877                         cfq_init_prio_data(cfqq, ioc);
2878                         cfq_link_cfqq_cfqg(cfqq, cfqg);
2879                         cfq_log_cfqq(cfqd, cfqq, "alloced");
2880                 } else
2881                         cfqq = &cfqd->oom_cfqq;
2882         }
2883
2884         if (new_cfqq)
2885                 kmem_cache_free(cfq_pool, new_cfqq);
2886
2887         return cfqq;
2888 }
2889
2890 static struct cfq_queue **
2891 cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
2892 {
2893         switch (ioprio_class) {
2894         case IOPRIO_CLASS_RT:
2895                 return &cfqd->async_cfqq[0][ioprio];
2896         case IOPRIO_CLASS_BE:
2897                 return &cfqd->async_cfqq[1][ioprio];
2898         case IOPRIO_CLASS_IDLE:
2899                 return &cfqd->async_idle_cfqq;
2900         default:
2901                 BUG();
2902         }
2903 }
2904
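/*
 * Get the cfqq to queue a request on. Async queues are shared per
 * (class, priority) pair and cached in cfqd; sync queues are per io
 * context. The returned queue carries an extra reference.
 */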
2905 static struct cfq_queue *
2906 cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
2907               gfp_t gfp_mask)
2908 {
2909         const int ioprio = task_ioprio(ioc);
2910         const int ioprio_class = task_ioprio_class(ioc);
2911         struct cfq_queue **async_cfqq = NULL;
2912         struct cfq_queue *cfqq = NULL;
2913
2914         if (!is_sync) {
2915                 async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
2916                 cfqq = *async_cfqq;
2917         }
2918
2919         if (!cfqq)
2920                 cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask);
2921
2922         /*
2923          * pin the queue now that it's allocated, scheduler exit will prune it
2924          */
2925         if (!is_sync && !(*async_cfqq)) {
2926                 atomic_inc(&cfqq->ref);
2927                 *async_cfqq = cfqq;
2928         }
2929
2930         atomic_inc(&cfqq->ref);
2931         return cfqq;
2932 }
2933
2934 /*
2935  * We drop cfq io contexts lazily, so we may find a dead one.
2936  */
2937 static void
2938 cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
2939                   struct cfq_io_context *cic)
2940 {
2941         unsigned long flags;
2942
2943         WARN_ON(!list_empty(&cic->queue_list));
2944         BUG_ON(cic->key != cfqd_dead_key(cfqd));
2945
2946         spin_lock_irqsave(&ioc->lock, flags);
2947
2948         BUG_ON(ioc->ioc_data == cic);
2949
2950         radix_tree_delete(&ioc->radix_root, cfqd->cic_index);
2951         hlist_del_rcu(&cic->cic_list);
2952         spin_unlock_irqrestore(&ioc->lock, flags);
2953
2954         cfq_cic_free(cic);
2955 }
2956
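/*
 * Find the cfq_io_context for this (ioc, cfqd) pair. Try the one-entry
 * ioc->ioc_data cache first, then fall back to the radix tree lookup,
 * dropping any dead cic encountered along the way.
 */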
2957 static struct cfq_io_context *
2958 cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
2959 {
2960         struct cfq_io_context *cic;
2961         unsigned long flags;
2962
2963         if (unlikely(!ioc))
2964                 return NULL;
2965
2966         rcu_read_lock();
2967
2968         /*
2969          * we maintain a last-hit cache, to avoid browsing over the tree
2970          */
2971         cic = rcu_dereference(ioc->ioc_data);
2972         if (cic && cic->key == cfqd) {
2973                 rcu_read_unlock();
2974                 return cic;
2975         }
2976
2977         do {
2978                 cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index);
2979                 rcu_read_unlock();
2980                 if (!cic)
2981                         break;
2982                 if (unlikely(cic->key != cfqd)) {
2983                         cfq_drop_dead_cic(cfqd, ioc, cic);
2984                         rcu_read_lock();
2985                         continue;
2986                 }
2987
2988                 spin_lock_irqsave(&ioc->lock, flags);
2989                 rcu_assign_pointer(ioc->ioc_data, cic);
2990                 spin_unlock_irqrestore(&ioc->lock, flags);
2991                 break;
2992         } while (1);
2993
2994         return cic;
2995 }
2996
2997 /*
2998  * Add cic into ioc, using cfqd as the search key. This enables us to lookup
2999  * the process specific cfq io context when entered from the block layer.
3000  * Also adds the cic to a per-cfqd list, used when this queue is removed.
3001  */
3002 static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
3003                         struct cfq_io_context *cic, gfp_t gfp_mask)
3004 {
3005         unsigned long flags;
3006         int ret;
3007
3008         ret = radix_tree_preload(gfp_mask);
3009         if (!ret) {
3010                 cic->ioc = ioc;
3011                 cic->key = cfqd;
3012
3013                 spin_lock_irqsave(&ioc->lock, flags);
3014                 ret = radix_tree_insert(&ioc->radix_root,
3015                                                 cfqd->cic_index, cic);
3016                 if (!ret)
3017                         hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
3018                 spin_unlock_irqrestore(&ioc->lock, flags);
3019
3020                 radix_tree_preload_end();
3021
3022                 if (!ret) {
3023                         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
3024                         list_add(&cic->queue_list, &cfqd->cic_list);
3025                         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
3026                 }
3027         }
3028
3029         if (ret)
3030                 printk(KERN_ERR "cfq: cic link failed!\n");
3031
3032         return ret;
3033 }
3034
3035 /*
3036  * Setup general io context and cfq io context. There can be several cfq
3037  * io contexts per general io context, if this process is doing io to more
3038  * than one device managed by cfq.
3039  */
3040 static struct cfq_io_context *
3041 cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
3042 {
3043         struct io_context *ioc = NULL;
3044         struct cfq_io_context *cic;
3045
3046         might_sleep_if(gfp_mask & __GFP_WAIT);
3047
3048         ioc = get_io_context(gfp_mask, cfqd->queue->node);
3049         if (!ioc)
3050                 return NULL;
3051
3052         cic = cfq_cic_lookup(cfqd, ioc);
3053         if (cic)
3054                 goto out;
3055
3056         cic = cfq_alloc_io_context(cfqd, gfp_mask);
3057         if (cic == NULL)
3058                 goto err;
3059
3060         if (cfq_cic_link(cfqd, ioc, cic, gfp_mask))
3061                 goto err_free;
3062
3063 out:
3064         smp_read_barrier_depends();
3065         if (unlikely(ioc->ioprio_changed))
3066                 cfq_ioc_set_ioprio(ioc);
3067
3068 #ifdef CONFIG_CFQ_GROUP_IOSCHED
3069         if (unlikely(ioc->cgroup_changed))
3070                 cfq_ioc_set_cgroup(ioc);
3071 #endif
3072         return cic;
3073 err_free:
3074         cfq_cic_free(cic);
3075 err:
3076         put_io_context(ioc);
3077         return NULL;
3078 }
3079
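/*
 * Track the process' "think time" (the gap between the last request
 * completing and this one being issued) as a decaying average: both the
 * sample count and the total are aged by 7/8 on every update.
 */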
3080 static void
3081 cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
3082 {
3083         unsigned long elapsed = jiffies - cic->last_end_request;
3084         unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
3085
3086         cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
3087         cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
3088         cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
3089 }
3090
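/*
 * Update the queue's seek history bitmap: shift in a 1 for a seeky (or,
 * on non-rotational storage, small) request and a 0 otherwise. CFQQ_SEEKY
 * then simply counts the set bits in this window.
 */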
3091 static void
3092 cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3093                        struct request *rq)
3094 {
3095         sector_t sdist = 0;
3096         sector_t n_sec = blk_rq_sectors(rq);
3097         if (cfqq->last_request_pos) {
3098                 if (cfqq->last_request_pos < blk_rq_pos(rq))
3099                         sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
3100                 else
3101                         sdist = cfqq->last_request_pos - blk_rq_pos(rq);
3102         }
3103
3104         cfqq->seek_history <<= 1;
3105         if (blk_queue_nonrot(cfqd->queue))
3106                 cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);
3107         else
3108                 cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
3109 }
3110
3111 /*
3112  * Disable idle window if the process thinks too long or seeks so much that
3113  * it doesn't matter
3114  */
3115 static void
3116 cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3117                        struct cfq_io_context *cic)
3118 {
3119         int old_idle, enable_idle;
3120
3121         /*
3122          * Don't idle for async or idle io prio class
3123          */
3124         if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
3125                 return;
3126
3127         enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
3128
3129         if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3130                 cfq_mark_cfqq_deep(cfqq);
3131
3132         if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
3133             (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3134                 enable_idle = 0;
3135         else if (sample_valid(cic->ttime_samples)) {
3136                 if (cic->ttime_mean > cfqd->cfq_slice_idle)
3137                         enable_idle = 0;
3138                 else
3139                         enable_idle = 1;
3140         }
3141
3142         if (old_idle != enable_idle) {
3143                 cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
3144                 if (enable_idle)
3145                         cfq_mark_cfqq_idle_window(cfqq);
3146                 else
3147                         cfq_clear_cfqq_idle_window(cfqq);
3148         }
3149 }
3150
3151 /*
3152  * Check if new_cfqq should preempt the currently active queue. Return false
3153  * if not (or if we aren't sure); returning true will cause a preempt.
3154  */
3155 static bool
3156 cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3157                    struct request *rq)
3158 {
3159         struct cfq_queue *cfqq;
3160
3161         cfqq = cfqd->active_queue;
3162         if (!cfqq)
3163                 return false;
3164
3165         if (cfq_class_idle(new_cfqq))
3166                 return false;
3167
3168         if (cfq_class_idle(cfqq))
3169                 return true;
3170
3171         /*
3172          * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
3173          */
3174         if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
3175                 return false;
3176
3177         /*
3178          * if the new request is sync, but the currently running queue is
3179          * not, let the sync request have priority.
3180          */
3181         if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
3182                 return true;
3183
3184         if (new_cfqq->cfqg != cfqq->cfqg)
3185                 return false;
3186
3187         if (cfq_slice_used(cfqq))
3188                 return true;
3189
3190         /* Allow preemption only if we are idling on sync-noidle tree */
3191         if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
3192             cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
3193             new_cfqq->service_tree->count == 2 &&
3194             RB_EMPTY_ROOT(&cfqq->sort_list))
3195                 return true;
3196
3197         /*
3198          * So both queues are sync. Let the new request get disk time if
3199          * it's a metadata request and the current queue is doing regular IO.
3200          */
3201         if ((rq->cmd_flags & REQ_META) && !cfqq->meta_pending)
3202                 return true;
3203
3204         /*
3205          * Allow an RT request to preempt an ongoing non-RT cfqq timeslice.
3206          */
3207         if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
3208                 return true;
3209
3210         if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
3211                 return false;
3212
3213         /*
3214          * if this request is as-good as one we would expect from the
3215          * current cfqq, let it preempt
3216          */
3217         if (cfq_rq_close(cfqd, cfqq, rq))
3218                 return true;
3219
3220         return false;
3221 }
3222
3223 /*
3224  * cfqq preempts the active queue. if we allowed preempt with no slice left,
3225  * let it have half of its nominal slice.
3226  */
3227 static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3228 {
3229         cfq_log_cfqq(cfqd, cfqq, "preempt");
3230         cfq_slice_expired(cfqd, 1);
3231
3232         /*
3233          * Put the new queue at the front of the current list,
3234          * so we know that it will be selected next.
3235          */
3236         BUG_ON(!cfq_cfqq_on_rr(cfqq));
3237
3238         cfq_service_tree_add(cfqd, cfqq, 1);
3239
3240         cfqq->slice_end = 0;
3241         cfq_mark_cfqq_slice_new(cfqq);
3242 }
3243
3244 /*
3245  * Called when a new fs request (rq) is added (to cfqq). Check if there's
3246  * something we should do about it
3247  */
3248 static void
3249 cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3250                 struct request *rq)
3251 {
3252         struct cfq_io_context *cic = RQ_CIC(rq);
3253
3254         cfqd->rq_queued++;
3255         if (rq->cmd_flags & REQ_META)
3256                 cfqq->meta_pending++;
3257
3258         cfq_update_io_thinktime(cfqd, cic);
3259         cfq_update_io_seektime(cfqd, cfqq, rq);
3260         cfq_update_idle_window(cfqd, cfqq, cic);
3261
3262         cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
3263
3264         if (cfqq == cfqd->active_queue) {
3265                 /*
3266                  * Remember that we saw a request from this process, but
3267                  * don't start queuing just yet. Otherwise we risk seeing lots
3268                  * of tiny requests, because we disrupt the normal plugging
3269                  * and merging. If the request is already larger than a single
3270                  * page, let it rip immediately. For that case we assume that
3271                  * merging is already done. Ditto for a busy system that
3272                  * has other work pending, don't risk delaying until the
3273                  * idle timer unplug to continue working.
3274                  */
3275                 if (cfq_cfqq_wait_request(cfqq)) {
3276                         if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
3277                             cfqd->busy_queues > 1) {
3278                                 cfq_del_timer(cfqd, cfqq);
3279                                 cfq_clear_cfqq_wait_request(cfqq);
3280                                 __blk_run_queue(cfqd->queue);
3281                         } else {
3282                                 cfq_blkiocg_update_idle_time_stats(
3283                                                 &cfqq->cfqg->blkg);
3284                                 cfq_mark_cfqq_must_dispatch(cfqq);
3285                         }
3286                 }
3287         } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
3288                 /*
3289                  * not the active queue - expire current slice if it is
3290                  * idle and has expired its mean thinktime, or this new queue
3291                  * has some old slice time left and is of higher priority or
3292                  * this new queue is RT and the current one is BE
3293                  */
3294                 cfq_preempt_queue(cfqd, cfqq);
3295                 __blk_run_queue(cfqd->queue);
3296         }
3297 }
3298
3299 static void cfq_insert_request(struct request_queue *q, struct request *rq)
3300 {
3301         struct cfq_data *cfqd = q->elevator->elevator_data;
3302         struct cfq_queue *cfqq = RQ_CFQQ(rq);
3303
3304         cfq_log_cfqq(cfqd, cfqq, "insert_request");
3305         cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);
3306
3307         rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
3308         list_add_tail(&rq->queuelist, &cfqq->fifo);
3309         cfq_add_rq_rb(rq);
3310         cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
3311                         &cfqd->serving_group->blkg, rq_data_dir(rq),
3312                         rq_is_sync(rq));
3313         cfq_rq_enqueued(cfqd, cfqq, rq);
3314 }
3315
3316 /*
3317  * Update hw_tag based on peak queue depth over 50 samples under
3318  * sufficient load.
3319  */
3320 static void cfq_update_hw_tag(struct cfq_data *cfqd)
3321 {
3322         struct cfq_queue *cfqq = cfqd->active_queue;
3323
3324         if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
3325                 cfqd->hw_tag_est_depth = cfqd->rq_in_driver;
3326
3327         if (cfqd->hw_tag == 1)
3328                 return;
3329
3330         if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
3331             cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
3332                 return;
3333
3334         /*
3335          * If the active queue doesn't have enough requests and can idle, cfq
3336          * might not dispatch sufficient requests to the hardware. Don't zero
3337          * hw_tag in this case.
3338          */
3339         if (cfqq && cfq_cfqq_idle_window(cfqq) &&
3340             cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
3341             CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
3342                 return;
3343
3344         if (cfqd->hw_tag_samples++ < 50)
3345                 return;
3346
3347         if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
3348                 cfqd->hw_tag = 1;
3349         else
3350                 cfqd->hw_tag = 0;
3351 }
3352
3353 static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3354 {
3355         struct cfq_io_context *cic = cfqd->active_cic;
3356
3357         /* If there are other queues in the group, don't wait */
3358         if (cfqq->cfqg->nr_cfqq > 1)
3359                 return false;
3360
3361         if (cfq_slice_used(cfqq))
3362                 return true;
3363
3364         /* if slice left is less than think time, wait busy */
3365         if (cic && sample_valid(cic->ttime_samples)
3366             && (cfqq->slice_end - jiffies < cic->ttime_mean))
3367                 return true;
3368
3369         /*
3370          * If the think time is less than a jiffy then ttime_mean=0 and the
3371          * check above will not be true. It might happen that the slice has not
3372          * expired yet but will expire soon (4-5 ns) during select_queue(). To
3373          * cover the case where the think time is less than a jiffy, mark the
3374          * queue wait busy if only 1 jiffy is left in the slice.
3375          */
3376         if (cfqq->slice_end - jiffies == 1)
3377                 return true;
3378
3379         return false;
3380 }
3381
3382 static void cfq_completed_request(struct request_queue *q, struct request *rq)
3383 {
3384         struct cfq_queue *cfqq = RQ_CFQQ(rq);
3385         struct cfq_data *cfqd = cfqq->cfqd;
3386         const int sync = rq_is_sync(rq);
3387         unsigned long now;
3388
3389         now = jiffies;
3390         cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d",
3391                      !!(rq->cmd_flags & REQ_NOIDLE));
3392
3393         cfq_update_hw_tag(cfqd);
3394
3395         WARN_ON(!cfqd->rq_in_driver);
3396         WARN_ON(!cfqq->dispatched);
3397         cfqd->rq_in_driver--;
3398         cfqq->dispatched--;
3399         cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg,
3400                         rq_start_time_ns(rq), rq_io_start_time_ns(rq),
3401                         rq_data_dir(rq), rq_is_sync(rq));
3402
3403         cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
3404
3405         if (sync) {
3406                 RQ_CIC(rq)->last_end_request = now;
3407                 if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
3408                         cfqd->last_delayed_sync = now;
3409         }
3410
3411         /*
3412          * If this is the active queue, check if it needs to be expired,
3413          * or if we want to idle in case it has no pending requests.
3414          */
3415         if (cfqd->active_queue == cfqq) {
3416                 const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
3417
3418                 if (cfq_cfqq_slice_new(cfqq)) {
3419                         cfq_set_prio_slice(cfqd, cfqq);
3420                         cfq_clear_cfqq_slice_new(cfqq);
3421                 }
3422
3423                 /*
3424                  * Should we wait for the next request to come in before we expire
3425                  * the queue?
3426                  */
3427                 if (cfq_should_wait_busy(cfqd, cfqq)) {
3428                         cfqq->slice_end = jiffies + cfqd->cfq_slice_idle;
3429                         cfq_mark_cfqq_wait_busy(cfqq);
3430                         cfq_log_cfqq(cfqd, cfqq, "will busy wait");
3431                 }
3432
3433                 /*
3434                  * Idling is not enabled on:
3435                  * - expired queues
3436                  * - idle-priority queues
3437                  * - async queues
3438                  * - queues with still some requests queued
3439                  * - when there is a close cooperator
3440                  */
3441                 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
3442                         cfq_slice_expired(cfqd, 1);
3443                 else if (sync && cfqq_empty &&
3444                          !cfq_close_cooperator(cfqd, cfqq)) {
3445                         cfqd->noidle_tree_requires_idle |=
3446                                 !(rq->cmd_flags & REQ_NOIDLE);
3447                         /*
3448                          * Idling is enabled for SYNC_WORKLOAD.
3449                          * SYNC_NOIDLE_WORKLOAD idles at the end of the tree
3450                          * only if we processed at least one !REQ_NOIDLE request
3451                          */
3452                         if (cfqd->serving_type == SYNC_WORKLOAD
3453                             || cfqd->noidle_tree_requires_idle
3454                             || cfqq->cfqg->nr_cfqq == 1)
3455                                 cfq_arm_slice_timer(cfqd);
3456                 }
3457         }
3458
3459         if (!cfqd->rq_in_driver)
3460                 cfq_schedule_dispatch(cfqd);
3461 }
3462
3463 /*
3464  * We temporarily boost lower priority queues if they are holding fs exclusive
3465  * resources. They are boosted to normal prio (CLASS_BE/4).
3466  */
3467 static void cfq_prio_boost(struct cfq_queue *cfqq)
3468 {
3469         if (has_fs_excl()) {
3470                 /*
3471                  * boost idle prio on transactions that would lock out other
3472                  * users of the filesystem
3473                  */
3474                 if (cfq_class_idle(cfqq))
3475                         cfqq->ioprio_class = IOPRIO_CLASS_BE;
3476                 if (cfqq->ioprio > IOPRIO_NORM)
3477                         cfqq->ioprio = IOPRIO_NORM;
3478         } else {
3479                 /*
3480                  * unboost the queue (if needed)
3481                  */
3482                 cfqq->ioprio_class = cfqq->org_ioprio_class;
3483                 cfqq->ioprio = cfqq->org_ioprio;
3484         }
3485 }
3486
3487 static inline int __cfq_may_queue(struct cfq_queue *cfqq)
3488 {
3489         if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
3490                 cfq_mark_cfqq_must_alloc_slice(cfqq);
3491                 return ELV_MQUEUE_MUST;
3492         }
3493
3494         return ELV_MQUEUE_MAY;
3495 }
3496
3497 static int cfq_may_queue(struct request_queue *q, int rw)
3498 {
3499         struct cfq_data *cfqd = q->elevator->elevator_data;
3500         struct task_struct *tsk = current;
3501         struct cfq_io_context *cic;
3502         struct cfq_queue *cfqq;
3503
3504         /*
3505          * Don't force setup of a queue from here, as a call to may_queue
3506          * does not necessarily imply that a request actually will be queued.
3507          * So just look up a possibly existing queue, or return 'may queue'
3508          * if that fails.
3509          */
3510         cic = cfq_cic_lookup(cfqd, tsk->io_context);
3511         if (!cic)
3512                 return ELV_MQUEUE_MAY;
3513
3514         cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
3515         if (cfqq) {
3516                 cfq_init_prio_data(cfqq, cic->ioc);
3517                 cfq_prio_boost(cfqq);
3518
3519                 return __cfq_may_queue(cfqq);
3520         }
3521
3522         return ELV_MQUEUE_MAY;
3523 }
3524
3525 /*
3526  * queue lock held here
3527  */
3528 static void cfq_put_request(struct request *rq)
3529 {
3530         struct cfq_queue *cfqq = RQ_CFQQ(rq);
3531
3532         if (cfqq) {
3533                 const int rw = rq_data_dir(rq);
3534
3535                 BUG_ON(!cfqq->allocated[rw]);
3536                 cfqq->allocated[rw]--;
3537
3538                 put_io_context(RQ_CIC(rq)->ioc);
3539
3540                 rq->elevator_private = NULL;
3541                 rq->elevator_private2 = NULL;
3542
3543                 /* Put down rq reference on cfqg */
3544                 cfq_put_cfqg(RQ_CFQG(rq));
3545                 rq->elevator_private3 = NULL;
3546
3547                 cfq_put_queue(cfqq);
3548         }
3549 }
3550
3551 static struct cfq_queue *
3552 cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
3553                 struct cfq_queue *cfqq)
3554 {
3555         cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
3556         cic_set_cfqq(cic, cfqq->new_cfqq, 1);
3557         cfq_mark_cfqq_coop(cfqq->new_cfqq);
3558         cfq_put_queue(cfqq);
3559         return cic_to_cfqq(cic, 1);
3560 }
3561
3562 /*
3563  * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
3564  * was the last process referring to said cfqq.
3565  */
3566 static struct cfq_queue *
3567 split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
3568 {
3569         if (cfqq_process_refs(cfqq) == 1) {
3570                 cfqq->pid = current->pid;
3571                 cfq_clear_cfqq_coop(cfqq);
3572                 cfq_clear_cfqq_split_coop(cfqq);
3573                 return cfqq;
3574         }
3575
3576         cic_set_cfqq(cic, NULL, 1);
3577
3578         cfq_put_cooperator(cfqq);
3579
3580         cfq_put_queue(cfqq);
3581         return NULL;
3582 }
3583 /*
3584  * Allocate cfq data structures associated with this request.
3585  */
3586 static int
3587 cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
3588 {
3589         struct cfq_data *cfqd = q->elevator->elevator_data;
3590         struct cfq_io_context *cic;
3591         const int rw = rq_data_dir(rq);
3592         const bool is_sync = rq_is_sync(rq);
3593         struct cfq_queue *cfqq;
3594         unsigned long flags;
3595
3596         might_sleep_if(gfp_mask & __GFP_WAIT);
3597
3598         cic = cfq_get_io_context(cfqd, gfp_mask);
3599
3600         spin_lock_irqsave(q->queue_lock, flags);
3601
3602         if (!cic)
3603                 goto queue_fail;
3604
3605 new_queue:
3606         cfqq = cic_to_cfqq(cic, is_sync);
3607         if (!cfqq || cfqq == &cfqd->oom_cfqq) {
3608                 cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
3609                 cic_set_cfqq(cic, cfqq, is_sync);
3610         } else {
3611                 /*
3612                  * If the queue was seeky for too long, break it apart.
3613                  */
3614                 if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
3615                         cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
3616                         cfqq = split_cfqq(cic, cfqq);
3617                         if (!cfqq)
3618                                 goto new_queue;
3619                 }
3620
3621                 /*
3622                  * Check to see if this queue is scheduled to merge with
3623                  * another, closely cooperating queue.  The merging of
3624                  * queues happens here as it must be done in process context.
3625                  * The reference on new_cfqq was taken in merge_cfqqs.
3626                  */
3627                 if (cfqq->new_cfqq)
3628                         cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
3629         }
3630
3631         cfqq->allocated[rw]++;
3632         atomic_inc(&cfqq->ref);
3633
3634         spin_unlock_irqrestore(q->queue_lock, flags);
3635
3636         rq->elevator_private = cic;
3637         rq->elevator_private2 = cfqq;
3638         rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
3639         return 0;
3640
3641 queue_fail:
3642         if (cic)
3643                 put_io_context(cic->ioc);
3644
3645         cfq_schedule_dispatch(cfqd);
3646         spin_unlock_irqrestore(q->queue_lock, flags);
3647         cfq_log(cfqd, "set_request fail");
3648         return 1;
3649 }
3650
3651 static void cfq_kick_queue(struct work_struct *work)
3652 {
3653         struct cfq_data *cfqd =
3654                 container_of(work, struct cfq_data, unplug_work);
3655         struct request_queue *q = cfqd->queue;
3656
3657         spin_lock_irq(q->queue_lock);
3658         __blk_run_queue(cfqd->queue);
3659         spin_unlock_irq(q->queue_lock);
3660 }
3661
3662 /*
3663  * Timer running if the active_queue is currently idling inside its time slice
3664  */
3665 static void cfq_idle_slice_timer(unsigned long data)
3666 {
3667         struct cfq_data *cfqd = (struct cfq_data *) data;
3668         struct cfq_queue *cfqq;
3669         unsigned long flags;
3670         int timed_out = 1;
3671
3672         cfq_log(cfqd, "idle timer fired");
3673
3674         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
3675
3676         cfqq = cfqd->active_queue;
3677         if (cfqq) {
3678                 timed_out = 0;
3679
3680                 /*
3681                  * We saw a request before the queue expired, let it through
3682                  */
3683                 if (cfq_cfqq_must_dispatch(cfqq))
3684                         goto out_kick;
3685
3686                 /*
3687                  * expired
3688                  */
3689                 if (cfq_slice_used(cfqq))
3690                         goto expire;
3691
3692                 /*
3693                  * only expire and reinvoke the request handler if there are
3694                  * other queues with pending requests
3695                  */
3696                 if (!cfqd->busy_queues)
3697                         goto out_cont;
3698
3699                 /*
3700                  * not expired and it has a request pending, let it dispatch
3701                  */
3702                 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3703                         goto out_kick;
3704
3705                 /*
3706                  * The queue depth flag is reset only when idling didn't succeed
3707                  */
3708                 cfq_clear_cfqq_deep(cfqq);
3709         }
3710 expire:
3711         cfq_slice_expired(cfqd, timed_out);
3712 out_kick:
3713         cfq_schedule_dispatch(cfqd);
3714 out_cont:
3715         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
3716 }
3717
3718 static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
3719 {
3720         del_timer_sync(&cfqd->idle_slice_timer);
3721         cancel_work_sync(&cfqd->unplug_work);
3722 }
3723
3724 static void cfq_put_async_queues(struct cfq_data *cfqd)
3725 {
3726         int i;
3727
3728         for (i = 0; i < IOPRIO_BE_NR; i++) {
3729                 if (cfqd->async_cfqq[0][i])
3730                         cfq_put_queue(cfqd->async_cfqq[0][i]);
3731                 if (cfqd->async_cfqq[1][i])
3732                         cfq_put_queue(cfqd->async_cfqq[1][i]);
3733         }
3734
3735         if (cfqd->async_idle_cfqq)
3736                 cfq_put_queue(cfqd->async_idle_cfqq);
3737 }
3738
3739 static void cfq_cfqd_free(struct rcu_head *head)
3740 {
3741         kfree(container_of(head, struct cfq_data, rcu));
3742 }
3743
3744 static void cfq_exit_queue(struct elevator_queue *e)
3745 {
3746         struct cfq_data *cfqd = e->elevator_data;
3747         struct request_queue *q = cfqd->queue;
3748
3749         cfq_shutdown_timer_wq(cfqd);
3750
3751         spin_lock_irq(q->queue_lock);
3752
3753         if (cfqd->active_queue)
3754                 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
3755
3756         while (!list_empty(&cfqd->cic_list)) {
3757                 struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
3758                                                         struct cfq_io_context,
3759                                                         queue_list);
3760
3761                 __cfq_exit_single_io_context(cfqd, cic);
3762         }
3763
3764         cfq_put_async_queues(cfqd);
3765         cfq_release_cfq_groups(cfqd);
3766         cfq_blkiocg_del_blkio_group(&cfqd->root_group.blkg);
3767
3768         spin_unlock_irq(q->queue_lock);
3769
3770         cfq_shutdown_timer_wq(cfqd);
3771
3772         spin_lock(&cic_index_lock);
3773         ida_remove(&cic_index_ida, cfqd->cic_index);
3774         spin_unlock(&cic_index_lock);
3775
3776         /* Wait for cfqg->blkg->key accessors to exit their grace periods. */
3777         call_rcu(&cfqd->rcu, cfq_cfqd_free);
3778 }
3779
3780 static int cfq_alloc_cic_index(void)
3781 {
3782         int index, error;
3783
3784         do {
3785                 if (!ida_pre_get(&cic_index_ida, GFP_KERNEL))
3786                         return -ENOMEM;
3787
3788                 spin_lock(&cic_index_lock);
3789                 error = ida_get_new(&cic_index_ida, &index);
3790                 spin_unlock(&cic_index_lock);
3791                 if (error && error != -EAGAIN)
3792                         return error;
3793         } while (error);
3794
3795         return index;
3796 }
3797
3798 static void *cfq_init_queue(struct request_queue *q)
3799 {
3800         struct cfq_data *cfqd;
3801         int i, j;
3802         struct cfq_group *cfqg;
3803         struct cfq_rb_root *st;
3804
3805         i = cfq_alloc_cic_index();
3806         if (i < 0)
3807                 return NULL;
3808
3809         cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
3810         if (!cfqd)
3811                 return NULL;
3812
3813         cfqd->cic_index = i;
3814
3815         /* Init root service tree */
3816         cfqd->grp_service_tree = CFQ_RB_ROOT;
3817
3818         /* Init root group */
3819         cfqg = &cfqd->root_group;
3820         for_each_cfqg_st(cfqg, i, j, st)
3821                 *st = CFQ_RB_ROOT;
3822         RB_CLEAR_NODE(&cfqg->rb_node);
3823
3824         /* Give preference to root group over other groups */
3825         cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
3826
3827 #ifdef CONFIG_CFQ_GROUP_IOSCHED
3828         /*
3829          * Take a reference to the root group which we never drop. This is just
3830          * to make sure that cfq_put_cfqg() does not try to kfree the root group.
3831          */
3832         atomic_set(&cfqg->ref, 1);
3833         rcu_read_lock();
3834         cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg,
3835                                         (void *)cfqd, 0);
3836         rcu_read_unlock();
3837 #endif
3838         /*
3839          * Not strictly needed (since RB_ROOT just clears the node and we
3840          * zeroed cfqd on alloc), but better be safe in case someone decides
3841          * to add magic to the rb code
3842          */
3843         for (i = 0; i < CFQ_PRIO_LISTS; i++)
3844                 cfqd->prio_trees[i] = RB_ROOT;
3845
3846         /*
3847          * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
3848          * Grab a permanent reference to it, so that the normal code flow
3849          * will not attempt to free it.
3850          */
3851         cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
3852         atomic_inc(&cfqd->oom_cfqq.ref);
3853         cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
3854
3855         INIT_LIST_HEAD(&cfqd->cic_list);
3856
3857         cfqd->queue = q;
3858
3859         init_timer(&cfqd->idle_slice_timer);
3860         cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
3861         cfqd->idle_slice_timer.data = (unsigned long) cfqd;
3862
3863         INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
3864
3865         cfqd->cfq_quantum = cfq_quantum;
3866         cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
3867         cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
3868         cfqd->cfq_back_max = cfq_back_max;
3869         cfqd->cfq_back_penalty = cfq_back_penalty;
3870         cfqd->cfq_slice[0] = cfq_slice_async;
3871         cfqd->cfq_slice[1] = cfq_slice_sync;
3872         cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
3873         cfqd->cfq_slice_idle = cfq_slice_idle;
3874         cfqd->cfq_latency = 1;
3875         cfqd->cfq_group_isolation = 0;
3876         cfqd->hw_tag = -1;
3877         /*
3878          * We optimistically start assuming sync ops weren't delayed in the
3879          * last second, in order to have a larger depth for async operations.
3880          */
3881         cfqd->last_delayed_sync = jiffies - HZ;
3882         return cfqd;
3883 }
3884
3885 static void cfq_slab_kill(void)
3886 {
3887         /*
3888          * Caller already ensured that pending RCU callbacks are completed,
3889          * so we should have no busy allocations at this point.
3890          */
3891         if (cfq_pool)
3892                 kmem_cache_destroy(cfq_pool);
3893         if (cfq_ioc_pool)
3894                 kmem_cache_destroy(cfq_ioc_pool);
3895 }
3896
3897 static int __init cfq_slab_setup(void)
3898 {
3899         cfq_pool = KMEM_CACHE(cfq_queue, 0);
3900         if (!cfq_pool)
3901                 goto fail;
3902
3903         cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0);
3904         if (!cfq_ioc_pool)
3905                 goto fail;
3906
3907         return 0;
3908 fail:
3909         cfq_slab_kill();
3910         return -ENOMEM;
3911 }
3912
3913 /*
3914  * sysfs parts below -->
3915  */
3916 static ssize_t
3917 cfq_var_show(unsigned int var, char *page)
3918 {
3919         return sprintf(page, "%d\n", var);
3920 }
3921
3922 static ssize_t
3923 cfq_var_store(unsigned int *var, const char *page, size_t count)
3924 {
3925         char *p = (char *) page;
3926
3927         *var = simple_strtoul(p, &p, 10);
3928         return count;
3929 }
3930
3931 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                            \
3932 static ssize_t __FUNC(struct elevator_queue *e, char *page)             \
3933 {                                                                       \
3934         struct cfq_data *cfqd = e->elevator_data;                       \
3935         unsigned int __data = __VAR;                                    \
3936         if (__CONV)                                                     \
3937                 __data = jiffies_to_msecs(__data);                      \
3938         return cfq_var_show(__data, (page));                            \
3939 }
3940 SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
3941 SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
3942 SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
3943 SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
3944 SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
3945 SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
3946 SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
3947 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
3948 SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
3949 SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
3950 SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0);
3951 #undef SHOW_FUNCTION
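
For reference, the invocations above are purely mechanical macro expansion; for example, SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1) expands to roughly the following (hand-expanded sketch, equivalent to the preprocessor output; since __CONV is 1 the value is converted from jiffies to milliseconds):

static ssize_t cfq_slice_idle_show(struct elevator_queue *e, char *page)
{
	struct cfq_data *cfqd = e->elevator_data;
	unsigned int __data = cfqd->cfq_slice_idle;
	if (1)					/* __CONV */
		__data = jiffies_to_msecs(__data);
	return cfq_var_show(__data, (page));
}

The STORE_FUNCTION definitions below do the inverse: clamp the written value to [MIN, MAX] and, when __CONV is set, convert milliseconds back to jiffies before storing.
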
3952
3953 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                 \
3954 static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
3955 {                                                                       \
3956         struct cfq_data *cfqd = e->elevator_data;                       \
3957         unsigned int __data;                                            \
3958         int ret = cfq_var_store(&__data, (page), count);                \
3959         if (__data < (MIN))                                             \
3960                 __data = (MIN);                                         \
3961         else if (__data > (MAX))                                        \
3962                 __data = (MAX);                                         \
3963         if (__CONV)                                                     \
3964                 *(__PTR) = msecs_to_jiffies(__data);                    \
3965         else                                                            \
3966                 *(__PTR) = __data;                                      \
3967         return ret;                                                     \
3968 }
3969 STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
3970 STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
3971                 UINT_MAX, 1);
3972 STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
3973                 UINT_MAX, 1);
3974 STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
3975 STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
3976                 UINT_MAX, 0);
3977 STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
3978 STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
3979 STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
3980 STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
3981                 UINT_MAX, 0);
3982 STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
3983 STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0);
3984 #undef STORE_FUNCTION
3985
3986 #define CFQ_ATTR(name) \
3987         __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
3988
3989 static struct elv_fs_entry cfq_attrs[] = {
3990         CFQ_ATTR(quantum),
3991         CFQ_ATTR(fifo_expire_sync),
3992         CFQ_ATTR(fifo_expire_async),
3993         CFQ_ATTR(back_seek_max),
3994         CFQ_ATTR(back_seek_penalty),
3995         CFQ_ATTR(slice_sync),
3996         CFQ_ATTR(slice_async),
3997         CFQ_ATTR(slice_async_rq),
3998         CFQ_ATTR(slice_idle),
3999         CFQ_ATTR(low_latency),
4000         CFQ_ATTR(group_isolation),
4001         __ATTR_NULL
4002 };
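
When cfq is the active elevator, the attributes in cfq_attrs appear as files under the queue's iosched directory in sysfs. A minimal user-space sketch that reads slice_idle and then writes 0 to it, which the idle-window code above treats as "never idle"; the device name "sda" is an assumption for illustration:

#include <stdio.h>

int main(void)
{
	/* Elevator tunables live under the block device's queue directory;
	 * "sda" is just an example device. */
	const char *path = "/sys/block/sda/queue/iosched/slice_idle";
	char buf[32];
	FILE *f;

	f = fopen(path, "r");
	if (!f) {
		perror("open slice_idle");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("slice_idle is currently %s", buf);
	fclose(f);

	/* Setting it to 0 disables idling (see cfq_update_idle_window()). */
	f = fopen(path, "w");
	if (!f) {
		perror("reopen slice_idle for writing");
		return 1;
	}
	fputs("0\n", f);
	fclose(f);
	return 0;
}

From a shell this is equivalent to echo 0 > /sys/block/sda/queue/iosched/slice_idle.
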
4003
4004 static struct elevator_type iosched_cfq = {
4005         .ops = {
4006                 .elevator_merge_fn =            cfq_merge,
4007                 .elevator_merged_fn =           cfq_merged_request,
4008                 .elevator_merge_req_fn =        cfq_merged_requests,
4009                 .elevator_allow_merge_fn =      cfq_allow_merge,
4010                 .elevator_bio_merged_fn =       cfq_bio_merged,
4011                 .elevator_dispatch_fn =         cfq_dispatch_requests,
4012                 .elevator_add_req_fn =          cfq_insert_request,
4013                 .elevator_activate_req_fn =     cfq_activate_request,
4014                 .elevator_deactivate_req_fn =   cfq_deactivate_request,
4015                 .elevator_queue_empty_fn =      cfq_queue_empty,
4016                 .elevator_completed_req_fn =    cfq_completed_request,
4017                 .elevator_former_req_fn =       elv_rb_former_request,
4018                 .elevator_latter_req_fn =       elv_rb_latter_request,
4019                 .elevator_set_req_fn =          cfq_set_request,
4020                 .elevator_put_req_fn =          cfq_put_request,
4021                 .elevator_may_queue_fn =        cfq_may_queue,
4022                 .elevator_init_fn =             cfq_init_queue,
4023                 .elevator_exit_fn =             cfq_exit_queue,
4024                 .trim =                         cfq_free_io_context,
4025         },
4026         .elevator_attrs =       cfq_attrs,
4027         .elevator_name =        "cfq",
4028         .elevator_owner =       THIS_MODULE,
4029 };
4030
4031 #ifdef CONFIG_CFQ_GROUP_IOSCHED
4032 static struct blkio_policy_type blkio_policy_cfq = {
4033         .ops = {
4034                 .blkio_unlink_group_fn =        cfq_unlink_blkio_group,
4035                 .blkio_update_group_weight_fn = cfq_update_blkio_group_weight,
4036         },
4037 };
4038 #else
4039 static struct blkio_policy_type blkio_policy_cfq;
4040 #endif
4041
4042 static int __init cfq_init(void)
4043 {
4044         /*
4045          * could be 0 on HZ < 1000 setups
4046          */
4047         if (!cfq_slice_async)
4048                 cfq_slice_async = 1;
4049         if (!cfq_slice_idle)
4050                 cfq_slice_idle = 1;
4051
4052         if (cfq_slab_setup())
4053                 return -ENOMEM;
4054
4055         elv_register(&iosched_cfq);
4056         blkio_policy_register(&blkio_policy_cfq);
4057
4058         return 0;
4059 }
4060
4061 static void __exit cfq_exit(void)
4062 {
4063         DECLARE_COMPLETION_ONSTACK(all_gone);
4064         blkio_policy_unregister(&blkio_policy_cfq);
4065         elv_unregister(&iosched_cfq);
4066         ioc_gone = &all_gone;
4067         /* ioc_gone's update must be visible before reading ioc_count */
4068         smp_wmb();
4069
4070         /*
4071          * this also protects us from entering cfq_slab_kill() with
4072          * pending RCU callbacks
4073          */
4074         if (elv_ioc_count_read(cfq_ioc_count))
4075                 wait_for_completion(&all_gone);
4076         ida_destroy(&cic_index_ida);
4077         cfq_slab_kill();
4078 }
4079
4080 module_init(cfq_init);
4081 module_exit(cfq_exit);
4082
4083 MODULE_AUTHOR("Jens Axboe");
4084 MODULE_LICENSE("GPL");
4085 MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");