block, cfq: fix race condition in cic creation path and tighten locking
[linux-2.6-block.git] / block / cfq-iosched.c
index 16ace89613bc6e4ce343cdd2f1da85218b7dc5e7..181a63d36691ebac9653894024e0cfc486321749 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/rbtree.h>
 #include <linux/ioprio.h>
 #include <linux/blktrace_api.h>
+#include "blk.h"
 #include "cfq.h"
 
 /*
@@ -65,9 +66,6 @@ static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
 static struct completion *ioc_gone;
 static DEFINE_SPINLOCK(ioc_gone_lock);
 
-static DEFINE_SPINLOCK(cic_index_lock);
-static DEFINE_IDA(cic_index_ida);
-
 #define CFQ_PRIO_LISTS         IOPRIO_BE_NR
 #define cfq_class_idle(cfqq)   ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
 #define cfq_class_rt(cfqq)     ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
@@ -290,7 +288,6 @@ struct cfq_data {
        unsigned int cfq_group_idle;
        unsigned int cfq_latency;
 
-       unsigned int cic_index;
        struct list_head cic_list;
 
        /*
@@ -484,7 +481,7 @@ static inline void cic_set_cfqq(struct cfq_io_context *cic,
 
 static inline void *cfqd_dead_key(struct cfq_data *cfqd)
 {
-       return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY);
+       return (void *)(cfqd->queue->id << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY);
 }
 
 static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic)
@@ -2712,21 +2709,26 @@ static void cfq_cic_free(struct cfq_io_context *cic)
        call_rcu(&cic->rcu_head, cfq_cic_free_rcu);
 }
 
-static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
+static void cfq_release_cic(struct cfq_io_context *cic)
 {
-       unsigned long flags;
+       struct io_context *ioc = cic->ioc;
        unsigned long dead_key = (unsigned long) cic->key;
 
        BUG_ON(!(dead_key & CIC_DEAD_KEY));
-
-       spin_lock_irqsave(&ioc->lock, flags);
        radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT);
        hlist_del_rcu(&cic->cic_list);
-       spin_unlock_irqrestore(&ioc->lock, flags);
-
        cfq_cic_free(cic);
 }
 
+static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&ioc->lock, flags);
+       cfq_release_cic(cic);
+       spin_unlock_irqrestore(&ioc->lock, flags);
+}
+
 /*
  * Must be called with rcu_read_lock() held or preemption otherwise disabled.
  * Only two callers of this - ->dtor() which is called with the rcu_read_lock(),
@@ -2776,9 +2778,9 @@ static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
        cfq_put_queue(cfqq);
 }
 
-static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
-                                        struct cfq_io_context *cic)
+static void cfq_exit_cic(struct cfq_io_context *cic)
 {
+       struct cfq_data *cfqd = cic_to_cfqd(cic);
        struct io_context *ioc = cic->ioc;
 
        list_del_init(&cic->queue_list);
@@ -2826,7 +2828,7 @@ static void cfq_exit_single_io_context(struct io_context *ioc,
                 */
                smp_read_barrier_depends();
                if (cic->key == cfqd)
-                       __cfq_exit_single_io_context(cfqd, cic);
+                       cfq_exit_cic(cic);
 
                spin_unlock_irqrestore(q->queue_lock, flags);
        }
@@ -2902,17 +2904,14 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
        cfq_clear_cfqq_prio_changed(cfqq);
 }
 
-static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
+static void changed_ioprio(struct cfq_io_context *cic)
 {
        struct cfq_data *cfqd = cic_to_cfqd(cic);
        struct cfq_queue *cfqq;
-       unsigned long flags;
 
        if (unlikely(!cfqd))
                return;
 
-       spin_lock_irqsave(cfqd->queue->queue_lock, flags);
-
        cfqq = cic->cfqq[BLK_RW_ASYNC];
        if (cfqq) {
                struct cfq_queue *new_cfqq;
@@ -2927,14 +2926,6 @@ static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
        cfqq = cic->cfqq[BLK_RW_SYNC];
        if (cfqq)
                cfq_mark_cfqq_prio_changed(cfqq);
-
-       spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
-}
-
-static void cfq_ioc_set_ioprio(struct io_context *ioc)
-{
-       call_for_each_cic(ioc, changed_ioprio);
-       ioc->ioprio_changed = 0;
 }
 
 static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
@@ -2958,11 +2949,10 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 }
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
+static void changed_cgroup(struct cfq_io_context *cic)
 {
        struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
        struct cfq_data *cfqd = cic_to_cfqd(cic);
-       unsigned long flags;
        struct request_queue *q;
 
        if (unlikely(!cfqd))
@@ -2970,8 +2960,6 @@ static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
 
        q = cfqd->queue;
 
-       spin_lock_irqsave(q->queue_lock, flags);
-
        if (sync_cfqq) {
                /*
                 * Drop reference to sync queue. A new sync queue will be
@@ -2981,14 +2969,6 @@ static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
                cic_set_cfqq(cic, NULL, 1);
                cfq_put_queue(sync_cfqq);
        }
-
-       spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
-static void cfq_ioc_set_cgroup(struct io_context *ioc)
-{
-       call_for_each_cic(ioc, changed_cgroup);
-       ioc->cgroup_changed = 0;
 }
 #endif  /* CONFIG_CFQ_GROUP_IOSCHED */
 
@@ -3105,7 +3085,7 @@ cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
        BUG_ON(rcu_dereference_check(ioc->ioc_data,
                lockdep_is_held(&ioc->lock)) == cic);
 
-       radix_tree_delete(&ioc->radix_root, cfqd->cic_index);
+       radix_tree_delete(&ioc->radix_root, cfqd->queue->id);
        hlist_del_rcu(&cic->cic_list);
        spin_unlock_irqrestore(&ioc->lock, flags);
 
@@ -3133,7 +3113,7 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
        }
 
        do {
-               cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index);
+               cic = radix_tree_lookup(&ioc->radix_root, cfqd->queue->id);
                rcu_read_unlock();
                if (!cic)
                        break;
@@ -3152,87 +3132,118 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
        return cic;
 }
 
-/*
- * Add cic into ioc, using cfqd as the search key. This enables us to lookup
- * the process specific cfq io context when entered from the block layer.
- * Also adds the cic to a per-cfqd list, used when this queue is removed.
+/**
+ * cfq_create_cic - create and link a cfq_io_context
+ * @cfqd: cfqd of interest
+ * @gfp_mask: allocation mask
+ *
+ * Make sure cfq_io_context linking %current->io_context and @cfqd exists.
+ * A missing ioc and/or cic is created using @gfp_mask.  Returns 0 or -errno.
  */
-static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
-                       struct cfq_io_context *cic, gfp_t gfp_mask)
+static int cfq_create_cic(struct cfq_data *cfqd, gfp_t gfp_mask)
 {
-       unsigned long flags;
-       int ret;
+       struct request_queue *q = cfqd->queue;
+       struct cfq_io_context *cic = NULL;
+       struct io_context *ioc;
+       int ret = -ENOMEM;
 
-       ret = radix_tree_preload(gfp_mask);
-       if (!ret) {
-               cic->ioc = ioc;
-               cic->key = cfqd;
+       might_sleep_if(gfp_mask & __GFP_WAIT);
 
-               spin_lock_irqsave(&ioc->lock, flags);
-               ret = radix_tree_insert(&ioc->radix_root,
-                                               cfqd->cic_index, cic);
-               if (!ret)
-                       hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
-               spin_unlock_irqrestore(&ioc->lock, flags);
+       /* allocate stuff */
+       ioc = current_io_context(gfp_mask, q->node);
+       if (!ioc)
+               goto out;
+
+       cic = cfq_alloc_io_context(cfqd, gfp_mask);
+       if (!cic)
+               goto out;
+
+       ret = radix_tree_preload(gfp_mask);
+       if (ret)
+               goto out;
 
-               radix_tree_preload_end();
+       cic->ioc = ioc;
+       cic->key = cfqd;
+       cic->q = cfqd->queue;
 
-               if (!ret) {
-                       spin_lock_irqsave(cfqd->queue->queue_lock, flags);
-                       list_add(&cic->queue_list, &cfqd->cic_list);
-                       spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
-               }
+       /* lock both q and ioc and try to link @cic */
+       spin_lock_irq(q->queue_lock);
+       spin_lock(&ioc->lock);
+
+       ret = radix_tree_insert(&ioc->radix_root, q->id, cic);
+       if (likely(!ret)) {
+               hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
+               list_add(&cic->queue_list, &cfqd->cic_list);
+               cic = NULL;
+       } else if (ret == -EEXIST) {
+               /* someone else already did it */
+               ret = 0;
        }
 
+       spin_unlock(&ioc->lock);
+       spin_unlock_irq(q->queue_lock);
+
+       radix_tree_preload_end();
+out:
        if (ret)
                printk(KERN_ERR "cfq: cic link failed!\n");
-
+       if (cic)
+               cfq_cic_free(cic);
        return ret;
 }
 
-/*
- * Setup general io context and cfq io context. There can be several cfq
- * io contexts per general io context, if this process is doing io to more
- * than one device managed by cfq.
+/**
+ * cfq_get_io_context - acquire cfq_io_context and bump refcnt on io_context
+ * @cfqd: cfqd to setup cic for
+ * @gfp_mask: allocation mask
+ *
+ * Return the cfq_io_context associating @cfqd and %current->io_context and
+ * bump the refcnt on the io_context.  If the ioc or cic doesn't exist, it is
+ * created using @gfp_mask; NULL is returned if that creation fails.
+ *
+ * Must be called with queue_lock held; it may be released and re-acquired.
+ * This function also may sleep depending on @gfp_mask.
  */
 static struct cfq_io_context *
 cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 {
-       struct io_context *ioc = NULL;
-       struct cfq_io_context *cic;
-
-       might_sleep_if(gfp_mask & __GFP_WAIT);
-
-       ioc = get_io_context(gfp_mask, cfqd->queue->node);
-       if (!ioc)
-               return NULL;
-
-       cic = cfq_cic_lookup(cfqd, ioc);
-       if (cic)
-               goto out;
-
-       cic = cfq_alloc_io_context(cfqd, gfp_mask);
-       if (cic == NULL)
-               goto err;
+       struct request_queue *q = cfqd->queue;
+       struct cfq_io_context *cic = NULL;
+       struct io_context *ioc;
+       int err;
+
+       lockdep_assert_held(q->queue_lock);
+
+       while (true) {
+               /* fast path */
+               ioc = current->io_context;
+               if (likely(ioc)) {
+                       cic = cfq_cic_lookup(cfqd, ioc);
+                       if (likely(cic))
+                               break;
+               }
 
-       if (cfq_cic_link(cfqd, ioc, cic, gfp_mask))
-               goto err_free;
+               /* slow path - unlock, create missing ones and retry */
+               spin_unlock_irq(q->queue_lock);
+               err = cfq_create_cic(cfqd, gfp_mask);
+               spin_lock_irq(q->queue_lock);
+               if (err)
+                       return NULL;
+       }
 
-out:
-       smp_read_barrier_depends();
-       if (unlikely(ioc->ioprio_changed))
-               cfq_ioc_set_ioprio(ioc);
+       /* bump @ioc's refcnt and handle changed notifications */
+       get_io_context(ioc);
 
+       if (unlikely(cic->changed)) {
+               if (test_and_clear_bit(CIC_IOPRIO_CHANGED, &cic->changed))
+                       changed_ioprio(cic);
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-       if (unlikely(ioc->cgroup_changed))
-               cfq_ioc_set_cgroup(ioc);
+               if (test_and_clear_bit(CIC_CGROUP_CHANGED, &cic->changed))
+                       changed_cgroup(cic);
 #endif
+       }
+
        return cic;
-err_free:
-       cfq_cic_free(cic);
-err:
-       put_io_context(ioc);
-       return NULL;
 }
 
 static void
@@ -3768,14 +3779,11 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
        const int rw = rq_data_dir(rq);
        const bool is_sync = rq_is_sync(rq);
        struct cfq_queue *cfqq;
-       unsigned long flags;
 
        might_sleep_if(gfp_mask & __GFP_WAIT);
 
+       spin_lock_irq(q->queue_lock);
        cic = cfq_get_io_context(cfqd, gfp_mask);
-
-       spin_lock_irqsave(q->queue_lock, flags);
-
        if (!cic)
                goto queue_fail;
 
@@ -3811,12 +3819,12 @@ new_queue:
        rq->elevator_private[0] = cic;
        rq->elevator_private[1] = cfqq;
        rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg);
-       spin_unlock_irqrestore(q->queue_lock, flags);
+       spin_unlock_irq(q->queue_lock);
        return 0;
 
 queue_fail:
        cfq_schedule_dispatch(cfqd);
-       spin_unlock_irqrestore(q->queue_lock, flags);
+       spin_unlock_irq(q->queue_lock);
        cfq_log(cfqd, "set_request fail");
        return 1;
 }
@@ -3927,7 +3935,7 @@ static void cfq_exit_queue(struct elevator_queue *e)
                                                        struct cfq_io_context,
                                                        queue_list);
 
-               __cfq_exit_single_io_context(cfqd, cic);
+               cfq_exit_cic(cic);
        }
 
        cfq_put_async_queues(cfqd);
@@ -3944,10 +3952,6 @@ static void cfq_exit_queue(struct elevator_queue *e)
 
        cfq_shutdown_timer_wq(cfqd);
 
-       spin_lock(&cic_index_lock);
-       ida_remove(&cic_index_ida, cfqd->cic_index);
-       spin_unlock(&cic_index_lock);
-
        /*
         * Wait for cfqg->blkg->key accessors to exit their grace periods.
         * Do this wait only if there are other unlinked groups out
@@ -3969,24 +3973,6 @@ static void cfq_exit_queue(struct elevator_queue *e)
        kfree(cfqd);
 }
 
-static int cfq_alloc_cic_index(void)
-{
-       int index, error;
-
-       do {
-               if (!ida_pre_get(&cic_index_ida, GFP_KERNEL))
-                       return -ENOMEM;
-
-               spin_lock(&cic_index_lock);
-               error = ida_get_new(&cic_index_ida, &index);
-               spin_unlock(&cic_index_lock);
-               if (error && error != -EAGAIN)
-                       return error;
-       } while (error);
-
-       return index;
-}
-
 static void *cfq_init_queue(struct request_queue *q)
 {
        struct cfq_data *cfqd;
@@ -3994,23 +3980,9 @@ static void *cfq_init_queue(struct request_queue *q)
        struct cfq_group *cfqg;
        struct cfq_rb_root *st;
 
-       i = cfq_alloc_cic_index();
-       if (i < 0)
-               return NULL;
-
        cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
-       if (!cfqd) {
-               spin_lock(&cic_index_lock);
-               ida_remove(&cic_index_ida, i);
-               spin_unlock(&cic_index_lock);
+       if (!cfqd)
                return NULL;
-       }
-
-       /*
-        * Don't need take queue_lock in the routine, since we are
-        * initializing the ioscheduler, and nobody is using cfqd
-        */
-       cfqd->cic_index = i;
 
        /* Init root service tree */
        cfqd->grp_service_tree = CFQ_RB_ROOT;
@@ -4294,7 +4266,6 @@ static void __exit cfq_exit(void)
         */
        if (elv_ioc_count_read(cfq_ioc_count))
                wait_for_completion(&all_gone);
-       ida_destroy(&cic_index_ida);
        cfq_slab_kill();
 }