Merge branch 'for-4.1/core' of git://git.kernel.dk/linux-block

author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 17 Apr 2015 01:49:16 +0000 (21:49 -0400)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 17 Apr 2015 01:49:16 +0000 (21:49 -0400)
diff --combined block/blk-mq.c

index 33c428530193548e38e0ec4ce3608e08be8b1d8d,0b49e42e5310087cfd3dc9b91b6c192707b385ba..c82de08f3721c9f256d9837341c42290daeeef88
--- 1/block/blk-mq.c
--- 2/block/blk-mq.c
+++ b/block/blk-mq.c
@@@ -33,7 -33,6 +33,6 @@@ static DEFINE_MUTEX(all_q_mutex)
   static LIST_HEAD(all_q_list);
   
   static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
- static void blk_mq_run_queues(struct request_queue *q);
   
   /*
    * Check if any of the ctx's have pending work in this hardware queue
@@@ -78,7 -77,7 +77,7 @@@ static void blk_mq_hctx_clear_pending(s
         clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
   }
   
- static int blk_mq_queue_enter(struct request_queue *q)
+ static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp)
   {
         while (true) {
                 int ret;
@@@ -86,6 -85,9 +85,9 @@@
                 if (percpu_ref_tryget_live(&q->mq_usage_counter))
                         return 0;
   
+               if (!(gfp & __GFP_WAIT))
+                       return -EBUSY;
+ 
                 ret = wait_event_interruptible(q->mq_freeze_wq,
                                 !q->mq_freeze_depth || blk_queue_dying(q));
                 if (blk_queue_dying(q))
@@@ -118,7 -120,7 +120,7 @@@ void blk_mq_freeze_queue_start(struct r
   
         if (freeze) {
                 percpu_ref_kill(&q->mq_usage_counter);
-               blk_mq_run_queues(q);
+               blk_mq_run_hw_queues(q, false);
         }
   }
   EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
@@@ -257,7 -259,7 +259,7 @@@ struct request *blk_mq_alloc_request(st
         struct blk_mq_alloc_data alloc_data;
         int ret;
   
-       ret = blk_mq_queue_enter(q);
+       ret = blk_mq_queue_enter(q, gfp);
         if (ret)
                 return ERR_PTR(ret);
   
@@@ -904,7 -906,7 +906,7 @@@ void blk_mq_run_hw_queue(struct blk_mq_
                         &hctx->run_work, 0);
   }
   
- static void blk_mq_run_queues(struct request_queue *q)
+ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
   {
         struct blk_mq_hw_ctx *hctx;
         int i;
@@@ -915,9 -917,10 +917,10 @@@
                     test_bit(BLK_MQ_S_STOPPED, &hctx->state))
                         continue;
   
-               blk_mq_run_hw_queue(hctx, false);
+               blk_mq_run_hw_queue(hctx, async);
         }
   }
+ EXPORT_SYMBOL(blk_mq_run_hw_queues);
   
   void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
   {
@@@ -1186,7 -1189,7 +1189,7 @@@ static struct request *blk_mq_map_reque
         int rw = bio_data_dir(bio);
         struct blk_mq_alloc_data alloc_data;
   
-       if (unlikely(blk_mq_queue_enter(q))) {
+       if (unlikely(blk_mq_queue_enter(q, GFP_KERNEL))) {
                 bio_endio(bio, -EIO);
                 return NULL;
         }
@@@ -1457,7 -1460,7 +1460,7 @@@ static struct blk_mq_tags *blk_mq_init_
   
                 do {
                         page = alloc_pages_node(set->numa_node,
- -                              GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
+ +                              GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
                                 this_order);
                         if (page)
                                 break;
@@@ -1479,6 -1482,8 +1482,6 @@@
                 left -= to_do * rq_size;
                 for (j = 0; j < to_do; j++) {
                         tags->rqs[i] = p;
- -                      tags->rqs[i]->atomic_flags = 0;
- -                      tags->rqs[i]->cmd_flags = 0;
                         if (set->ops->init_request) {
                                 if (set->ops->init_request(set->driver_data,
                                                 tags->rqs[i], hctx_idx, i,
@@@ -1517,8 -1522,6 +1520,6 @@@ static int blk_mq_alloc_bitmap(struct b
         if (!bitmap->map)
                 return -ENOMEM;
   
-       bitmap->map_size = num_maps;
- 
         total = nr_cpu_ids;
         for (i = 0; i < num_maps; i++) {
                 bitmap->map[i].depth = min(total, bitmap->bits_per_word);
@@@ -1759,8 -1762,6 +1760,6 @@@ static void blk_mq_init_cpu_queues(stru
                         continue;
   
                 hctx = q->mq_ops->map_queue(q, i);
-               cpumask_set_cpu(i, hctx->cpumask);
-               hctx->nr_ctx++;
   
                 /*
                  * Set local node, IFF we have more than one hw queue. If
@@@ -1797,6 -1798,8 +1796,8 @@@ static void blk_mq_map_swqueue(struct r
         }
   
         queue_for_each_hw_ctx(q, hctx, i) {
+               struct blk_mq_ctxmap *map = &hctx->ctx_map;
+ 
                 /*
                  * If no software queues are mapped to this hardware queue,
                  * disable it and free the request entries.
@@@ -1812,6 -1815,13 +1813,13 @@@
                         continue;
                 }
   
+               /*
+                * Set the map size to the number of mapped software queues.
+                * This is more accurate and more efficient than looping
+                * over all possibly mapped software queues.
+                */
+               map->map_size = hctx->nr_ctx / map->bits_per_word;
+ 
                 /*
                  * Initialize batch roundrobin counts
                  */
@@@ -1888,10 -1898,26 +1896,26 @@@ void blk_mq_release(struct request_queu
   }
   
   struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
+ {
+       struct request_queue *uninit_q, *q;
+ 
+       uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
+       if (!uninit_q)
+               return ERR_PTR(-ENOMEM);
+ 
+       q = blk_mq_init_allocated_queue(set, uninit_q);
+       if (IS_ERR(q))
+               blk_cleanup_queue(uninit_q);
+ 
+       return q;
+ }
+ EXPORT_SYMBOL(blk_mq_init_queue);
+ 
+ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
+                                                 struct request_queue *q)
   {
         struct blk_mq_hw_ctx **hctxs;
         struct blk_mq_ctx __percpu *ctx;
-       struct request_queue *q;
         unsigned int *map;
         int i;
   
@@@ -1926,20 -1952,16 +1950,16 @@@
                 hctxs[i]->queue_num = i;
         }
   
-       q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
-       if (!q)
-               goto err_hctxs;
- 
         /*
          * Init percpu_ref in atomic mode so that it's faster to shutdown.
          * See blk_register_queue() for details.
          */
         if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release,
                             PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
-               goto err_mq_usage;
+               goto err_hctxs;
   
         setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
-       blk_queue_rq_timeout(q, 30000);
+       blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30000);
   
         q->nr_queues = nr_cpu_ids;
         q->nr_hw_queues = set->nr_hw_queues;
@@@ -1965,9 -1987,6 +1985,6 @@@
         else
                 blk_queue_make_request(q, blk_sq_make_request);
   
-       if (set->timeout)
-               blk_queue_rq_timeout(q, set->timeout);
- 
         /*
          * Do this after blk_queue_make_request() overrides it...
          */
@@@ -1979,7 -1998,7 +1996,7 @@@
         blk_mq_init_cpu_queues(q, set->nr_hw_queues);
   
         if (blk_mq_init_hw_queues(q, set))
-               goto err_mq_usage;
+               goto err_hctxs;
   
         mutex_lock(&all_q_mutex);
         list_add_tail(&q->all_q_node, &all_q_list);
@@@ -1991,8 -2010,6 +2008,6 @@@
   
         return q;
   
- err_mq_usage:
-       blk_cleanup_queue(q);
   err_hctxs:
         kfree(map);
         for (i = 0; i < set->nr_hw_queues; i++) {
@@@ -2007,7 -2024,7 +2022,7 @@@ err_percpu
         free_percpu(ctx);
         return ERR_PTR(-ENOMEM);
   }
- EXPORT_SYMBOL(blk_mq_init_queue);
+ EXPORT_SYMBOL(blk_mq_init_allocated_queue);
   
   void blk_mq_free_queue(struct request_queue *q)
   {
@@@ -2159,7 -2176,7 +2174,7 @@@ int blk_mq_alloc_tag_set(struct blk_mq_
         if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
                 return -EINVAL;
   
-       if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue)
+       if (!set->ops->queue_rq || !set->ops->map_queue)
                 return -EINVAL;
   
         if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
diff --combined fs/aio.c

index 5785c4b58fea5ffdeae17d43b6b00df2572dc638,cabb5edd9bc1c4cffe4482edba2625429a8feb94..fa8b16f47f1a9ee4bdfac0d5ce9192dac74c79e6
--- 1/fs/aio.c
--- 2/fs/aio.c
+++ b/fs/aio.c
@@@ -77,6 -77,11 +77,11 @@@ struct kioctx_cpu 
         unsigned                reqs_available;
   };
   
+ struct ctx_rq_wait {
+       struct completion comp;
+       atomic_t count;
+ };
+ 
   struct kioctx {
         struct percpu_ref       users;
         atomic_t                dead;
@@@ -115,7 -120,7 +120,7 @@@
         /*
          * signals when all in-flight requests are done
          */
-       struct completion *requests_done;
+       struct ctx_rq_wait      *rq_wait;
   
         struct {
                 /*
@@@ -151,38 -156,6 +156,38 @@@
         unsigned                id;
   };
   
+ +/*
+ + * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either
+ + * cancelled or completed (this makes a certain amount of sense because
+ + * successful cancellation - io_cancel() - does deliver the completion to
+ + * userspace).
+ + *
+ + * And since most things don't implement kiocb cancellation and we'd really like
+ + * kiocb completion to be lockless when possible, we use ki_cancel to
+ + * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED
+ + * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel().
+ + */
+ +#define KIOCB_CANCELLED               ((void *) (~0ULL))
+ +
+ +struct aio_kiocb {
+ +      struct kiocb            common;
+ +
+ +      struct kioctx           *ki_ctx;
+ +      kiocb_cancel_fn         *ki_cancel;
+ +
+ +      struct iocb __user      *ki_user_iocb;  /* user's aiocb */
+ +      __u64                   ki_user_data;   /* user's data for completion */
+ +
+ +      struct list_head        ki_list;        /* the aio core uses this
+ +                                               * for cancellation */
+ +
+ +      /*
+ +       * If the aio_resfd field of the userspace iocb is not zero,
+ +       * this is the underlying eventfd context to deliver events to.
+ +       */
+ +      struct eventfd_ctx      *ki_eventfd;
+ +};
+ +
   /*------ sysctl variables----*/
   static DEFINE_SPINLOCK(aio_nr_lock);
   unsigned long aio_nr;         /* current system wide number of aio requests */
@@@ -252,7 -225,7 +257,7 @@@ static int __init aio_setup(void
         if (IS_ERR(aio_mnt))
                 panic("Failed to create aio fs mount.");
   
- -      kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+ +      kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
         kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
   
         pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));
@@@ -310,11 -283,11 +315,11 @@@ static int aio_ring_mmap(struct file *f
         return 0;
   }
   
- -static void aio_ring_remap(struct file *file, struct vm_area_struct *vma)
+ +static int aio_ring_remap(struct file *file, struct vm_area_struct *vma)
   {
         struct mm_struct *mm = vma->vm_mm;
         struct kioctx_table *table;
- -      int i;
+ +      int i, res = -EINVAL;
   
         spin_lock(&mm->ioctx_lock);
         rcu_read_lock();
@@@ -324,17 -297,13 +329,17 @@@
   
                 ctx = table->table[i];
                 if (ctx && ctx->aio_ring_file == file) {
- -                      ctx->user_id = ctx->mmap_base = vma->vm_start;
+ +                      if (!atomic_read(&ctx->dead)) {
+ +                              ctx->user_id = ctx->mmap_base = vma->vm_start;
+ +                              res = 0;
+ +                      }
                         break;
                 }
         }
   
         rcu_read_unlock();
         spin_unlock(&mm->ioctx_lock);
+ +      return res;
   }
   
   static const struct file_operations aio_ring_fops = {
@@@ -516,9 -485,8 +521,9 @@@ static int aio_setup_ring(struct kioct
   #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
   #define AIO_EVENTS_OFFSET     (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
   
- -void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
+ +void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
   {
+ +      struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, common);
         struct kioctx *ctx = req->ki_ctx;
         unsigned long flags;
   
@@@ -533,7 -501,7 +538,7 @@@
   }
   EXPORT_SYMBOL(kiocb_set_cancel_fn);
   
- -static int kiocb_cancel(struct kiocb *kiocb)
+ +static int kiocb_cancel(struct aio_kiocb *kiocb)
   {
         kiocb_cancel_fn *old, *cancel;
   
@@@ -551,7 -519,7 +556,7 @@@
                 cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
         } while (cancel != old);
   
- -      return cancel(kiocb);
+ +      return cancel(&kiocb->common);
   }
   
   static void free_ioctx(struct work_struct *work)
@@@ -572,8 -540,8 +577,8 @@@ static void free_ioctx_reqs(struct perc
         struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
   
         /* At this point we know that there are no any in-flight requests */
-       if (ctx->requests_done)
-               complete(ctx->requests_done);
+       if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
+               complete(&ctx->rq_wait->comp);
   
         INIT_WORK(&ctx->free_work, free_ioctx);
         schedule_work(&ctx->free_work);
@@@ -587,13 -555,13 +592,13 @@@
   static void free_ioctx_users(struct percpu_ref *ref)
   {
         struct kioctx *ctx = container_of(ref, struct kioctx, users);
- -      struct kiocb *req;
+ +      struct aio_kiocb *req;
   
         spin_lock_irq(&ctx->ctx_lock);
   
         while (!list_empty(&ctx->active_reqs)) {
                 req = list_first_entry(&ctx->active_reqs,
- -                                     struct kiocb, ki_list);
+ +                                     struct aio_kiocb, ki_list);
   
                 list_del_init(&req->ki_list);
                 kiocb_cancel(req);
@@@ -692,7 -660,8 +697,7 @@@ static struct kioctx *ioctx_alloc(unsig
         nr_events *= 2;
   
         /* Prevent overflows */
- -      if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||
- -          (nr_events > (0x10000000U / sizeof(struct kiocb)))) {
+ +      if (nr_events > (0x10000000U / sizeof(struct io_event))) {
                 pr_debug("ENOMEM: nr_events too high\n");
                 return ERR_PTR(-EINVAL);
         }
@@@ -763,9 -732,6 +768,9 @@@
   err_cleanup:
         aio_nr_sub(ctx->max_reqs);
   err_ctx:
+ +      atomic_set(&ctx->dead, 1);
+ +      if (ctx->mmap_size)
+ +              vm_munmap(ctx->mmap_base, ctx->mmap_size);
         aio_free_ring(ctx);
   err:
         mutex_unlock(&ctx->ring_lock);
@@@ -783,16 -749,15 +788,16 @@@
    *    the rapid destruction of the kioctx.
    */
   static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
-               struct completion *requests_done)
+                     struct ctx_rq_wait *wait)
   {
         struct kioctx_table *table;
   
- -      if (atomic_xchg(&ctx->dead, 1))
+ +      spin_lock(&mm->ioctx_lock);
+ +      if (atomic_xchg(&ctx->dead, 1)) {
+ +              spin_unlock(&mm->ioctx_lock);
                 return -EINVAL;
+ +      }
   
- -
- -      spin_lock(&mm->ioctx_lock);
         table = rcu_dereference_raw(mm->ioctx_table);
         WARN_ON(ctx != table->table[ctx->id]);
         table->table[ctx->id] = NULL;
@@@ -813,11 -778,27 +818,11 @@@
         if (ctx->mmap_size)
                 vm_munmap(ctx->mmap_base, ctx->mmap_size);
   
-       ctx->requests_done = requests_done;
+       ctx->rq_wait = wait;
         percpu_ref_kill(&ctx->users);
         return 0;
   }
   
- -/* wait_on_sync_kiocb:
- - *    Waits on the given sync kiocb to complete.
- - */
- -ssize_t wait_on_sync_kiocb(struct kiocb *req)
- -{
- -      while (!req->ki_ctx) {
- -              set_current_state(TASK_UNINTERRUPTIBLE);
- -              if (req->ki_ctx)
- -                      break;
- -              io_schedule();
- -      }
- -      __set_current_state(TASK_RUNNING);
- -      return req->ki_user_data;
- -}
- -EXPORT_SYMBOL(wait_on_sync_kiocb);
- -
   /*
    * exit_aio: called when the last user of mm goes away.  At this point, there is
    * no way for any new requests to be submited or any of the io_* syscalls to be
@@@ -829,18 -810,24 +834,24 @@@
   void exit_aio(struct mm_struct *mm)
   {
         struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
-       int i;
+       struct ctx_rq_wait wait;
+       int i, skipped;
   
         if (!table)
                 return;
   
+       atomic_set(&wait.count, table->nr);
+       init_completion(&wait.comp);
+ 
+       skipped = 0;
         for (i = 0; i < table->nr; ++i) {
                 struct kioctx *ctx = table->table[i];
-               struct completion requests_done =
-                       COMPLETION_INITIALIZER_ONSTACK(requests_done);
   
-               if (!ctx)
+               if (!ctx) {
+                       skipped++;
                         continue;
+               }
+ 
                 /*
                  * We don't need to bother with munmap() here - exit_mmap(mm)
                  * is coming and it'll unmap everything. And we simply can't,
@@@ -849,10 -836,12 +860,12 @@@
                  * that it needs to unmap the area, just set it to 0.
                  */
                 ctx->mmap_size = 0;
-               kill_ioctx(mm, ctx, &requests_done);
+               kill_ioctx(mm, ctx, &wait);
+       }
   
+       if (!atomic_sub_and_test(skipped, &wait.count)) {
                 /* Wait until all IO for the context are done. */
-               wait_for_completion(&requests_done);
+               wait_for_completion(&wait.comp);
         }
   
         RCU_INIT_POINTER(mm->ioctx_table, NULL);
@@@ -972,9 -961,9 +985,9 @@@ static void user_refill_reqs_available(
    *    Allocate a slot for an aio request.
    * Returns NULL if no requests are free.
    */
- -static inline struct kiocb *aio_get_req(struct kioctx *ctx)
+ +static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
   {
- -      struct kiocb *req;
+ +      struct aio_kiocb *req;
   
         if (!get_reqs_available(ctx)) {
                 user_refill_reqs_available(ctx);
@@@ -995,10 -984,10 +1008,10 @@@ out_put
         return NULL;
   }
   
- -static void kiocb_free(struct kiocb *req)
+ +static void kiocb_free(struct aio_kiocb *req)
   {
- -      if (req->ki_filp)
- -              fput(req->ki_filp);
+ +      if (req->common.ki_filp)
+ +              fput(req->common.ki_filp);
         if (req->ki_eventfd != NULL)
                 eventfd_ctx_put(req->ki_eventfd);
         kmem_cache_free(kiocb_cachep, req);
@@@ -1034,9 -1023,8 +1047,9 @@@ out
   /* aio_complete
    *    Called when the io request on the given iocb is complete.
    */
- -void aio_complete(struct kiocb *iocb, long res, long res2)
+ +static void aio_complete(struct kiocb *kiocb, long res, long res2)
   {
+ +      struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, common);
         struct kioctx   *ctx = iocb->ki_ctx;
         struct aio_ring *ring;
         struct io_event *ev_page, *event;
@@@ -1050,7 -1038,13 +1063,7 @@@
          *    ref, no other paths have a way to get another ref
          *  - the sync task helpfully left a reference to itself in the iocb
          */
- -      if (is_sync_kiocb(iocb)) {
- -              iocb->ki_user_data = res;
- -              smp_wmb();
- -              iocb->ki_ctx = ERR_PTR(-EXDEV);
- -              wake_up_process(iocb->ki_obj.tsk);
- -              return;
- -      }
+ +      BUG_ON(is_sync_kiocb(kiocb));
   
         if (iocb->ki_list.next) {
                 unsigned long flags;
@@@ -1076,7 -1070,7 +1089,7 @@@
         ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
         event = ev_page + pos % AIO_EVENTS_PER_PAGE;
   
- -      event->obj = (u64)(unsigned long)iocb->ki_obj.user;
+ +      event->obj = (u64)(unsigned long)iocb->ki_user_iocb;
         event->data = iocb->ki_user_data;
         event->res = res;
         event->res2 = res2;
@@@ -1085,7 -1079,7 +1098,7 @@@
         flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
   
         pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n",
- -               ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
+ +               ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data,
                  res, res2);
   
         /* after flagging the request as done, we
@@@ -1132,6 -1126,7 +1145,6 @@@
   
         percpu_ref_put(&ctx->reqs);
   }
- -EXPORT_SYMBOL(aio_complete);
   
   /* aio_read_events_ring
    *    Pull an event off of the ioctx's event ring.  Returns the number of
@@@ -1331,15 -1326,17 +1344,17 @@@ SYSCALL_DEFINE1(io_destroy, aio_context
   {
         struct kioctx *ioctx = lookup_ioctx(ctx);
         if (likely(NULL != ioctx)) {
-               struct completion requests_done =
-                       COMPLETION_INITIALIZER_ONSTACK(requests_done);
+               struct ctx_rq_wait wait;
                 int ret;
   
+               init_completion(&wait.comp);
+               atomic_set(&wait.count, 1);
+ 
                 /* Pass requests_done to kill_ioctx() where it can be set
                  * in a thread-safe way. If we try to set it here then we have
                  * a race condition if two io_destroy() called simultaneously.
                  */
-               ret = kill_ioctx(current->mm, ioctx, &requests_done);
+               ret = kill_ioctx(current->mm, ioctx, &wait);
                 percpu_ref_put(&ioctx->users);
   
                 /* Wait until all IO for the context are done. Otherwise kernel
@@@ -1347,7 -1344,7 +1362,7 @@@
                  * is destroyed.
                  */
                 if (!ret)
-                       wait_for_completion(&requests_done);
+                       wait_for_completion(&wait.comp);
   
                 return ret;
         }
@@@ -1355,21 -1352,50 +1370,21 @@@
         return -EINVAL;
   }
   
- -typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *,
- -                          unsigned long, loff_t);
   typedef ssize_t (rw_iter_op)(struct kiocb *, struct iov_iter *);
   
- -static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb,
- -                                   int rw, char __user *buf,
- -                                   unsigned long *nr_segs,
- -                                   struct iovec **iovec,
- -                                   bool compat)
+ +static int aio_setup_vectored_rw(int rw, char __user *buf, size_t len,
+ +                               struct iovec **iovec,
+ +                               bool compat,
+ +                               struct iov_iter *iter)
   {
- -      ssize_t ret;
- -
- -      *nr_segs = kiocb->ki_nbytes;
- -
   #ifdef CONFIG_COMPAT
         if (compat)
- -              ret = compat_rw_copy_check_uvector(rw,
+ +              return compat_import_iovec(rw,
                                 (struct compat_iovec __user *)buf,
- -                              *nr_segs, UIO_FASTIOV, *iovec, iovec);
- -      else
+ +                              len, UIO_FASTIOV, iovec, iter);
   #endif
- -              ret = rw_copy_check_uvector(rw,
- -                              (struct iovec __user *)buf,
- -                              *nr_segs, UIO_FASTIOV, *iovec, iovec);
- -      if (ret < 0)
- -              return ret;
- -
- -      /* ki_nbytes now reflect bytes instead of segs */
- -      kiocb->ki_nbytes = ret;
- -      return 0;
- -}
- -
- -static ssize_t aio_setup_single_vector(struct kiocb *kiocb,
- -                                     int rw, char __user *buf,
- -                                     unsigned long *nr_segs,
- -                                     struct iovec *iovec)
- -{
- -      if (unlikely(!access_ok(!rw, buf, kiocb->ki_nbytes)))
- -              return -EFAULT;
- -
- -      iovec->iov_base = buf;
- -      iovec->iov_len = kiocb->ki_nbytes;
- -      *nr_segs = 1;
- -      return 0;
+ +      return import_iovec(rw, (struct iovec __user *)buf,
+ +                              len, UIO_FASTIOV, iovec, iter);
   }
   
   /*
@@@ -1377,12 -1403,14 +1392,12 @@@
    *    Performs the initial checks and io submission.
    */
   static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
- -                          char __user *buf, bool compat)
+ +                          char __user *buf, size_t len, bool compat)
   {
         struct file *file = req->ki_filp;
         ssize_t ret;
- -      unsigned long nr_segs;
         int rw;
         fmode_t mode;
- -      aio_rw_op *rw_op;
         rw_iter_op *iter_op;
         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
         struct iov_iter iter;
@@@ -1392,6 -1420,7 +1407,6 @@@
         case IOCB_CMD_PREADV:
                 mode    = FMODE_READ;
                 rw      = READ;
- -              rw_op   = file->f_op->aio_read;
                 iter_op = file->f_op->read_iter;
                 goto rw_common;
   
@@@ -1399,40 -1428,51 +1414,40 @@@
         case IOCB_CMD_PWRITEV:
                 mode    = FMODE_WRITE;
                 rw      = WRITE;
- -              rw_op   = file->f_op->aio_write;
                 iter_op = file->f_op->write_iter;
                 goto rw_common;
   rw_common:
                 if (unlikely(!(file->f_mode & mode)))
                         return -EBADF;
   
- -              if (!rw_op && !iter_op)
+ +              if (!iter_op)
                         return -EINVAL;
   
- -              ret = (opcode == IOCB_CMD_PREADV ||
- -                     opcode == IOCB_CMD_PWRITEV)
- -                      ? aio_setup_vectored_rw(req, rw, buf, &nr_segs,
- -                                              &iovec, compat)
- -                      : aio_setup_single_vector(req, rw, buf, &nr_segs,
- -                                                iovec);
+ +              if (opcode == IOCB_CMD_PREADV || opcode == IOCB_CMD_PWRITEV)
+ +                      ret = aio_setup_vectored_rw(rw, buf, len,
+ +                                              &iovec, compat, &iter);
+ +              else {
+ +                      ret = import_single_range(rw, buf, len, iovec, &iter);
+ +                      iovec = NULL;
+ +              }
                 if (!ret)
- -                      ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
+ +                      ret = rw_verify_area(rw, file, &req->ki_pos,
+ +                                           iov_iter_count(&iter));
                 if (ret < 0) {
- -                      if (iovec != inline_vecs)
- -                              kfree(iovec);
+ +                      kfree(iovec);
                         return ret;
                 }
   
- -              req->ki_nbytes = ret;
- -
- -              /* XXX: move/kill - rw_verify_area()? */
- -              /* This matches the pread()/pwrite() logic */
- -              if (req->ki_pos < 0) {
- -                      ret = -EINVAL;
- -                      break;
- -              }
+ +              len = ret;
   
                 if (rw == WRITE)
                         file_start_write(file);
   
- -              if (iter_op) {
- -                      iov_iter_init(&iter, rw, iovec, nr_segs, req->ki_nbytes);
- -                      ret = iter_op(req, &iter);
- -              } else {
- -                      ret = rw_op(req, iovec, nr_segs, req->ki_pos);
- -              }
+ +              ret = iter_op(req, &iter);
   
                 if (rw == WRITE)
                         file_end_write(file);
+ +              kfree(iovec);
                 break;
   
         case IOCB_CMD_FDSYNC:
@@@ -1454,6 -1494,9 +1469,6 @@@
                 return -EINVAL;
         }
   
- -      if (iovec != inline_vecs)
- -              kfree(iovec);
- -
         if (ret != -EIOCBQUEUED) {
                 /*
                  * There's no easy way to restart the syscall since other AIO's
@@@ -1472,7 -1515,7 +1487,7 @@@
   static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
                          struct iocb *iocb, bool compat)
   {
- -      struct kiocb *req;
+ +      struct aio_kiocb *req;
         ssize_t ret;
   
         /* enforce forwards compatibility on users */
@@@ -1495,14 -1538,11 +1510,14 @@@
         if (unlikely(!req))
                 return -EAGAIN;
   
- -      req->ki_filp = fget(iocb->aio_fildes);
- -      if (unlikely(!req->ki_filp)) {
+ +      req->common.ki_filp = fget(iocb->aio_fildes);
+ +      if (unlikely(!req->common.ki_filp)) {
                 ret = -EBADF;
                 goto out_put_req;
         }
+ +      req->common.ki_pos = iocb->aio_offset;
+ +      req->common.ki_complete = aio_complete;
+ +      req->common.ki_flags = 0;
   
         if (iocb->aio_flags & IOCB_FLAG_RESFD) {
                 /*
@@@ -1517,8 -1557,6 +1532,8 @@@
                         req->ki_eventfd = NULL;
                         goto out_put_req;
                 }
+ +
+ +              req->common.ki_flags |= IOCB_EVENTFD;
         }
   
         ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
@@@ -1527,12 -1565,13 +1542,12 @@@
                 goto out_put_req;
         }
   
- -      req->ki_obj.user = user_iocb;
+ +      req->ki_user_iocb = user_iocb;
         req->ki_user_data = iocb->aio_data;
- -      req->ki_pos = iocb->aio_offset;
- -      req->ki_nbytes = iocb->aio_nbytes;
   
- -      ret = aio_run_iocb(req, iocb->aio_lio_opcode,
+ +      ret = aio_run_iocb(&req->common, iocb->aio_lio_opcode,
                            (char __user *)(unsigned long)iocb->aio_buf,
+ +                         iocb->aio_nbytes,
                            compat);
         if (ret)
                 goto out_put_req;
@@@ -1619,10 -1658,10 +1634,10 @@@ SYSCALL_DEFINE3(io_submit, aio_context_
   /* lookup_kiocb
    *    Finds a given iocb for cancellation.
    */
- -static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
- -                                u32 key)
+ +static struct aio_kiocb *
+ +lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, u32 key)
   {
- -      struct list_head *pos;
+ +      struct aio_kiocb *kiocb;
   
         assert_spin_locked(&ctx->ctx_lock);
   
@@@ -1630,8 -1669,9 +1645,8 @@@
                 return NULL;
   
         /* TODO: use a hash or array, this sucks. */
- -      list_for_each(pos, &ctx->active_reqs) {
- -              struct kiocb *kiocb = list_kiocb(pos);
- -              if (kiocb->ki_obj.user == iocb)
+ +      list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
+ +              if (kiocb->ki_user_iocb == iocb)
                         return kiocb;
         }
         return NULL;
@@@ -1651,7 -1691,7 +1666,7 @@@ SYSCALL_DEFINE3(io_cancel, aio_context_
                 struct io_event __user *, result)
   {
         struct kioctx *ctx;
- -      struct kiocb *kiocb;
+ +      struct aio_kiocb *kiocb;
         u32 key;
         int ret;
author	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 17 Apr 2015 01:49:16 +0000 (21:49 -0400)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 17 Apr 2015 01:49:16 +0000 (21:49 -0400)
		1	2
block/blk-mq.c	patch | diff1 | diff2 | blob | history
fs/aio.c	patch | diff1 | diff2 | blob | history