Merge tag 'for-linus-20190726' of git://git.kernel.dk/linux-block

author Linus Torvalds <torvalds@linux-foundation.org>

Fri, 26 Jul 2019 17:32:12 +0000 (10:32 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 26 Jul 2019 17:32:12 +0000 (10:32 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 26 Jul 2019 17:32:12 +0000 (10:32 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 26 Jul 2019 17:32:12 +0000 (10:32 -0700)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c

index 72860325245a30bb68b069102bb4243885edfe91..586fcfe227eae6ca777f26e45a7605644451643d 100644 (file)
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -3354,38 +3354,57 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
   * there is no active group, then the primary expectation for
   * this device is probably a high throughput.
   *
- * We are now left only with explaining the additional
- * compound condition that is checked below for deciding
- * whether the scenario is asymmetric. To explain this
- * compound condition, we need to add that the function
+ * We are now left only with explaining the two sub-conditions in the
+ * additional compound condition that is checked below for deciding
+ * whether the scenario is asymmetric. To explain the first
+ * sub-condition, we need to add that the function
   * bfq_asymmetric_scenario checks the weights of only
- * non-weight-raised queues, for efficiency reasons (see
- * comments on bfq_weights_tree_add()). Then the fact that
- * bfqq is weight-raised is checked explicitly here. More
- * precisely, the compound condition below takes into account
- * also the fact that, even if bfqq is being weight-raised,
- * the scenario is still symmetric if all queues with requests
- * waiting for completion happen to be
- * weight-raised. Actually, we should be even more precise
- * here, and differentiate between interactive weight raising
- * and soft real-time weight raising.
+ * non-weight-raised queues, for efficiency reasons (see comments on
+ * bfq_weights_tree_add()). Then the fact that bfqq is weight-raised
+ * is checked explicitly here. More precisely, the compound condition
+ * below takes into account also the fact that, even if bfqq is being
+ * weight-raised, the scenario is still symmetric if all queues with
+ * requests waiting for completion happen to be
+ * weight-raised. Actually, we should be even more precise here, and
+ * differentiate between interactive weight raising and soft real-time
+ * weight raising.
+ *
+ * The second sub-condition checked in the compound condition is
+ * whether there is a fair amount of already in-flight I/O not
+ * belonging to bfqq. If so, I/O dispatching is to be plugged, for the
+ * following reason. The drive may decide to serve in-flight
+ * non-bfqq's I/O requests before bfqq's ones, thereby delaying the
+ * arrival of new I/O requests for bfqq (recall that bfqq is sync). If
+ * I/O-dispatching is not plugged, then, while bfqq remains empty, a
+ * basically uncontrolled amount of I/O from other queues may be
+ * dispatched too, possibly causing the service of bfqq's I/O to be
+ * delayed even longer in the drive. This problem gets more and more
+ * serious as the speed and the queue depth of the drive grow,
+ * because, as these two quantities grow, the probability to find no
+ * queue busy but many requests in flight grows too. By contrast,
+ * plugging I/O dispatching minimizes the delay induced by already
+ * in-flight I/O, and enables bfqq to recover the bandwidth it may
+ * lose because of this delay.
   *
   * As a side note, it is worth considering that the above
- * device-idling countermeasures may however fail in the
- * following unlucky scenario: if idling is (correctly)
- * disabled in a time period during which all symmetry
- * sub-conditions hold, and hence the device is allowed to
- * enqueue many requests, but at some later point in time some
- * sub-condition stops to hold, then it may become impossible
- * to let requests be served in the desired order until all
- * the requests already queued in the device have been served.
+ * device-idling countermeasures may however fail in the following
+ * unlucky scenario: if I/O-dispatch plugging is (correctly) disabled
+ * in a time period during which all symmetry sub-conditions hold, and
+ * therefore the device is allowed to enqueue many requests, but at
+ * some later point in time some sub-condition ceases to hold, then it
+ * may become impossible to make requests be served in the desired
+ * order until all the requests already queued in the device have been
+ * served. The last sub-condition commented above somewhat mitigates
+ * this problem for weight-raised queues.
   */
  static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
                                                  struct bfq_queue *bfqq)
  {
         return (bfqq->wr_coeff > 1 &&
-               bfqd->wr_busy_queues <
-               bfq_tot_busy_queues(bfqd)) ||
+               (bfqd->wr_busy_queues <
+                bfq_tot_busy_queues(bfqd) ||
+                bfqd->rq_in_driver >=
+                bfqq->dispatched + 4)) ||
                 bfq_asymmetric_scenario(bfqd, bfqq);
  }
  
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c

index 24ed26957367e2cbba91590040c9454612edad98..55a7dc227dfbd6af883a2832d09c8abf2c16d3d0 100644 (file)
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -54,7 +54,7 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
  
  static LIST_HEAD(all_blkcgs);          /* protected by blkcg_pol_mutex */
  
-static bool blkcg_debug_stats = false;
+bool blkcg_debug_stats = false;
  static struct workqueue_struct *blkcg_punt_bio_wq;
  
  static bool blkcg_policy_enabled(struct request_queue *q,
@@ -944,10 +944,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
                                          dbytes, dios);
                 }
  
-               if (!blkcg_debug_stats)
-                       goto next;
-
-               if (atomic_read(&blkg->use_delay)) {
+               if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) {
                         has_stats = true;
                         off += scnprintf(buf+off, size-off,
                                          " use_delay=%d delay_nsec=%llu",
@@ -967,7 +964,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
                                 has_stats = true;
                         off += written;
                 }
-next:
+
                 if (has_stats) {
                         if (off < size - 1) {
                                 off += scnprintf(buf+off, size-off, "\n");
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c

index d973c38ee4fd65b502af755e491e1a4bb830c0c3..0fff7b56df0e4d0fa0f82be4102f3ceb03735034 100644 (file)
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -917,6 +917,9 @@ static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
         unsigned long long avg_lat;
         unsigned long long cur_win;
  
+       if (!blkcg_debug_stats)
+               return 0;
+
         if (iolat->ssd)
                 return iolatency_ssd_stat(iolat, buf, size);
  
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h

index cf22ab00fefb68cdc9efda7d9dcad8329307a262..126021fc3a11f9308a5fe0745ba2a6f9f1ee2ee5 100644 (file)
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -61,15 +61,6 @@ static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
                 e->type->ops.completed_request(rq, now);
  }
  
-static inline void blk_mq_sched_started_request(struct request *rq)
-{
-       struct request_queue *q = rq->q;
-       struct elevator_queue *e = q->elevator;
-
-       if (e && e->type->ops.started_request)
-               e->type->ops.started_request(rq);
-}
-
  static inline void blk_mq_sched_requeue_request(struct request *rq)
  {
         struct request_queue *q = rq->q;
diff --git a/block/blk-mq.c b/block/blk-mq.c

index b038ec680e843e14aa2f3a7b3d967bd802efd4db..f78d3287dd823f6b431c87eec675580c1e987d55 100644 (file)
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -669,8 +669,6 @@ void blk_mq_start_request(struct request *rq)
  {
         struct request_queue *q = rq->q;
  
-       blk_mq_sched_started_request(rq);
-
         trace_block_rq_issue(q, rq);
  
         if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
@@ -1960,9 +1958,13 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
         rq = blk_mq_get_request(q, bio, &data);
         if (unlikely(!rq)) {
                 rq_qos_cleanup(q, bio);
-               if (bio->bi_opf & REQ_NOWAIT)
+
+               cookie = BLK_QC_T_NONE;
+               if (bio->bi_opf & REQ_NOWAIT_INLINE)
+                       cookie = BLK_QC_T_EAGAIN;
+               else if (bio->bi_opf & REQ_NOWAIT)
                         bio_wouldblock_error(bio);
-               return BLK_QC_T_NONE;
+               return cookie;
         }
  
         trace_block_getrq(q, bio, bio->bi_opf);
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c

index 659ccb8b693fa63cedfc53536589f5afcf3b1afc..3954c0dc14433d21c4baf7f9b007c2710fa67d64 100644 (file)
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -202,6 +202,7 @@ static int rq_qos_wake_function(struct wait_queue_entry *curr,
                 return -1;
  
         data->got_token = true;
+       smp_wmb();
         list_del_init(&curr->entry);
         wake_up_process(data->task);
         return 1;
@@ -244,7 +245,9 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
                 return;
  
         prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE);
+       has_sleeper = !wq_has_single_sleeper(&rqw->wait);
         do {
+               /* The memory barrier in set_current_state saves us here. */
                 if (data.got_token)
                         break;
                 if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) {
@@ -255,12 +258,14 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
                          * which means we now have two. Put our local token
                          * and wake anyone else potentially waiting for one.
                          */
+                       smp_rmb();
                         if (data.got_token)
                                 cleanup_cb(rqw, private_data);
                         break;
                 }
                 io_schedule();
-               has_sleeper = false;
+               has_sleeper = true;
+               set_current_state(TASK_UNINTERRUPTIBLE);
         } while (1);
         finish_wait(&rqw->wait, &data.wq);
  }
diff --git a/block/genhd.c b/block/genhd.c

index 97887e59f3b2a34dfde5272bb0d3226e63bad15a..54f1f0d381f4d28c7b4d07d4044e4f0f7d2ddb05 100644 (file)
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1969,7 +1969,7 @@ static const struct attribute *disk_events_attrs[] = {
   * The default polling interval can be specified by the kernel
   * parameter block.events_dfl_poll_msecs which defaults to 0
   * (disable).  This can also be modified runtime by writing to
- * /sys/module/block/events_dfl_poll_msecs.
+ * /sys/module/block/parameters/events_dfl_poll_msecs.
   */
  static int disk_events_set_dfl_poll_msecs(const char *val,
                                           const struct kernel_param *kp)
diff --git a/drivers/ata/libahci_platform.c b/drivers/ata/libahci_platform.c

index 72312ad2e142d41202d9378889a1cd015795c745..3a36e76eca831db26715248b7665a674801aed28 100644 (file)
--- a/drivers/ata/libahci_platform.c
+++ b/drivers/ata/libahci_platform.c
@@ -408,7 +408,6 @@ struct ahci_host_priv *ahci_platform_get_resources(struct platform_device *pdev,
         hpriv->mmio = devm_ioremap_resource(dev,
                               platform_get_resource(pdev, IORESOURCE_MEM, 0));
         if (IS_ERR(hpriv->mmio)) {
-               dev_err(dev, "no mmio space\n");
                 rc = PTR_ERR(hpriv->mmio);
                 goto err_out;
         }
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c

index 90ebfcae0ce6e70dafed014f489a4e20e0d8876b..2b3103c308573a5398237bd90d78c6957108ce08 100644 (file)
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -5417,7 +5417,7 @@ static int drbd_do_auth(struct drbd_connection *connection)
         unsigned int key_len;
         char secret[SHARED_SECRET_MAX]; /* 64 byte */
         unsigned int resp_size;
-       SHASH_DESC_ON_STACK(desc, connection->cram_hmac_tfm);
+       struct shash_desc *desc;
         struct packet_info pi;
         struct net_conf *nc;
         int err, rv;
@@ -5430,6 +5430,13 @@ static int drbd_do_auth(struct drbd_connection *connection)
         memcpy(secret, nc->shared_secret, key_len);
         rcu_read_unlock();
  
+       desc = kmalloc(sizeof(struct shash_desc) +
+                      crypto_shash_descsize(connection->cram_hmac_tfm),
+                      GFP_KERNEL);
+       if (!desc) {
+               rv = -1;
+               goto fail;
+       }
         desc->tfm = connection->cram_hmac_tfm;
  
         rv = crypto_shash_setkey(connection->cram_hmac_tfm, (u8 *)secret, key_len);
@@ -5571,7 +5578,10 @@ static int drbd_do_auth(struct drbd_connection *connection)
         kfree(peers_ch);
         kfree(response);
         kfree(right_response);
-       shash_desc_zero(desc);
+       if (desc) {
+               shash_desc_zero(desc);
+               kfree(desc);
+       }
  
         return rv;
  }
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c

index 26e374fbf57c7873d626e3a84b052a7a8c9d2353..20ed838e9413bf532e47ee6e70f60c56b3499010 100644 (file)
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -931,6 +931,9 @@ int bch_cached_dev_run(struct cached_dev *dc)
         if (dc->io_disable) {
                 pr_err("I/O disabled on cached dev %s",
                        dc->backing_dev_name);
+               kfree(env[1]);
+               kfree(env[2]);
+               kfree(buf);
                 return -EIO;
         }
  
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c

index cc09b81fc7f434ccb46ee2afed2def35bb4ccbe5..8f3fbe5ca9379988b16c88c6e25cb45869f6a614 100644 (file)
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2311,17 +2311,15 @@ static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ct
         memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off);
  }
  
-static void __nvme_release_subsystem(struct nvme_subsystem *subsys)
+static void nvme_release_subsystem(struct device *dev)
  {
+       struct nvme_subsystem *subsys =
+               container_of(dev, struct nvme_subsystem, dev);
+
         ida_simple_remove(&nvme_subsystems_ida, subsys->instance);
         kfree(subsys);
  }
  
-static void nvme_release_subsystem(struct device *dev)
-{
-       __nvme_release_subsystem(container_of(dev, struct nvme_subsystem, dev));
-}
-
  static void nvme_destroy_subsystem(struct kref *ref)
  {
         struct nvme_subsystem *subsys =
@@ -2477,7 +2475,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
         mutex_lock(&nvme_subsystems_lock);
         found = __nvme_find_get_subsystem(subsys->subnqn);
         if (found) {
-               __nvme_release_subsystem(subsys);
+               put_device(&subsys->dev);
                 subsys = found;
  
                 if (!nvme_validate_cntlid(subsys, ctrl, id)) {
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c

index a9a9276779708e2beca5497131cbfc547518e58e..4f0d0d12744e0e94dbc1e518ca0c1935648c1bac 100644 (file)
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -12,11 +12,6 @@ module_param(multipath, bool, 0444);
  MODULE_PARM_DESC(multipath,
         "turn on native support for multiple controllers per subsystem");
  
-inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
-{
-       return multipath && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
-}
-
  /*
   * If multipathing is enabled we need to always use the subsystem instance
   * number for numbering our devices to avoid conflicts between subsystems that
@@ -622,7 +617,8 @@ int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
  {
         int error;
  
-       if (!nvme_ctrl_use_ana(ctrl))
+       /* check if multipath is enabled and we have the capability */
+       if (!multipath || !ctrl->subsys || !(ctrl->subsys->cmic & (1 << 3)))
                 return 0;
  
         ctrl->anacap = id->anacap;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h

index 716a876119c83a911bcc323c84edfc059bf3b7eb..26b563f9985b5a9bf4163a18646e9fee68a2ae17 100644 (file)
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -485,7 +485,11 @@ extern const struct attribute_group *nvme_ns_id_attr_groups[];
  extern const struct block_device_operations nvme_ns_head_ops;
  
  #ifdef CONFIG_NVME_MULTIPATH
-bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl);
+static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
+{
+       return ctrl->ana_log_buf != NULL;
+}
+
  void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
                         struct nvme_ctrl *ctrl, int *flags);
  void nvme_failover_req(struct request *req);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c

index bb970ca82517b1213cb4922225822edb78e23b4d..db160cee42ad178e02b11cc15bb0229a9417f118 100644 (file)
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2254,9 +2254,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
         if (!dev->ctrl.tagset) {
                 dev->tagset.ops = &nvme_mq_ops;
                 dev->tagset.nr_hw_queues = dev->online_queues - 1;
-               dev->tagset.nr_maps = 1; /* default */
-               if (dev->io_queues[HCTX_TYPE_READ])
-                       dev->tagset.nr_maps++;
+               dev->tagset.nr_maps = 2; /* default + read */
                 if (dev->io_queues[HCTX_TYPE_POLL])
                         dev->tagset.nr_maps++;
                 dev->tagset.timeout = NVME_IO_TIMEOUT;
@@ -3029,6 +3027,8 @@ static const struct pci_device_id nvme_id_table[] = {
                 .driver_data = NVME_QUIRK_LIGHTNVM, },
         { PCI_DEVICE(0x1d1d, 0x2601),   /* CNEX Granby */
                 .driver_data = NVME_QUIRK_LIGHTNVM, },
+       { PCI_DEVICE(0x10ec, 0x5762),   /* ADATA SX6000LNP */
+               .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
         { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
         { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
         { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
diff --git a/fs/block_dev.c b/fs/block_dev.c

index 4707dfff991b55ddc9399e4f28797d81feae5e18..c2a85b587922d9eacf8c539d00ad2b7efff26a3f 100644 (file)
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -345,15 +345,24 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
         struct bio *bio;
         bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
         bool is_read = (iov_iter_rw(iter) == READ), is_sync;
+       bool nowait = (iocb->ki_flags & IOCB_NOWAIT) != 0;
         loff_t pos = iocb->ki_pos;
         blk_qc_t qc = BLK_QC_T_NONE;
-       int ret = 0;
+       gfp_t gfp;
+       ssize_t ret;
  
         if ((pos | iov_iter_alignment(iter)) &
             (bdev_logical_block_size(bdev) - 1))
                 return -EINVAL;
  
-       bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
+       if (nowait)
+               gfp = GFP_NOWAIT;
+       else
+               gfp = GFP_KERNEL;
+
+       bio = bio_alloc_bioset(gfp, nr_pages, &blkdev_dio_pool);
+       if (!bio)
+               return -EAGAIN;
  
         dio = container_of(bio, struct blkdev_dio, bio);
         dio->is_sync = is_sync = is_sync_kiocb(iocb);
@@ -375,7 +384,10 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
         if (!is_poll)
                 blk_start_plug(&plug);
  
+       ret = 0;
         for (;;) {
+               int err;
+
                 bio_set_dev(bio, bdev);
                 bio->bi_iter.bi_sector = pos >> 9;
                 bio->bi_write_hint = iocb->ki_hint;
@@ -383,8 +395,10 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
                 bio->bi_end_io = blkdev_bio_end_io;
                 bio->bi_ioprio = iocb->ki_ioprio;
  
-               ret = bio_iov_iter_get_pages(bio, iter);
-               if (unlikely(ret)) {
+               err = bio_iov_iter_get_pages(bio, iter);
+               if (unlikely(err)) {
+                       if (!ret)
+                               ret = err;
                         bio->bi_status = BLK_STS_IOERR;
                         bio_endio(bio);
                         break;
@@ -399,6 +413,14 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
                         task_io_account_write(bio->bi_iter.bi_size);
                 }
  
+               /*
+                * Tell underlying layer to not block for resource shortage.
+                * And if we would have blocked, return error inline instead
+                * of through the bio->bi_end_io() callback.
+                */
+               if (nowait)
+                       bio->bi_opf |= (REQ_NOWAIT | REQ_NOWAIT_INLINE);
+
                 dio->size += bio->bi_iter.bi_size;
                 pos += bio->bi_iter.bi_size;
  
@@ -412,6 +434,11 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
                         }
  
                         qc = submit_bio(bio);
+                       if (qc == BLK_QC_T_EAGAIN) {
+                               if (!ret)
+                                       ret = -EAGAIN;
+                               goto error;
+                       }
  
                         if (polled)
                                 WRITE_ONCE(iocb->ki_cookie, qc);
@@ -432,8 +459,20 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
                         atomic_inc(&dio->ref);
                 }
  
-               submit_bio(bio);
-               bio = bio_alloc(GFP_KERNEL, nr_pages);
+               qc = submit_bio(bio);
+               if (qc == BLK_QC_T_EAGAIN) {
+                       if (!ret)
+                               ret = -EAGAIN;
+                       goto error;
+               }
+               ret += bio->bi_iter.bi_size;
+
+               bio = bio_alloc(gfp, nr_pages);
+               if (!bio) {
+                       if (!ret)
+                               ret = -EAGAIN;
+                       goto error;
+               }
         }
  
         if (!is_poll)
@@ -453,13 +492,16 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
         }
         __set_current_state(TASK_RUNNING);
  
+out:
         if (!ret)
                 ret = blk_status_to_errno(dio->bio.bi_status);
-       if (likely(!ret))
-               ret = dio->size;
  
         bio_put(&dio->bio);
         return ret;
+error:
+       if (!is_poll)
+               blk_finish_plug(&plug);
+       goto out;
  }
  
  static ssize_t
diff --git a/fs/io_uring.c b/fs/io_uring.c

index e2a66e12fbc634f1eb913ef80f504dbd470edd69..012bc0efb9d3cba3241c5fe71883106f5616b092 100644 (file)
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -202,7 +202,7 @@ struct async_list {
  
         struct file             *file;
         off_t                   io_end;
-       size_t                  io_pages;
+       size_t                  io_len;
  };
  
  struct io_ring_ctx {
@@ -333,7 +333,8 @@ struct io_kiocb {
  #define REQ_F_IO_DRAIN         16      /* drain existing IO first */
  #define REQ_F_IO_DRAINED       32      /* drain done */
  #define REQ_F_LINK             64      /* linked sqes */
-#define REQ_F_FAIL_LINK                128     /* fail rest of links */
+#define REQ_F_LINK_DONE                128     /* linked sqes done */
+#define REQ_F_FAIL_LINK                256     /* fail rest of links */
         u64                     user_data;
         u32                     result;
         u32                     sequence;
@@ -429,7 +430,7 @@ static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
         if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
                 return false;
  
-       return req->sequence > ctx->cached_cq_tail + ctx->sq_ring->dropped;
+       return req->sequence != ctx->cached_cq_tail + ctx->sq_ring->dropped;
  }
  
  static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
@@ -632,6 +633,7 @@ static void io_req_link_next(struct io_kiocb *req)
                         nxt->flags |= REQ_F_LINK;
                 }
  
+               nxt->flags |= REQ_F_LINK_DONE;
                 INIT_WORK(&nxt->work, io_sq_wq_submit_work);
                 queue_work(req->ctx->sqo_wq, &nxt->work);
         }
@@ -1064,8 +1066,44 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
          */
         offset = buf_addr - imu->ubuf;
         iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
-       if (offset)
-               iov_iter_advance(iter, offset);
+
+       if (offset) {
+               /*
+                * Don't use iov_iter_advance() here, as it's really slow for
+                * using the latter parts of a big fixed buffer - it iterates
+                * over each segment manually. We can cheat a bit here, because
+                * we know that:
+                *
+                * 1) it's a BVEC iter, we set it up
+                * 2) all bvecs are PAGE_SIZE in size, except potentially the
+                *    first and last bvec
+                *
+                * So just find our index, and adjust the iterator afterwards.
+                * If the offset is within the first bvec (or the whole first
+                * bvec), just use iov_iter_advance(). This makes it easier
+                * since we can just skip the first segment, which may not
+                * be PAGE_SIZE aligned.
+                */
+               const struct bio_vec *bvec = imu->bvec;
+
+               if (offset <= bvec->bv_len) {
+                       iov_iter_advance(iter, offset);
+               } else {
+                       unsigned long seg_skip;
+
+                       /* skip first vec */
+                       offset -= bvec->bv_len;
+                       seg_skip = 1 + (offset >> PAGE_SHIFT);
+
+                       iter->bvec = bvec + seg_skip;
+                       iter->nr_segs -= seg_skip;
+                       iter->count -= (seg_skip << PAGE_SHIFT);
+                       iter->iov_offset = offset & ~PAGE_MASK;
+                       if (iter->iov_offset)
+                               iter->count -= iter->iov_offset;
+               }
+       }
+
         return 0;
  }
  
@@ -1120,28 +1158,26 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
         off_t io_end = kiocb->ki_pos + len;
  
         if (filp == async_list->file && kiocb->ki_pos == async_list->io_end) {
-               unsigned long max_pages;
+               unsigned long max_bytes;
  
                 /* Use 8x RA size as a decent limiter for both reads/writes */
-               max_pages = filp->f_ra.ra_pages;
-               if (!max_pages)
-                       max_pages = VM_READAHEAD_PAGES;
-               max_pages *= 8;
-
-               /* If max pages are exceeded, reset the state */
-               len >>= PAGE_SHIFT;
-               if (async_list->io_pages + len <= max_pages) {
+               max_bytes = filp->f_ra.ra_pages << (PAGE_SHIFT + 3);
+               if (!max_bytes)
+                       max_bytes = VM_READAHEAD_PAGES << (PAGE_SHIFT + 3);
+
+               /* If max len is exceeded, reset the state */
+               if (async_list->io_len + len <= max_bytes) {
                         req->flags |= REQ_F_SEQ_PREV;
-                       async_list->io_pages += len;
+                       async_list->io_len += len;
                 } else {
                         io_end = 0;
-                       async_list->io_pages = 0;
+                       async_list->io_len = 0;
                 }
         }
  
         /* New file? Reset state. */
         if (async_list->file != filp) {
-               async_list->io_pages = 0;
+               async_list->io_len = 0;
                 async_list->file = filp;
         }
         async_list->io_end = io_end;
@@ -1630,6 +1666,8 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
         INIT_LIST_HEAD(&poll->wait.entry);
         init_waitqueue_func_entry(&poll->wait, io_poll_wake);
  
+       INIT_LIST_HEAD(&req->list);
+
         mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
  
         spin_lock_irq(&ctx->completion_lock);
@@ -1844,6 +1882,10 @@ restart:
                 /* async context always use a copy of the sqe */
                 kfree(sqe);
  
+               /* req from defer and link list needn't decrease async cnt */
+               if (req->flags & (REQ_F_IO_DRAINED | REQ_F_LINK_DONE))
+                       goto out;
+
                 if (!async_list)
                         break;
                 if (!list_empty(&req_list)) {
@@ -1891,6 +1933,7 @@ restart:
                 }
         }
  
+out:
         if (cur_mm) {
                 set_fs(old_fs);
                 unuse_mm(cur_mm);
@@ -1917,6 +1960,10 @@ static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
         ret = true;
         spin_lock(&list->lock);
         list_add_tail(&req->list, &list->list);
+       /*
+        * Ensure we see a simultaneous modification from io_sq_wq_submit_work()
+        */
+       smp_mb();
         if (!atomic_read(&list->cnt)) {
                 list_del_init(&req->list);
                 ret = false;
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h

index 689a582312887d05b1d81212454e02560e36c4de..12811091fd50bcd485d28c87d4e235890bb6ac92 100644 (file)
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -181,6 +181,7 @@ struct blkcg_policy {
  
  extern struct blkcg blkcg_root;
  extern struct cgroup_subsys_state * const blkcg_root_css;
+extern bool blkcg_debug_stats;
  
  struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
                                       struct request_queue *q, bool update_hint);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h

index feff3fe4467ec97ce92aa7e6b5b13ef05012bb47..1b1fa1557e68aefe5cf5df252485d398b8a38e35 100644 (file)
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -311,6 +311,7 @@ enum req_flag_bits {
         __REQ_RAHEAD,           /* read ahead, can fail anytime */
         __REQ_BACKGROUND,       /* background IO */
         __REQ_NOWAIT,           /* Don't wait if request will block */
+       __REQ_NOWAIT_INLINE,    /* Return would-block error inline */
         /*
          * When a shared kthread needs to issue a bio for a cgroup, doing
          * so synchronously can lead to priority inversions as the kthread
@@ -345,6 +346,7 @@ enum req_flag_bits {
  #define REQ_RAHEAD             (1ULL << __REQ_RAHEAD)
  #define REQ_BACKGROUND         (1ULL << __REQ_BACKGROUND)
  #define REQ_NOWAIT             (1ULL << __REQ_NOWAIT)
+#define REQ_NOWAIT_INLINE      (1ULL << __REQ_NOWAIT_INLINE)
  #define REQ_CGROUP_PUNT                (1ULL << __REQ_CGROUP_PUNT)
  
  #define REQ_NOUNMAP            (1ULL << __REQ_NOUNMAP)
@@ -418,12 +420,13 @@ static inline int op_stat_group(unsigned int op)
  
  typedef unsigned int blk_qc_t;
  #define BLK_QC_T_NONE          -1U
+#define BLK_QC_T_EAGAIN                -2U
  #define BLK_QC_T_SHIFT         16
  #define BLK_QC_T_INTERNAL      (1U << 31)
  
  static inline bool blk_qc_t_valid(blk_qc_t cookie)
  {
-       return cookie != BLK_QC_T_NONE;
+       return cookie != BLK_QC_T_NONE && cookie != BLK_QC_T_EAGAIN;
  }
  
  static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie)
diff --git a/include/linux/elevator.h b/include/linux/elevator.h

index 17cd0078377cae91dc86d63541b1bd6b87141b0a..1dd014c9c87b57ade5d86311635d24aa8e78e2c1 100644 (file)
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -45,7 +45,6 @@ struct elevator_mq_ops {
         struct request *(*dispatch_request)(struct blk_mq_hw_ctx *);
         bool (*has_work)(struct blk_mq_hw_ctx *);
         void (*completed_request)(struct request *, u64);
-       void (*started_request)(struct request *);
         void (*requeue_request)(struct request *);
         struct request *(*former_request)(struct request_queue *, struct request *);
         struct request *(*next_request)(struct request_queue *, struct request *);
diff --git a/include/linux/wait.h b/include/linux/wait.h

index b6f77cf60dd7bbf8154aa56a3ea0bb2631221981..30c515520fb28c69873aad6d0b498a59679f46ab 100644 (file)
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -126,6 +126,19 @@ static inline int waitqueue_active(struct wait_queue_head *wq_head)
         return !list_empty(&wq_head->head);
  }
  
+/**
+ * wq_has_single_sleeper - check if there is only one sleeper
+ * @wq_head: wait queue head
+ *
+ * Returns true if wq_head has only one sleeper on the list.
+ *
+ * Please refer to the comment for waitqueue_active.
+ */
+static inline bool wq_has_single_sleeper(struct wait_queue_head *wq_head)
+{
+       return list_is_singular(&wq_head->head);
+}
+
  /**
   * wq_has_sleeper - check if there are any waiting processes
   * @wq_head: wait queue head
author	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 26 Jul 2019 17:32:12 +0000 (10:32 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 26 Jul 2019 17:32:12 +0000 (10:32 -0700)
block/bfq-iosched.c		patch \| blob \| blame \| history
block/blk-cgroup.c		patch \| blob \| blame \| history
block/blk-iolatency.c		patch \| blob \| blame \| history
block/blk-mq-sched.h		patch \| blob \| blame \| history
block/blk-mq.c		patch \| blob \| blame \| history
block/blk-rq-qos.c		patch \| blob \| blame \| history
block/genhd.c		patch \| blob \| blame \| history
drivers/ata/libahci_platform.c		patch \| blob \| blame \| history
drivers/block/drbd/drbd_receiver.c		patch \| blob \| blame \| history
drivers/md/bcache/super.c		patch \| blob \| blame \| history
drivers/nvme/host/core.c		patch \| blob \| blame \| history
drivers/nvme/host/multipath.c		patch \| blob \| blame \| history
drivers/nvme/host/nvme.h		patch \| blob \| blame \| history
drivers/nvme/host/pci.c		patch \| blob \| blame \| history
fs/block_dev.c		patch \| blob \| blame \| history
fs/io_uring.c		patch \| blob \| blame \| history
include/linux/blk-cgroup.h		patch \| blob \| blame \| history
include/linux/blk_types.h		patch \| blob \| blame \| history
include/linux/elevator.h		patch \| blob \| blame \| history
include/linux/wait.h		patch \| blob \| blame \| history