Merge tag 'block-6.1-2022-10-20' of git://git.kernel.dk/linux
authorLinus Torvalds <torvalds@linux-foundation.org>
Fri, 21 Oct 2022 22:14:14 +0000 (15:14 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Fri, 21 Oct 2022 22:14:14 +0000 (15:14 -0700)
Pull block fixes from Jens Axboe:

 - NVMe pull request via Christoph:
      - fix nvme-hwmon for DMA non-coherent architectures (Serge Semin)
      - add an nvme-hwmon maintainer (Christoph Hellwig)
      - fix error pointer dereference in error handling (Dan Carpenter)
      - fix invalid memory reference in nvmet_subsys_attr_qid_max_show
        (Daniel Wagner)
      - don't limit the DMA segment size in nvme-apple (Russell King)
      - fix workqueue MEM_RECLAIM flushing dependency (Sagi Grimberg)
      - disable write zeroes on various Kingston SSDs (Xander Li)

 - fix a memory leak with block device tracing (Ye)

 - flexible-array fix for ublk (Yushan)

 - document the ublk recovery feature from this merge window
   (ZiyangZhang)

 - remove dead bfq variable in struct (Yuwei)

 - error handling rq clearing fix (Yu)

 - add an IRQ safety check for the cached bio freeing (Pavel)

 - drbd bio cloning fix (Christoph)

* tag 'block-6.1-2022-10-20' of git://git.kernel.dk/linux:
  blktrace: remove unnessary stop block trace in 'blk_trace_shutdown'
  blktrace: fix possible memleak in '__blk_trace_remove'
  blktrace: introduce 'blk_trace_{start,stop}' helper
  bio: safeguard REQ_ALLOC_CACHE bio put
  block, bfq: remove unused variable for bfq_queue
  drbd: only clone bio if we have a backing device
  ublk_drv: use flexible-array member instead of zero-length array
  nvmet: fix invalid memory reference in nvmet_subsys_attr_qid_max_show
  nvmet: fix workqueue MEM_RECLAIM flushing dependency
  nvme-hwmon: kmalloc the NVME SMART log buffer
  nvme-hwmon: consistently ignore errors from nvme_hwmon_init
  nvme: add Guenther as nvme-hwmon maintainer
  nvme-apple: don't limit DMA segment size
  nvme-pci: disable write zeroes on various Kingston SSD
  nvme: fix error pointer dereference in error handling
  Documentation: document ublk user recovery feature
  blk-mq: fix null pointer dereference in blk_mq_clear_rq_mapping()

14 files changed:
Documentation/block/ublk.rst
MAINTAINERS
block/bfq-iosched.h
block/bio.c
block/blk-mq.c
drivers/block/drbd/drbd_req.c
drivers/block/ublk_drv.c
drivers/nvme/host/apple.c
drivers/nvme/host/core.c
drivers/nvme/host/hwmon.c
drivers/nvme/host/pci.c
drivers/nvme/target/configfs.c
drivers/nvme/target/core.c
kernel/trace/blktrace.c

index 2122d1a4a541963450791a86320a7d0169b6e667..ba45c46cc0dacf558308833bafa95441c6437d82 100644 (file)
@@ -144,6 +144,42 @@ managing and controlling ublk devices with help of several control commands:
   For retrieving device info via ``ublksrv_ctrl_dev_info``. It is the server's
   responsibility to save IO target specific info in userspace.
 
+- ``UBLK_CMD_START_USER_RECOVERY``
+
+  This command is valid if ``UBLK_F_USER_RECOVERY`` feature is enabled. This
+  command is accepted after the old process has exited, ublk device is quiesced
+  and ``/dev/ublkc*`` is released. The user should send this command before
+  starting a new process which re-opens ``/dev/ublkc*``. When this command
+  returns, the
+  ublk device is ready for the new process.
+
+- ``UBLK_CMD_END_USER_RECOVERY``
+
+  This command is valid if ``UBLK_F_USER_RECOVERY`` feature is enabled. This
+  command is accepted after the ublk device is quiesced and a new process has
+  opened ``/dev/ublkc*`` and all ublk queues are ready. When this command
+  returns, ublk device is unquiesced and new I/O requests are passed to the
+  new process.
+
+- user recovery feature description
+
+  Two new features are added for user recovery: ``UBLK_F_USER_RECOVERY`` and
+  ``UBLK_F_USER_RECOVERY_REISSUE``.
+
+  With ``UBLK_F_USER_RECOVERY`` set, after one ubq_daemon(ublk server's io
+  handler) is dying, ublk does not delete ``/dev/ublkb*`` during the whole
+  recovery stage and ublk device ID is kept. It is ublk server's
+  responsibility to recover the device context by its own knowledge.
+  Requests which have not been issued to userspace are requeued. Requests
+  which have been issued to userspace are aborted.
+
+  With ``UBLK_F_USER_RECOVERY_REISSUE`` set, after one ubq_daemon(ublk
+  server's io handler) is dying, contrary to ``UBLK_F_USER_RECOVERY``,
+  requests which have been issued to userspace are requeued and will be
+  re-issued to the new process after handling ``UBLK_CMD_END_USER_RECOVERY``.
+  ``UBLK_F_USER_RECOVERY_REISSUE`` is designed for backends who tolerate
+  double-write since the driver may issue the same I/O request twice. It
+  might be useful to a read-only FS or a VM backend.
+
 Data plane
 ----------
 
index 8904f186c83b089606beb298dc6fe41f537d211e..bdd43abd26835d092e8692f1b26f3c0a3b4f41a5 100644 (file)
@@ -14713,6 +14713,12 @@ F:     drivers/nvme/target/auth.c
 F:     drivers/nvme/target/fabrics-cmd-auth.c
 F:     include/linux/nvme-auth.h
 
+NVM EXPRESS HARDWARE MONITORING SUPPORT
+M:     Guenter Roeck <linux@roeck-us.net>
+L:     linux-nvme@lists.infradead.org
+S:     Supported
+F:     drivers/nvme/host/hwmon.c
+
 NVM EXPRESS FC TRANSPORT DRIVERS
 M:     James Smart <james.smart@broadcom.com>
 L:     linux-nvme@lists.infradead.org
index 64ee618064ba14f5e9b5608ede0249cb4407615f..71f721670ab6214f13f32c6f8aacd035a1433ecb 100644 (file)
@@ -369,12 +369,8 @@ struct bfq_queue {
        unsigned long split_time; /* time of last split */
 
        unsigned long first_IO_time; /* time of first I/O for this queue */
-
        unsigned long creation_time; /* when this queue is created */
 
-       /* max service rate measured so far */
-       u32 max_service_rate;
-
        /*
         * Pointer to the waker queue for this queue, i.e., to the
         * queue Q such that this queue happens to get new I/O right
index 633a902468ec7f10f91f18edb5e6e083eb3f6652..57c2f327225bd13da2cc00506aeeb2986d13b637 100644 (file)
@@ -741,7 +741,7 @@ void bio_put(struct bio *bio)
                        return;
        }
 
-       if (bio->bi_opf & REQ_ALLOC_CACHE) {
+       if ((bio->bi_opf & REQ_ALLOC_CACHE) && !WARN_ON_ONCE(in_interrupt())) {
                struct bio_alloc_cache *cache;
 
                bio_uninit(bio);
index 8070b6c10e8d5ae1cf97b6f8a712b566f89d2255..33292c01875d52ef6f842410c9e1b78679da61d0 100644 (file)
@@ -3112,8 +3112,11 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags,
        struct page *page;
        unsigned long flags;
 
-       /* There is no need to clear a driver tags own mapping */
-       if (drv_tags == tags)
+       /*
+        * There is no need to clear mapping if driver tags is not initialized
+        * or the mapping belongs to the driver tags.
+        */
+       if (!drv_tags || drv_tags == tags)
                return;
 
        list_for_each_entry(page, &tags->page_list, lru) {
index 8f7f144e54f3a801bf1dc6e64438954f2b082f59..7f9bcc82fc9c4935a4ccd0a10b5d8d1f6051d2a1 100644 (file)
@@ -30,11 +30,6 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio
                return NULL;
        memset(req, 0, sizeof(*req));
 
-       req->private_bio = bio_alloc_clone(device->ldev->backing_bdev, bio_src,
-                                          GFP_NOIO, &drbd_io_bio_set);
-       req->private_bio->bi_private = req;
-       req->private_bio->bi_end_io = drbd_request_endio;
-
        req->rq_state = (bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0)
                      | (bio_op(bio_src) == REQ_OP_WRITE_ZEROES ? RQ_ZEROES : 0)
                      | (bio_op(bio_src) == REQ_OP_DISCARD ? RQ_UNMAP : 0);
@@ -1219,9 +1214,12 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio)
        /* Update disk stats */
        req->start_jif = bio_start_io_acct(req->master_bio);
 
-       if (!get_ldev(device)) {
-               bio_put(req->private_bio);
-               req->private_bio = NULL;
+       if (get_ldev(device)) {
+               req->private_bio = bio_alloc_clone(device->ldev->backing_bdev,
+                                                  bio, GFP_NOIO,
+                                                  &drbd_io_bio_set);
+               req->private_bio->bi_private = req;
+               req->private_bio->bi_end_io = drbd_request_endio;
        }
 
        /* process discards always from our submitter thread */
index 2651bf41dde31fdf4fa65b27513d4b543e98d9aa..5afce6ffaadfaf0efa8703a9aa999af9c3fa4f0c 100644 (file)
@@ -124,7 +124,7 @@ struct ublk_queue {
        bool force_abort;
        unsigned short nr_io_ready;     /* how many ios setup */
        struct ublk_device *dev;
-       struct ublk_io ios[0];
+       struct ublk_io ios[];
 };
 
 #define UBLK_DAEMON_MONITOR_PERIOD     (5 * HZ)
index 5fc5ea196b40072dc224c288a7ca8ef56aff5dee..ff8b083dc5c6d862912874960ac76a6abe5dd3c8 100644 (file)
@@ -1039,6 +1039,8 @@ static void apple_nvme_reset_work(struct work_struct *work)
                                         dma_max_mapping_size(anv->dev) >> 9);
        anv->ctrl.max_segments = NVME_MAX_SEGS;
 
+       dma_set_max_seg_size(anv->dev, 0xffffffff);
+
        /*
         * Enable NVMMU and linear submission queues.
         * While we could keep those disabled and pretend this is slightly
index 059737c1a2c19c3a9d73f0b0c4ec2d5a878017e1..dc42206005855dce9ce9cc2ec60429995f246573 100644 (file)
@@ -3262,8 +3262,12 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl)
                return ret;
 
        if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) {
+               /*
+                * Do not return errors unless we are in a controller reset,
+                * the controller works perfectly fine without hwmon.
+                */
                ret = nvme_hwmon_init(ctrl);
-               if (ret < 0)
+               if (ret == -EINTR)
                        return ret;
        }
 
@@ -4846,7 +4850,7 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
        return 0;
 
 out_cleanup_admin_q:
-       blk_mq_destroy_queue(ctrl->fabrics_q);
+       blk_mq_destroy_queue(ctrl->admin_q);
 out_free_tagset:
        blk_mq_free_tag_set(ctrl->admin_tagset);
        return ret;
index 0a586d7129201761d411c13b7e20425d4287c5c8..9e6e56c20ec993bfca3ee2eb18d2e371d7225230 100644 (file)
@@ -12,7 +12,7 @@
 
 struct nvme_hwmon_data {
        struct nvme_ctrl *ctrl;
-       struct nvme_smart_log log;
+       struct nvme_smart_log *log;
        struct mutex read_lock;
 };
 
@@ -60,14 +60,14 @@ static int nvme_set_temp_thresh(struct nvme_ctrl *ctrl, int sensor, bool under,
 static int nvme_hwmon_get_smart_log(struct nvme_hwmon_data *data)
 {
        return nvme_get_log(data->ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0,
-                          NVME_CSI_NVM, &data->log, sizeof(data->log), 0);
+                          NVME_CSI_NVM, data->log, sizeof(*data->log), 0);
 }
 
 static int nvme_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
                           u32 attr, int channel, long *val)
 {
        struct nvme_hwmon_data *data = dev_get_drvdata(dev);
-       struct nvme_smart_log *log = &data->log;
+       struct nvme_smart_log *log = data->log;
        int temp;
        int err;
 
@@ -163,7 +163,7 @@ static umode_t nvme_hwmon_is_visible(const void *_data,
        case hwmon_temp_max:
        case hwmon_temp_min:
                if ((!channel && data->ctrl->wctemp) ||
-                   (channel && data->log.temp_sensor[channel - 1])) {
+                   (channel && data->log->temp_sensor[channel - 1])) {
                        if (data->ctrl->quirks &
                            NVME_QUIRK_NO_TEMP_THRESH_CHANGE)
                                return 0444;
@@ -176,7 +176,7 @@ static umode_t nvme_hwmon_is_visible(const void *_data,
                break;
        case hwmon_temp_input:
        case hwmon_temp_label:
-               if (!channel || data->log.temp_sensor[channel - 1])
+               if (!channel || data->log->temp_sensor[channel - 1])
                        return 0444;
                break;
        default:
@@ -230,7 +230,13 @@ int nvme_hwmon_init(struct nvme_ctrl *ctrl)
 
        data = kzalloc(sizeof(*data), GFP_KERNEL);
        if (!data)
-               return 0;
+               return -ENOMEM;
+
+       data->log = kzalloc(sizeof(*data->log), GFP_KERNEL);
+       if (!data->log) {
+               err = -ENOMEM;
+               goto err_free_data;
+       }
 
        data->ctrl = ctrl;
        mutex_init(&data->read_lock);
@@ -238,8 +244,7 @@ int nvme_hwmon_init(struct nvme_ctrl *ctrl)
        err = nvme_hwmon_get_smart_log(data);
        if (err) {
                dev_warn(dev, "Failed to read smart log (error %d)\n", err);
-               kfree(data);
-               return err;
+               goto err_free_log;
        }
 
        hwmon = hwmon_device_register_with_info(dev, "nvme",
@@ -247,11 +252,17 @@ int nvme_hwmon_init(struct nvme_ctrl *ctrl)
                                                NULL);
        if (IS_ERR(hwmon)) {
                dev_warn(dev, "Failed to instantiate hwmon device\n");
-               kfree(data);
-               return PTR_ERR(hwmon);
+               err = PTR_ERR(hwmon);
+               goto err_free_log;
        }
        ctrl->hwmon_device = hwmon;
        return 0;
+
+err_free_log:
+       kfree(data->log);
+err_free_data:
+       kfree(data);
+       return err;
 }
 
 void nvme_hwmon_exit(struct nvme_ctrl *ctrl)
@@ -262,6 +273,7 @@ void nvme_hwmon_exit(struct nvme_ctrl *ctrl)
 
                hwmon_device_unregister(ctrl->hwmon_device);
                ctrl->hwmon_device = NULL;
+               kfree(data->log);
                kfree(data);
        }
 }
index bcbef6bc5672f09e43f7e939c8bcc25d0b26b4c7..31e577b01257d6196d71523b20575eea8d99e4a8 100644 (file)
@@ -3511,6 +3511,16 @@ static const struct pci_device_id nvme_id_table[] = {
                .driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
        { PCI_DEVICE(0x2646, 0x2263),   /* KINGSTON A2000 NVMe SSD  */
                .driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
+       { PCI_DEVICE(0x2646, 0x5018),   /* KINGSTON OM8SFP4xxxxP OS21012 NVMe SSD */
+               .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+       { PCI_DEVICE(0x2646, 0x5016),   /* KINGSTON OM3PGP4xxxxP OS21011 NVMe SSD */
+               .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+       { PCI_DEVICE(0x2646, 0x501A),   /* KINGSTON OM8PGP4xxxxP OS21005 NVMe SSD */
+               .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+       { PCI_DEVICE(0x2646, 0x501B),   /* KINGSTON OM8PGP4xxxxQ OS21005 NVMe SSD */
+               .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+       { PCI_DEVICE(0x2646, 0x501E),   /* KINGSTON OM3PGP4xxxxQ OS21011 NVMe SSD */
+               .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
        { PCI_DEVICE(0x1e4B, 0x1001),   /* MAXIO MAP1001 */
                .driver_data = NVME_QUIRK_BOGUS_NID, },
        { PCI_DEVICE(0x1e4B, 0x1002),   /* MAXIO MAP1002 */
index e34a2896fedb294d8e2269414ed5d0dae8eaafc8..9443ee1d4ae3dd6c367db6a3ca7b24a1b90da15f 100644 (file)
@@ -1290,12 +1290,8 @@ static ssize_t nvmet_subsys_attr_qid_max_show(struct config_item *item,
 static ssize_t nvmet_subsys_attr_qid_max_store(struct config_item *item,
                                               const char *page, size_t cnt)
 {
-       struct nvmet_port *port = to_nvmet_port(item);
        u16 qid_max;
 
-       if (nvmet_is_port_enabled(port, __func__))
-               return -EACCES;
-
        if (sscanf(page, "%hu\n", &qid_max) != 1)
                return -EINVAL;
 
index 14677145bbba098772d1338fc871d258acfbdb9e..aecb5853f8da44df9f4b97f36bee0c198df44a10 100644 (file)
@@ -1176,7 +1176,7 @@ static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl)
         * reset the keep alive timer when the controller is enabled.
         */
        if (ctrl->kato)
-               mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ);
+               mod_delayed_work(nvmet_wq, &ctrl->ka_work, ctrl->kato * HZ);
 }
 
 static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl)
index 7f5eb295fe198997711d0653fdd0516b486e2bce..a995ea1ef849a4ef6b607c443b6a7bbdbbe548c5 100644 (file)
@@ -346,8 +346,40 @@ static void put_probe_ref(void)
        mutex_unlock(&blk_probe_mutex);
 }
 
+static int blk_trace_start(struct blk_trace *bt)
+{
+       if (bt->trace_state != Blktrace_setup &&
+           bt->trace_state != Blktrace_stopped)
+               return -EINVAL;
+
+       blktrace_seq++;
+       smp_mb();
+       bt->trace_state = Blktrace_running;
+       raw_spin_lock_irq(&running_trace_lock);
+       list_add(&bt->running_list, &running_trace_list);
+       raw_spin_unlock_irq(&running_trace_lock);
+       trace_note_time(bt);
+
+       return 0;
+}
+
+static int blk_trace_stop(struct blk_trace *bt)
+{
+       if (bt->trace_state != Blktrace_running)
+               return -EINVAL;
+
+       bt->trace_state = Blktrace_stopped;
+       raw_spin_lock_irq(&running_trace_lock);
+       list_del_init(&bt->running_list);
+       raw_spin_unlock_irq(&running_trace_lock);
+       relay_flush(bt->rchan);
+
+       return 0;
+}
+
 static void blk_trace_cleanup(struct request_queue *q, struct blk_trace *bt)
 {
+       blk_trace_stop(bt);
        synchronize_rcu();
        blk_trace_free(q, bt);
        put_probe_ref();
@@ -362,8 +394,7 @@ static int __blk_trace_remove(struct request_queue *q)
        if (!bt)
                return -EINVAL;
 
-       if (bt->trace_state != Blktrace_running)
-               blk_trace_cleanup(q, bt);
+       blk_trace_cleanup(q, bt);
 
        return 0;
 }
@@ -658,7 +689,6 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
 
 static int __blk_trace_startstop(struct request_queue *q, int start)
 {
-       int ret;
        struct blk_trace *bt;
 
        bt = rcu_dereference_protected(q->blk_trace,
@@ -666,36 +696,10 @@ static int __blk_trace_startstop(struct request_queue *q, int start)
        if (bt == NULL)
                return -EINVAL;
 
-       /*
-        * For starting a trace, we can transition from a setup or stopped
-        * trace. For stopping a trace, the state must be running
-        */
-       ret = -EINVAL;
-       if (start) {
-               if (bt->trace_state == Blktrace_setup ||
-                   bt->trace_state == Blktrace_stopped) {
-                       blktrace_seq++;
-                       smp_mb();
-                       bt->trace_state = Blktrace_running;
-                       raw_spin_lock_irq(&running_trace_lock);
-                       list_add(&bt->running_list, &running_trace_list);
-                       raw_spin_unlock_irq(&running_trace_lock);
-
-                       trace_note_time(bt);
-                       ret = 0;
-               }
-       } else {
-               if (bt->trace_state == Blktrace_running) {
-                       bt->trace_state = Blktrace_stopped;
-                       raw_spin_lock_irq(&running_trace_lock);
-                       list_del_init(&bt->running_list);
-                       raw_spin_unlock_irq(&running_trace_lock);
-                       relay_flush(bt->rchan);
-                       ret = 0;
-               }
-       }
-
-       return ret;
+       if (start)
+               return blk_trace_start(bt);
+       else
+               return blk_trace_stop(bt);
 }
 
 int blk_trace_startstop(struct request_queue *q, int start)
@@ -772,10 +776,8 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 void blk_trace_shutdown(struct request_queue *q)
 {
        if (rcu_dereference_protected(q->blk_trace,
-                                     lockdep_is_held(&q->debugfs_mutex))) {
-               __blk_trace_startstop(q, 0);
+                                     lockdep_is_held(&q->debugfs_mutex)))
                __blk_trace_remove(q);
-       }
 }
 
 #ifdef CONFIG_BLK_CGROUP
@@ -1614,13 +1616,7 @@ static int blk_trace_remove_queue(struct request_queue *q)
        if (bt == NULL)
                return -EINVAL;
 
-       if (bt->trace_state == Blktrace_running) {
-               bt->trace_state = Blktrace_stopped;
-               raw_spin_lock_irq(&running_trace_lock);
-               list_del_init(&bt->running_list);
-               raw_spin_unlock_irq(&running_trace_lock);
-               relay_flush(bt->rchan);
-       }
+       blk_trace_stop(bt);
 
        put_probe_ref();
        synchronize_rcu();