ublk_drv: add module parameter of ublks_max for limiting max allowed ublk dev
[linux-block.git] drivers/block/ublk_drv.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Userspace block device - block device whose IO is handled from userspace
4  *
5  * Make full use of the io_uring passthrough command for communicating with
6  * the ublk userspace daemon (ublksrvd) to handle basic IO requests.
7  *
8  * Copyright 2022 Ming Lei <ming.lei@redhat.com>
9  *
10  * (part of code stolen from loop.c)
11  */
12 #include <linux/module.h>
13 #include <linux/moduleparam.h>
14 #include <linux/sched.h>
15 #include <linux/fs.h>
16 #include <linux/pagemap.h>
17 #include <linux/file.h>
18 #include <linux/stat.h>
19 #include <linux/errno.h>
20 #include <linux/major.h>
21 #include <linux/wait.h>
22 #include <linux/blkdev.h>
23 #include <linux/init.h>
24 #include <linux/swap.h>
25 #include <linux/slab.h>
26 #include <linux/compat.h>
27 #include <linux/mutex.h>
28 #include <linux/writeback.h>
29 #include <linux/completion.h>
30 #include <linux/highmem.h>
31 #include <linux/sysfs.h>
32 #include <linux/miscdevice.h>
33 #include <linux/falloc.h>
34 #include <linux/uio.h>
35 #include <linux/ioprio.h>
36 #include <linux/sched/mm.h>
37 #include <linux/uaccess.h>
38 #include <linux/cdev.h>
39 #include <linux/io_uring.h>
40 #include <linux/blk-mq.h>
41 #include <linux/delay.h>
42 #include <linux/mm.h>
43 #include <asm/page.h>
44 #include <linux/task_work.h>
45 #include <uapi/linux/ublk_cmd.h>
46
47 #define UBLK_MINORS             (1U << MINORBITS)
48
49 /* All UBLK_F_* have to be included into UBLK_F_ALL */
50 #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
51                 | UBLK_F_URING_CMD_COMP_IN_TASK \
52                 | UBLK_F_NEED_GET_DATA \
53                 | UBLK_F_USER_RECOVERY \
54                 | UBLK_F_USER_RECOVERY_REISSUE)
55
56 /* All UBLK_PARAM_TYPE_* should be included here */
57 #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \
58                 UBLK_PARAM_TYPE_DISCARD | UBLK_PARAM_TYPE_DEVT)
59
60 struct ublk_rq_data {
61         struct llist_node node;
62         struct callback_head work;
63 };
64
65 struct ublk_uring_cmd_pdu {
66         struct ublk_queue *ubq;
67 };
68
69 /*
70  * io command is active: sqe cmd is received, and its cqe isn't done
71  *
72  * If the flag is set, the io command is owned by the ublk driver and is
73  * waiting for an incoming blk-mq request from the ublk block device.
74  *
75  * If the flag is cleared, the io command is completed (its cqe is delivered)
76  * and is owned by the ublk server.
77  */
78 #define UBLK_IO_FLAG_ACTIVE     0x01
79
80 /*
81  * IO command is completed via cqe, is being handled by ublksrv, and has
82  * not been committed yet
83  *
84  * Basically mutually exclusive with UBLK_IO_FLAG_ACTIVE, so it can be used
85  * for cross verification
86  */
87 #define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
88
89 /*
90  * IO command is aborted, so this flag is set in case of
91  * !UBLK_IO_FLAG_ACTIVE.
92  *
93  * After this flag is observed, any pending or new incoming request
94  * associated with this io command will be failed immediately
95  */
96 #define UBLK_IO_FLAG_ABORTED 0x04
97
98 /*
99  * UBLK_IO_FLAG_NEED_GET_DATA is set because the IO command requires
100  * fetching the data buffer address from ublksrv.
101  *
102  * Then bio data can be copied into this data buffer for a WRITE request
103  * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
104  */
105 #define UBLK_IO_FLAG_NEED_GET_DATA 0x08
106
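/*
 * A compact summary of how the per-tag flags above are driven (derived from
 * ublk_ch_uring_cmd(), ubq_complete_io_cmd() and the abort paths below):
 *
 *	UBLK_IO_FETCH_REQ accepted          -> ACTIVE set
 *	blk-mq request forwarded to server  -> ACTIVE cleared, OWNED_BY_SRV set
 *	UBLK_IO_COMMIT_AND_FETCH_REQ        -> OWNED_BY_SRV cleared, ACTIVE set
 *	daemon dying / abort path           -> ABORTED set (only while !ACTIVE)
 *	UBLK_F_NEED_GET_DATA WRITE path     -> NEED_GET_DATA set until the server
 *	                                       answers with UBLK_IO_NEED_GET_DATA
 */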
107 struct ublk_io {
108         /* userspace buffer address from io cmd */
109         __u64   addr;
110         unsigned int flags;
111         int res;
112
113         struct io_uring_cmd *cmd;
114 };
115
116 struct ublk_queue {
117         int q_id;
118         int q_depth;
119
120         unsigned long flags;
121         struct task_struct      *ubq_daemon;
122         char *io_cmd_buf;
123
124         struct llist_head       io_cmds;
125
126         unsigned long io_addr;  /* mapped vm address */
127         unsigned int max_io_sz;
128         bool force_abort;
129         unsigned short nr_io_ready;     /* how many ios setup */
130         struct ublk_device *dev;
131         struct ublk_io ios[];
132 };
133
134 #define UBLK_DAEMON_MONITOR_PERIOD      (5 * HZ)
135
136 struct ublk_device {
137         struct gendisk          *ub_disk;
138
139         char    *__queues;
140
141         unsigned short  queue_size;
142         struct ublksrv_ctrl_dev_info    dev_info;
143
144         struct blk_mq_tag_set   tag_set;
145
146         struct cdev             cdev;
147         struct device           cdev_dev;
148
149 #define UB_STATE_OPEN           0
150 #define UB_STATE_USED           1
151         unsigned long           state;
152         int                     ub_number;
153
154         struct mutex            mutex;
155
156         spinlock_t              mm_lock;
157         struct mm_struct        *mm;
158
159         struct ublk_params      params;
160
161         struct completion       completion;
162         unsigned int            nr_queues_ready;
163         unsigned int            nr_privileged_daemon;
164
165         /*
166          * Our ubq->daemon may be killed without any notification, so
167          * monitor each queue's daemon periodically
168          */
169         struct delayed_work     monitor_work;
170         struct work_struct      quiesce_work;
171         struct work_struct      stop_work;
172 };
173
174 /* header of ublk_params */
175 struct ublk_params_header {
176         __u32   len;
177         __u32   types;
178 };
179
180 static dev_t ublk_chr_devt;
181 static struct class *ublk_chr_class;
182
183 static DEFINE_IDR(ublk_index_idr);
184 static DEFINE_SPINLOCK(ublk_idr_lock);
185 static wait_queue_head_t ublk_idr_wq;   /* wait until one idr is freed */
186
187 static DEFINE_MUTEX(ublk_ctl_mutex);
188
189 /*
190  * Max number of ublk devices allowed to be added
191  *
192  * It can be extended to a per-user limit in the future, or even be
193  * controlled by cgroup.
194  */
195 static unsigned int ublks_max = 64;
196 static unsigned int ublks_added;        /* protected by ublk_ctl_mutex */
197
198 static struct miscdevice ublk_misc;
199
200 static void ublk_dev_param_basic_apply(struct ublk_device *ub)
201 {
202         struct request_queue *q = ub->ub_disk->queue;
203         const struct ublk_param_basic *p = &ub->params.basic;
204
205         blk_queue_logical_block_size(q, 1 << p->logical_bs_shift);
206         blk_queue_physical_block_size(q, 1 << p->physical_bs_shift);
207         blk_queue_io_min(q, 1 << p->io_min_shift);
208         blk_queue_io_opt(q, 1 << p->io_opt_shift);
209
210         blk_queue_write_cache(q, p->attrs & UBLK_ATTR_VOLATILE_CACHE,
211                         p->attrs & UBLK_ATTR_FUA);
212         if (p->attrs & UBLK_ATTR_ROTATIONAL)
213                 blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
214         else
215                 blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
216
217         blk_queue_max_hw_sectors(q, p->max_sectors);
218         blk_queue_chunk_sectors(q, p->chunk_sectors);
219         blk_queue_virt_boundary(q, p->virt_boundary_mask);
220
221         if (p->attrs & UBLK_ATTR_READ_ONLY)
222                 set_disk_ro(ub->ub_disk, true);
223
224         set_capacity(ub->ub_disk, p->dev_sectors);
225 }
226
227 static void ublk_dev_param_discard_apply(struct ublk_device *ub)
228 {
229         struct request_queue *q = ub->ub_disk->queue;
230         const struct ublk_param_discard *p = &ub->params.discard;
231
232         q->limits.discard_alignment = p->discard_alignment;
233         q->limits.discard_granularity = p->discard_granularity;
234         blk_queue_max_discard_sectors(q, p->max_discard_sectors);
235         blk_queue_max_write_zeroes_sectors(q,
236                         p->max_write_zeroes_sectors);
237         blk_queue_max_discard_segments(q, p->max_discard_segments);
238 }
239
240 static int ublk_validate_params(const struct ublk_device *ub)
241 {
242         /* basic param is the only one which must be set */
243         if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
244                 const struct ublk_param_basic *p = &ub->params.basic;
245
246                 if (p->logical_bs_shift > PAGE_SHIFT)
247                         return -EINVAL;
248
249                 if (p->logical_bs_shift > p->physical_bs_shift)
250                         return -EINVAL;
251
252                 if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
253                         return -EINVAL;
254         } else
255                 return -EINVAL;
256
257         if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
258                 const struct ublk_param_discard *p = &ub->params.discard;
259
260                 /* So far, only single-segment discard is supported */
261                 if (p->max_discard_sectors && p->max_discard_segments != 1)
262                         return -EINVAL;
263
264                 if (!p->discard_granularity)
265                         return -EINVAL;
266         }
267
268         /* dev_t is read-only */
269         if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
270                 return -EINVAL;
271
272         return 0;
273 }
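/*
 * For illustration only, a hypothetical parameter set that satisfies the
 * checks above (all values are made up, none are taken from this listing):
 * 512-byte logical / 4KiB physical blocks, max_sectors no larger than
 * max_io_buf_bytes >> 9, and a single-segment discard setup with a non-zero
 * granularity:
 *
 *	struct ublk_params p = {
 *		.types   = UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD,
 *		.basic   = {
 *			.logical_bs_shift  = 9,
 *			.physical_bs_shift = 12,
 *			.max_sectors       = 1024,
 *			.dev_sectors       = 2097152,	/* 1 GiB device */
 *		},
 *		.discard = {
 *			.discard_granularity  = 4096,
 *			.max_discard_sectors  = 1024,
 *			.max_discard_segments = 1,
 *		},
 *	};
 */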
274
275 static int ublk_apply_params(struct ublk_device *ub)
276 {
277         if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
278                 return -EINVAL;
279
280         ublk_dev_param_basic_apply(ub);
281
282         if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
283                 ublk_dev_param_discard_apply(ub);
284
285         return 0;
286 }
287
288 static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq)
289 {
290         if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) &&
291                         !(ubq->flags & UBLK_F_URING_CMD_COMP_IN_TASK))
292                 return true;
293         return false;
294 }
295
296 static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
297 {
298         if (ubq->flags & UBLK_F_NEED_GET_DATA)
299                 return true;
300         return false;
301 }
302
303 static struct ublk_device *ublk_get_device(struct ublk_device *ub)
304 {
305         if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
306                 return ub;
307         return NULL;
308 }
309
310 static void ublk_put_device(struct ublk_device *ub)
311 {
312         put_device(&ub->cdev_dev);
313 }
314
315 static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
316                 int qid)
317 {
318         return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
319 }
320
321 static inline bool ublk_rq_has_data(const struct request *rq)
322 {
323         return rq->bio && bio_has_data(rq->bio);
324 }
325
326 static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
327                 int tag)
328 {
329         return (struct ublksrv_io_desc *)
330                 &(ubq->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]);
331 }
332
333 static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
334 {
335         return ublk_get_queue(ub, q_id)->io_cmd_buf;
336 }
337
338 static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id)
339 {
340         struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
341
342         return round_up(ubq->q_depth * sizeof(struct ublksrv_io_desc),
343                         PAGE_SIZE);
344 }
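/*
 * Worked example for the size computed above, assuming the 24-byte uapi
 * struct ublksrv_io_desc: with q_depth = 128 the descriptors need
 * 128 * 24 = 3072 bytes, which round_up() pads to one PAGE_SIZE (4096 on
 * most architectures), so each queue's command buffer ends up page aligned
 * for the remap done in ublk_ch_mmap().
 */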
345
346 static inline bool ublk_queue_can_use_recovery_reissue(
347                 struct ublk_queue *ubq)
348 {
349         if ((ubq->flags & UBLK_F_USER_RECOVERY) &&
350                         (ubq->flags & UBLK_F_USER_RECOVERY_REISSUE))
351                 return true;
352         return false;
353 }
354
355 static inline bool ublk_queue_can_use_recovery(
356                 struct ublk_queue *ubq)
357 {
358         if (ubq->flags & UBLK_F_USER_RECOVERY)
359                 return true;
360         return false;
361 }
362
363 static inline bool ublk_can_use_recovery(struct ublk_device *ub)
364 {
365         if (ub->dev_info.flags & UBLK_F_USER_RECOVERY)
366                 return true;
367         return false;
368 }
369
370 static void ublk_free_disk(struct gendisk *disk)
371 {
372         struct ublk_device *ub = disk->private_data;
373
374         clear_bit(UB_STATE_USED, &ub->state);
375         put_device(&ub->cdev_dev);
376 }
377
378 static const struct block_device_operations ub_fops = {
379         .owner =        THIS_MODULE,
380         .free_disk =    ublk_free_disk,
381 };
382
383 #define UBLK_MAX_PIN_PAGES      32
384
385 struct ublk_map_data {
386         const struct ublk_queue *ubq;
387         const struct request *rq;
388         const struct ublk_io *io;
389         unsigned max_bytes;
390 };
391
392 struct ublk_io_iter {
393         struct page *pages[UBLK_MAX_PIN_PAGES];
394         unsigned pg_off;        /* offset in the 1st page in pages */
395         int nr_pages;           /* how many page pointers in pages */
396         struct bio *bio;
397         struct bvec_iter iter;
398 };
399
400 static inline unsigned ublk_copy_io_pages(struct ublk_io_iter *data,
401                 unsigned max_bytes, bool to_vm)
402 {
403         const unsigned total = min_t(unsigned, max_bytes,
404                         PAGE_SIZE - data->pg_off +
405                         ((data->nr_pages - 1) << PAGE_SHIFT));
406         unsigned done = 0;
407         unsigned pg_idx = 0;
408
409         while (done < total) {
410                 struct bio_vec bv = bio_iter_iovec(data->bio, data->iter);
411                 const unsigned int bytes = min3(bv.bv_len, total - done,
412                                 (unsigned)(PAGE_SIZE - data->pg_off));
413                 void *bv_buf = bvec_kmap_local(&bv);
414                 void *pg_buf = kmap_local_page(data->pages[pg_idx]);
415
416                 if (to_vm)
417                         memcpy(pg_buf + data->pg_off, bv_buf, bytes);
418                 else
419                         memcpy(bv_buf, pg_buf + data->pg_off, bytes);
420
421                 kunmap_local(pg_buf);
422                 kunmap_local(bv_buf);
423
424                 /* advance page array */
425                 data->pg_off += bytes;
426                 if (data->pg_off == PAGE_SIZE) {
427                         pg_idx += 1;
428                         data->pg_off = 0;
429                 }
430
431                 done += bytes;
432
433                 /* advance bio */
434                 bio_advance_iter_single(data->bio, &data->iter, bytes);
435                 if (!data->iter.bi_size) {
436                         data->bio = data->bio->bi_next;
437                         if (data->bio == NULL)
438                                 break;
439                         data->iter = data->bio->bi_iter;
440                 }
441         }
442
443         return done;
444 }
445
446 static inline int ublk_copy_user_pages(struct ublk_map_data *data,
447                 bool to_vm)
448 {
449         const unsigned int gup_flags = to_vm ? FOLL_WRITE : 0;
450         const unsigned long start_vm = data->io->addr;
451         unsigned int done = 0;
452         struct ublk_io_iter iter = {
453                 .pg_off = start_vm & (PAGE_SIZE - 1),
454                 .bio    = data->rq->bio,
455                 .iter   = data->rq->bio->bi_iter,
456         };
457         const unsigned int nr_pages = round_up(data->max_bytes +
458                         (start_vm & (PAGE_SIZE - 1)), PAGE_SIZE) >> PAGE_SHIFT;
459
460         while (done < nr_pages) {
461                 const unsigned to_pin = min_t(unsigned, UBLK_MAX_PIN_PAGES,
462                                 nr_pages - done);
463                 unsigned i, len;
464
465                 iter.nr_pages = get_user_pages_fast(start_vm +
466                                 (done << PAGE_SHIFT), to_pin, gup_flags,
467                                 iter.pages);
468                 if (iter.nr_pages <= 0)
469                         return done == 0 ? iter.nr_pages : done;
470                 len = ublk_copy_io_pages(&iter, data->max_bytes, to_vm);
471                 for (i = 0; i < iter.nr_pages; i++) {
472                         if (to_vm)
473                                 set_page_dirty(iter.pages[i]);
474                         put_page(iter.pages[i]);
475                 }
476                 data->max_bytes -= len;
477                 done += iter.nr_pages;
478         }
479
480         return done;
481 }
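/*
 * Note on ublk_copy_user_pages(): user pages are pinned in batches of at
 * most UBLK_MAX_PIN_PAGES (32) via get_user_pages_fast(), the data is moved
 * with the highmem-safe kmap_local helpers in ublk_copy_io_pages(), and the
 * pages are marked dirty (when data was copied into the daemon's buffer)
 * before being released with put_page().
 */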
482
483 static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
484                 struct ublk_io *io)
485 {
486         const unsigned int rq_bytes = blk_rq_bytes(req);
487         /*
488          * no zero copy: we delay copying WRITE request data into the ublksrv
489          * context, and the big benefit is that pinning pages in the current
490          * context is pretty fast, see ublk_copy_user_pages()
491          */
492         if (req_op(req) != REQ_OP_WRITE && req_op(req) != REQ_OP_FLUSH)
493                 return rq_bytes;
494
495         if (ublk_rq_has_data(req)) {
496                 struct ublk_map_data data = {
497                         .ubq    =       ubq,
498                         .rq     =       req,
499                         .io     =       io,
500                         .max_bytes =    rq_bytes,
501                 };
502
503                 ublk_copy_user_pages(&data, true);
504
505                 return rq_bytes - data.max_bytes;
506         }
507         return rq_bytes;
508 }
509
510 static int ublk_unmap_io(const struct ublk_queue *ubq,
511                 const struct request *req,
512                 struct ublk_io *io)
513 {
514         const unsigned int rq_bytes = blk_rq_bytes(req);
515
516         if (req_op(req) == REQ_OP_READ && ublk_rq_has_data(req)) {
517                 struct ublk_map_data data = {
518                         .ubq    =       ubq,
519                         .rq     =       req,
520                         .io     =       io,
521                         .max_bytes =    io->res,
522                 };
523
524                 WARN_ON_ONCE(io->res > rq_bytes);
525
526                 ublk_copy_user_pages(&data, false);
527
528                 return io->res - data.max_bytes;
529         }
530         return rq_bytes;
531 }
532
533 static inline unsigned int ublk_req_build_flags(struct request *req)
534 {
535         unsigned flags = 0;
536
537         if (req->cmd_flags & REQ_FAILFAST_DEV)
538                 flags |= UBLK_IO_F_FAILFAST_DEV;
539
540         if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
541                 flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
542
543         if (req->cmd_flags & REQ_FAILFAST_DRIVER)
544                 flags |= UBLK_IO_F_FAILFAST_DRIVER;
545
546         if (req->cmd_flags & REQ_META)
547                 flags |= UBLK_IO_F_META;
548
549         if (req->cmd_flags & REQ_FUA)
550                 flags |= UBLK_IO_F_FUA;
551
552         if (req->cmd_flags & REQ_NOUNMAP)
553                 flags |= UBLK_IO_F_NOUNMAP;
554
555         if (req->cmd_flags & REQ_SWAP)
556                 flags |= UBLK_IO_F_SWAP;
557
558         return flags;
559 }
560
561 static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
562 {
563         struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
564         struct ublk_io *io = &ubq->ios[req->tag];
565         u32 ublk_op;
566
567         switch (req_op(req)) {
568         case REQ_OP_READ:
569                 ublk_op = UBLK_IO_OP_READ;
570                 break;
571         case REQ_OP_WRITE:
572                 ublk_op = UBLK_IO_OP_WRITE;
573                 break;
574         case REQ_OP_FLUSH:
575                 ublk_op = UBLK_IO_OP_FLUSH;
576                 break;
577         case REQ_OP_DISCARD:
578                 ublk_op = UBLK_IO_OP_DISCARD;
579                 break;
580         case REQ_OP_WRITE_ZEROES:
581                 ublk_op = UBLK_IO_OP_WRITE_ZEROES;
582                 break;
583         default:
584                 return BLK_STS_IOERR;
585         }
586
587         /* need to translate since kernel may change */
588         iod->op_flags = ublk_op | ublk_req_build_flags(req);
589         iod->nr_sectors = blk_rq_sectors(req);
590         iod->start_sector = blk_rq_pos(req);
591         iod->addr = io->addr;
592
593         return BLK_STS_OK;
594 }
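/*
 * On the server side, the descriptor filled in above is read from the
 * mmap()ed command buffer. A minimal sketch of the decode, assuming the
 * helpers exported by the uapi header (cmd_buf/tag naming is illustrative):
 *
 *	const struct ublksrv_io_desc *iod = &cmd_buf[tag];
 *	__u8  op    = ublksrv_get_op(iod);	// UBLK_IO_OP_READ, _WRITE, ...
 *	__u32 flags = ublksrv_get_flags(iod);	// UBLK_IO_F_FUA, _META, ...
 *
 * iod->start_sector and iod->nr_sectors describe the LBA range, while
 * iod->addr carries the buffer address previously passed with FETCH_REQ.
 */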
595
596 static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
597                 struct io_uring_cmd *ioucmd)
598 {
599         return (struct ublk_uring_cmd_pdu *)&ioucmd->pdu;
600 }
601
602 static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq)
603 {
604         return ubq->ubq_daemon->flags & PF_EXITING;
605 }
606
607 /* todo: handle partial completion */
608 static void ublk_complete_rq(struct request *req)
609 {
610         struct ublk_queue *ubq = req->mq_hctx->driver_data;
611         struct ublk_io *io = &ubq->ios[req->tag];
612         unsigned int unmapped_bytes;
613
614         /* fail the read IO if nothing was read */
615         if (!io->res && req_op(req) == REQ_OP_READ)
616                 io->res = -EIO;
617
618         if (io->res < 0) {
619                 blk_mq_end_request(req, errno_to_blk_status(io->res));
620                 return;
621         }
622
623         /*
624          * FLUSH or DISCARD usually won't return any valid data bytes, so end
625          * them directly.
626          *
627          * Neither of the two needs unmapping.
628          */
629         if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) {
630                 blk_mq_end_request(req, BLK_STS_OK);
631                 return;
632         }
633
634         /* for a READ request, copy the data at iod->addr into the rq buffers */
635         unmapped_bytes = ublk_unmap_io(ubq, req, io);
636
637         /*
638          * Extremely unlikely since we got the data filled in just before
639          *
640          * Re-read simply for this unlikely case.
641          */
642         if (unlikely(unmapped_bytes < io->res))
643                 io->res = unmapped_bytes;
644
645         if (blk_update_request(req, BLK_STS_OK, io->res))
646                 blk_mq_requeue_request(req, true);
647         else
648                 __blk_mq_end_request(req, BLK_STS_OK);
649 }
650
651 /*
652  * Since __ublk_rq_task_work always fails requests immediately during
653  * exiting, __ublk_fail_req() is only called from abort context during
654  * exiting, so no lock is needed.
655  *
656  * Also, aborting may not have started yet; keep in mind that a failed
657  * request may be issued by the block layer again.
658  */
659 static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
660                 struct request *req)
661 {
662         WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
663
664         if (!(io->flags & UBLK_IO_FLAG_ABORTED)) {
665                 io->flags |= UBLK_IO_FLAG_ABORTED;
666                 if (ublk_queue_can_use_recovery_reissue(ubq))
667                         blk_mq_requeue_request(req, false);
668                 else
669                         blk_mq_end_request(req, BLK_STS_IOERR);
670         }
671 }
672
673 static void ubq_complete_io_cmd(struct ublk_io *io, int res)
674 {
675         /* mark this cmd owned by ublksrv */
676         io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
677
678         /*
679          * clear ACTIVE since we are done with this sqe/cmd slot
680          * We can only accept an io cmd when it is not active.
681          */
682         io->flags &= ~UBLK_IO_FLAG_ACTIVE;
683
684         /* tell ublksrv one io request is coming */
685         io_uring_cmd_done(io->cmd, res, 0);
686 }
687
688 #define UBLK_REQUEUE_DELAY_MS   3
689
690 static inline void __ublk_abort_rq(struct ublk_queue *ubq,
691                 struct request *rq)
692 {
693         /* We cannot process this rq so just requeue it. */
694         if (ublk_queue_can_use_recovery(ubq))
695                 blk_mq_requeue_request(rq, false);
696         else
697                 blk_mq_end_request(rq, BLK_STS_IOERR);
698
699         mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0);
700 }
701
702 static inline void __ublk_rq_task_work(struct request *req)
703 {
704         struct ublk_queue *ubq = req->mq_hctx->driver_data;
705         int tag = req->tag;
706         struct ublk_io *io = &ubq->ios[tag];
707         unsigned int mapped_bytes;
708
709         pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n",
710                         __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
711                         ublk_get_iod(ubq, req->tag)->addr);
712
713         /*
714          * Task is exiting if either:
715          *
716          * (1) current != ubq_daemon.
717          * io_uring_cmd_complete_in_task() tries to run task_work
718          * in a workqueue if ubq_daemon(cmd's task) is PF_EXITING.
719          *
720          * (2) current->flags & PF_EXITING.
721          */
722         if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) {
723                 __ublk_abort_rq(ubq, req);
724                 return;
725         }
726
727         if (ublk_need_get_data(ubq) &&
728                         (req_op(req) == REQ_OP_WRITE ||
729                         req_op(req) == REQ_OP_FLUSH)) {
730                 /*
731                  * We have not handled UBLK_IO_NEED_GET_DATA command yet,
732                  * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
733                  * and notify it.
734                  */
735                 if (!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) {
736                         io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
737                         pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n",
738                                         __func__, io->cmd->cmd_op, ubq->q_id,
739                                         req->tag, io->flags);
740                         ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA);
741                         return;
742                 }
743                 /*
744                  * We have handled UBLK_IO_NEED_GET_DATA command,
745                  * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
746                  * do the copy work.
747                  */
748                 io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
749                 /* update iod->addr because ublksrv may have passed a new io buffer */
750                 ublk_get_iod(ubq, req->tag)->addr = io->addr;
751                 pr_devel("%s: update iod->addr: op %d, qid %d tag %d io_flags %x addr %llx\n",
752                                 __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
753                                 ublk_get_iod(ubq, req->tag)->addr);
754         }
755
756         mapped_bytes = ublk_map_io(ubq, req, io);
757
758         /* partially mapped, update io descriptor */
759         if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
760                 /*
761                  * Nothing mapped, retry until we succeed.
762                  *
763                  * We may never succeed in mapping any bytes here because
764                  * of OOM. TODO: reserve one buffer with single page pinned
765                  * for providing forward progress guarantee.
766                  */
767                 if (unlikely(!mapped_bytes)) {
768                         blk_mq_requeue_request(req, false);
769                         blk_mq_delay_kick_requeue_list(req->q,
770                                         UBLK_REQUEUE_DELAY_MS);
771                         return;
772                 }
773
774                 ublk_get_iod(ubq, req->tag)->nr_sectors =
775                         mapped_bytes >> 9;
776         }
777
778         ubq_complete_io_cmd(io, UBLK_IO_RES_OK);
779 }
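/*
 * Summary of the UBLK_F_NEED_GET_DATA handling above: a WRITE (or FLUSH)
 * costs one extra round trip. The daemon first receives
 * UBLK_IO_RES_NEED_GET_DATA, replies with UBLK_IO_NEED_GET_DATA carrying a
 * fresh buffer address, the request is queued back to this handler, the
 * WRITE payload is copied by ublk_map_io(), and only then is UBLK_IO_RES_OK
 * delivered.
 */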
780
781 static inline void ublk_forward_io_cmds(struct ublk_queue *ubq)
782 {
783         struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds);
784         struct ublk_rq_data *data, *tmp;
785
786         io_cmds = llist_reverse_order(io_cmds);
787         llist_for_each_entry_safe(data, tmp, io_cmds, node)
788                 __ublk_rq_task_work(blk_mq_rq_from_pdu(data));
789 }
790
791 static inline void ublk_abort_io_cmds(struct ublk_queue *ubq)
792 {
793         struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds);
794         struct ublk_rq_data *data, *tmp;
795
796         llist_for_each_entry_safe(data, tmp, io_cmds, node)
797                 __ublk_abort_rq(ubq, blk_mq_rq_from_pdu(data));
798 }
799
800 static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd)
801 {
802         struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
803         struct ublk_queue *ubq = pdu->ubq;
804
805         ublk_forward_io_cmds(ubq);
806 }
807
808 static void ublk_rq_task_work_fn(struct callback_head *work)
809 {
810         struct ublk_rq_data *data = container_of(work,
811                         struct ublk_rq_data, work);
812         struct request *req = blk_mq_rq_from_pdu(data);
813         struct ublk_queue *ubq = req->mq_hctx->driver_data;
814
815         ublk_forward_io_cmds(ubq);
816 }
817
818 static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
819 {
820         struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
821         struct ublk_io *io;
822
823         if (!llist_add(&data->node, &ubq->io_cmds))
824                 return;
825
826         io = &ubq->ios[rq->tag];
827         /*
828          * If the check passes, we know that this is a re-issued request aborted
829          * previously in monitor_work because the ubq_daemon(cmd's task) is
830          * PF_EXITING. We cannot call io_uring_cmd_complete_in_task() anymore
831          * because this ioucmd's io_uring context may be freed now if no inflight
832          * ioucmd exists. Otherwise we may cause null-deref in ctx->fallback_work.
833          *
834          * Note: monitor_work sets UBLK_IO_FLAG_ABORTED and ends this request
835          * (releasing the tag). Then the request is re-started (allocating the tag)
836          * and we get here. Since releasing/allocating a tag implies smp_mb(), finding
837          * UBLK_IO_FLAG_ABORTED guarantees that this is a re-issued request aborted previously.
838          */
839         if (unlikely(io->flags & UBLK_IO_FLAG_ABORTED)) {
840                 ublk_abort_io_cmds(ubq);
841         } else if (ublk_can_use_task_work(ubq)) {
842                 if (task_work_add(ubq->ubq_daemon, &data->work,
843                                         TWA_SIGNAL_NO_IPI))
844                         ublk_abort_io_cmds(ubq);
845         } else {
846                 struct io_uring_cmd *cmd = io->cmd;
847                 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
848
849                 pdu->ubq = ubq;
850                 io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
851         }
852 }
853
854 static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
855                 const struct blk_mq_queue_data *bd)
856 {
857         struct ublk_queue *ubq = hctx->driver_data;
858         struct request *rq = bd->rq;
859         blk_status_t res;
860
861         /* fill iod to slot in io cmd buffer */
862         res = ublk_setup_iod(ubq, rq);
863         if (unlikely(res != BLK_STS_OK))
864                 return BLK_STS_IOERR;
865
866         /* With recovery feature enabled, force_abort is set in
867          * ublk_stop_dev() before calling del_gendisk(). We have to
868          * abort all requeued and new rqs here to let del_gendisk()
869          * move on. Besides, we cannot call io_uring_cmd_complete_in_task()
870          * to avoid UAF on io_uring ctx.
871          *
872          * Note: force_abort is guaranteed to be seen because it is set
873          * before the request queue is unquiesced.
874          */
875         if (ublk_queue_can_use_recovery(ubq) && unlikely(ubq->force_abort))
876                 return BLK_STS_IOERR;
877
878         blk_mq_start_request(bd->rq);
879
880         if (unlikely(ubq_daemon_is_dying(ubq))) {
881                 __ublk_abort_rq(ubq, rq);
882                 return BLK_STS_OK;
883         }
884
885         ublk_queue_cmd(ubq, rq);
886
887         return BLK_STS_OK;
888 }
889
890 static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
891                 unsigned int hctx_idx)
892 {
893         struct ublk_device *ub = driver_data;
894         struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
895
896         hctx->driver_data = ubq;
897         return 0;
898 }
899
900 static int ublk_init_rq(struct blk_mq_tag_set *set, struct request *req,
901                 unsigned int hctx_idx, unsigned int numa_node)
902 {
903         struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
904
905         init_task_work(&data->work, ublk_rq_task_work_fn);
906         return 0;
907 }
908
909 static const struct blk_mq_ops ublk_mq_ops = {
910         .queue_rq       = ublk_queue_rq,
911         .init_hctx      = ublk_init_hctx,
912         .init_request   = ublk_init_rq,
913 };
914
915 static int ublk_ch_open(struct inode *inode, struct file *filp)
916 {
917         struct ublk_device *ub = container_of(inode->i_cdev,
918                         struct ublk_device, cdev);
919
920         if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
921                 return -EBUSY;
922         filp->private_data = ub;
923         return 0;
924 }
925
926 static int ublk_ch_release(struct inode *inode, struct file *filp)
927 {
928         struct ublk_device *ub = filp->private_data;
929
930         clear_bit(UB_STATE_OPEN, &ub->state);
931         return 0;
932 }
933
934 /* map pre-allocated per-queue cmd buffer to ublksrv daemon */
935 static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
936 {
937         struct ublk_device *ub = filp->private_data;
938         size_t sz = vma->vm_end - vma->vm_start;
939         unsigned max_sz = UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc);
940         unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
941         int q_id, ret = 0;
942
943         spin_lock(&ub->mm_lock);
944         if (!ub->mm)
945                 ub->mm = current->mm;
946         if (current->mm != ub->mm)
947                 ret = -EINVAL;
948         spin_unlock(&ub->mm_lock);
949
950         if (ret)
951                 return ret;
952
953         if (vma->vm_flags & VM_WRITE)
954                 return -EPERM;
955
956         end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
957         if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
958                 return -EINVAL;
959
960         q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
961         pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
962                         __func__, q_id, current->pid, vma->vm_start,
963                         phys_off, (unsigned long)sz);
964
965         if (sz != ublk_queue_cmd_buf_size(ub, q_id))
966                 return -EINVAL;
967
968         pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
969         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
970 }
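/*
 * A minimal userspace sketch of the mapping served above (ublkc_fd, q_id and
 * cmd_buf_size are illustrative names; the constants come from the uapi
 * header <linux/ublk_cmd.h>):
 *
 *	off_t off = UBLKSRV_CMD_BUF_OFFSET +
 *		q_id * UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc);
 *	struct ublksrv_io_desc *cmd_buf = mmap(NULL, cmd_buf_size, PROT_READ,
 *			MAP_SHARED, ublkc_fd, off);
 *
 * cmd_buf_size has to equal ublk_queue_cmd_buf_size() for that queue, and
 * the mapping must be read-only since VM_WRITE is rejected above.
 */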
971
972 static void ublk_commit_completion(struct ublk_device *ub,
973                 struct ublksrv_io_cmd *ub_cmd)
974 {
975         u32 qid = ub_cmd->q_id, tag = ub_cmd->tag;
976         struct ublk_queue *ubq = ublk_get_queue(ub, qid);
977         struct ublk_io *io = &ubq->ios[tag];
978         struct request *req;
979
980         /* now this cmd slot is owned by the ublk driver */
981         io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
982         io->res = ub_cmd->result;
983
984         /* find the io request and complete */
985         req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
986
987         if (req && likely(!blk_should_fake_timeout(req->q)))
988                 ublk_complete_rq(req);
989 }
990
991 /*
992  * When ->ubq_daemon is exiting, either a new request is ended immediately,
993  * or any queued io command is drained, so it is safe to abort the queue
994  * locklessly
995  */
996 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
997 {
998         int i;
999
1000         if (!ublk_get_device(ub))
1001                 return;
1002
1003         for (i = 0; i < ubq->q_depth; i++) {
1004                 struct ublk_io *io = &ubq->ios[i];
1005
1006                 if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
1007                         struct request *rq;
1008
1009                         /*
1010                          * Either we fail the request or ublk_rq_task_work_fn
1011                          * will do it
1012                          */
1013                         rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
1014                         if (rq)
1015                                 __ublk_fail_req(ubq, io, rq);
1016                 }
1017         }
1018         ublk_put_device(ub);
1019 }
1020
1021 static void ublk_daemon_monitor_work(struct work_struct *work)
1022 {
1023         struct ublk_device *ub =
1024                 container_of(work, struct ublk_device, monitor_work.work);
1025         int i;
1026
1027         for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
1028                 struct ublk_queue *ubq = ublk_get_queue(ub, i);
1029
1030                 if (ubq_daemon_is_dying(ubq)) {
1031                         if (ublk_queue_can_use_recovery(ubq))
1032                                 schedule_work(&ub->quiesce_work);
1033                         else
1034                                 schedule_work(&ub->stop_work);
1035
1036                         /* aborting the queue is for making forward progress */
1037                         ublk_abort_queue(ub, ubq);
1038                 }
1039         }
1040
1041         /*
1042          * We can't schedule monitor work once ub's state is no longer UBLK_S_DEV_LIVE,
1043          * i.e. after ublk_remove() or __ublk_quiesce_dev() is started.
1044          *
1045          * No need for ub->mutex: monitor work is canceled after the state is marked
1046          * as not LIVE, so the new state is observed reliably.
1047          */
1048         if (ub->dev_info.state == UBLK_S_DEV_LIVE)
1049                 schedule_delayed_work(&ub->monitor_work,
1050                                 UBLK_DAEMON_MONITOR_PERIOD);
1051 }
1052
1053 static inline bool ublk_queue_ready(struct ublk_queue *ubq)
1054 {
1055         return ubq->nr_io_ready == ubq->q_depth;
1056 }
1057
1058 static void ublk_cancel_queue(struct ublk_queue *ubq)
1059 {
1060         int i;
1061
1062         if (!ublk_queue_ready(ubq))
1063                 return;
1064
1065         for (i = 0; i < ubq->q_depth; i++) {
1066                 struct ublk_io *io = &ubq->ios[i];
1067
1068                 if (io->flags & UBLK_IO_FLAG_ACTIVE)
1069                         io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0);
1070         }
1071
1072         /* all io commands are canceled */
1073         ubq->nr_io_ready = 0;
1074 }
1075
1076 /* Cancel all pending commands, must be called after del_gendisk() returns */
1077 static void ublk_cancel_dev(struct ublk_device *ub)
1078 {
1079         int i;
1080
1081         for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1082                 ublk_cancel_queue(ublk_get_queue(ub, i));
1083 }
1084
1085 static bool ublk_check_inflight_rq(struct request *rq, void *data)
1086 {
1087         bool *idle = data;
1088
1089         if (blk_mq_request_started(rq)) {
1090                 *idle = false;
1091                 return false;
1092         }
1093         return true;
1094 }
1095
1096 static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
1097 {
1098         bool idle;
1099
1100         WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
1101         while (true) {
1102                 idle = true;
1103                 blk_mq_tagset_busy_iter(&ub->tag_set,
1104                                 ublk_check_inflight_rq, &idle);
1105                 if (idle)
1106                         break;
1107                 msleep(UBLK_REQUEUE_DELAY_MS);
1108         }
1109 }
1110
1111 static void __ublk_quiesce_dev(struct ublk_device *ub)
1112 {
1113         pr_devel("%s: quiesce ub: dev_id %d state %s\n",
1114                         __func__, ub->dev_info.dev_id,
1115                         ub->dev_info.state == UBLK_S_DEV_LIVE ?
1116                         "LIVE" : "QUIESCED");
1117         blk_mq_quiesce_queue(ub->ub_disk->queue);
1118         ublk_wait_tagset_rqs_idle(ub);
1119         ub->dev_info.state = UBLK_S_DEV_QUIESCED;
1120         ublk_cancel_dev(ub);
1121         /* We are going to release the task_struct of ubq_daemon and reset
1122          * ->ubq_daemon to NULL, so checking ubq_daemon in monitor_work would cause a UAF.
1123          * Besides, monitor_work is not necessary in the QUIESCED state since we have
1124          * already scheduled quiesce_work and quiesced all ubqs.
1125          *
1126          * Do not let monitor_work schedule itself if the state is QUIESCED. We cancel
1127          * it here and re-schedule it in END_USER_RECOVERY to avoid the UAF.
1128          */
1129         cancel_delayed_work_sync(&ub->monitor_work);
1130 }
1131
1132 static void ublk_quiesce_work_fn(struct work_struct *work)
1133 {
1134         struct ublk_device *ub =
1135                 container_of(work, struct ublk_device, quiesce_work);
1136
1137         mutex_lock(&ub->mutex);
1138         if (ub->dev_info.state != UBLK_S_DEV_LIVE)
1139                 goto unlock;
1140         __ublk_quiesce_dev(ub);
1141  unlock:
1142         mutex_unlock(&ub->mutex);
1143 }
1144
1145 static void ublk_unquiesce_dev(struct ublk_device *ub)
1146 {
1147         int i;
1148
1149         pr_devel("%s: unquiesce ub: dev_id %d state %s\n",
1150                         __func__, ub->dev_info.dev_id,
1151                         ub->dev_info.state == UBLK_S_DEV_LIVE ?
1152                         "LIVE" : "QUIESCED");
1153         /* quiesce_work has run. We let requeued rqs be aborted
1154          * before running fallback_wq. "force_abort" must be seen
1155          * after the request queue is unquiesced. Then del_gendisk()
1156          * can move on.
1157          */
1158         for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1159                 ublk_get_queue(ub, i)->force_abort = true;
1160
1161         blk_mq_unquiesce_queue(ub->ub_disk->queue);
1162         /* We may have requeued some rqs in ublk_quiesce_queue() */
1163         blk_mq_kick_requeue_list(ub->ub_disk->queue);
1164 }
1165
1166 static void ublk_stop_dev(struct ublk_device *ub)
1167 {
1168         mutex_lock(&ub->mutex);
1169         if (ub->dev_info.state == UBLK_S_DEV_DEAD)
1170                 goto unlock;
1171         if (ublk_can_use_recovery(ub)) {
1172                 if (ub->dev_info.state == UBLK_S_DEV_LIVE)
1173                         __ublk_quiesce_dev(ub);
1174                 ublk_unquiesce_dev(ub);
1175         }
1176         del_gendisk(ub->ub_disk);
1177         ub->dev_info.state = UBLK_S_DEV_DEAD;
1178         ub->dev_info.ublksrv_pid = -1;
1179         put_disk(ub->ub_disk);
1180         ub->ub_disk = NULL;
1181  unlock:
1182         ublk_cancel_dev(ub);
1183         mutex_unlock(&ub->mutex);
1184         cancel_delayed_work_sync(&ub->monitor_work);
1185 }
1186
1187 /* device can only be started after all IOs are ready */
1188 static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
1189 {
1190         mutex_lock(&ub->mutex);
1191         ubq->nr_io_ready++;
1192         if (ublk_queue_ready(ubq)) {
1193                 ubq->ubq_daemon = current;
1194                 get_task_struct(ubq->ubq_daemon);
1195                 ub->nr_queues_ready++;
1196
1197                 if (capable(CAP_SYS_ADMIN))
1198                         ub->nr_privileged_daemon++;
1199         }
1200         if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues)
1201                 complete_all(&ub->completion);
1202         mutex_unlock(&ub->mutex);
1203 }
1204
1205 static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id,
1206                 int tag)
1207 {
1208         struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
1209         struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
1210
1211         ublk_queue_cmd(ubq, req);
1212 }
1213
1214 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
1215 {
1216         struct ublksrv_io_cmd *ub_cmd = (struct ublksrv_io_cmd *)cmd->cmd;
1217         struct ublk_device *ub = cmd->file->private_data;
1218         struct ublk_queue *ubq;
1219         struct ublk_io *io;
1220         u32 cmd_op = cmd->cmd_op;
1221         unsigned tag = ub_cmd->tag;
1222         int ret = -EINVAL;
1223
1224         pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
1225                         __func__, cmd->cmd_op, ub_cmd->q_id, tag,
1226                         ub_cmd->result);
1227
1228         if (!(issue_flags & IO_URING_F_SQE128))
1229                 goto out;
1230
1231         if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
1232                 goto out;
1233
1234         ubq = ublk_get_queue(ub, ub_cmd->q_id);
1235         if (!ubq || ub_cmd->q_id != ubq->q_id)
1236                 goto out;
1237
1238         if (ubq->ubq_daemon && ubq->ubq_daemon != current)
1239                 goto out;
1240
1241         if (tag >= ubq->q_depth)
1242                 goto out;
1243
1244         io = &ubq->ios[tag];
1245
1246         /* there is pending io cmd, something must be wrong */
1247         if (io->flags & UBLK_IO_FLAG_ACTIVE) {
1248                 ret = -EBUSY;
1249                 goto out;
1250         }
1251
1252         /*
1253          * ensure that the user issues UBLK_IO_NEED_GET_DATA
1254          * iff the driver has set UBLK_IO_FLAG_NEED_GET_DATA.
1255          */
1256         if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
1257                         ^ (cmd_op == UBLK_IO_NEED_GET_DATA))
1258                 goto out;
1259
1260         switch (cmd_op) {
1261         case UBLK_IO_FETCH_REQ:
1262                 /* UBLK_IO_FETCH_REQ is only allowed before the queue is set up */
1263                 if (ublk_queue_ready(ubq)) {
1264                         ret = -EBUSY;
1265                         goto out;
1266                 }
1267                 /*
1268                  * The io is being handled by server, so COMMIT_RQ is expected
1269                  * instead of FETCH_REQ
1270                  */
1271                 if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
1272                         goto out;
1273                 /* FETCH_REQ has to provide an IO buffer */
1274                 if (!ub_cmd->addr)
1275                         goto out;
1276                 io->cmd = cmd;
1277                 io->flags |= UBLK_IO_FLAG_ACTIVE;
1278                 io->addr = ub_cmd->addr;
1279
1280                 ublk_mark_io_ready(ub, ubq);
1281                 break;
1282         case UBLK_IO_COMMIT_AND_FETCH_REQ:
1283                 /* COMMIT_AND_FETCH_REQ has to provide an IO buffer */
1284                 if (!ub_cmd->addr)
1285                         goto out;
1286                 if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
1287                         goto out;
1288                 io->addr = ub_cmd->addr;
1289                 io->flags |= UBLK_IO_FLAG_ACTIVE;
1290                 io->cmd = cmd;
1291                 ublk_commit_completion(ub, ub_cmd);
1292                 break;
1293         case UBLK_IO_NEED_GET_DATA:
1294                 if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
1295                         goto out;
1296                 io->addr = ub_cmd->addr;
1297                 io->cmd = cmd;
1298                 io->flags |= UBLK_IO_FLAG_ACTIVE;
1299                 ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag);
1300                 break;
1301         default:
1302                 goto out;
1303         }
1304         return -EIOCBQUEUED;
1305
1306  out:
1307         io_uring_cmd_done(cmd, ret, 0);
1308         pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
1309                         __func__, cmd_op, tag, ret, io->flags);
1310         return -EIOCBQUEUED;
1311 }
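/*
 * For reference, a rough sketch of how the daemon issues the commands parsed
 * above: a 128-byte SQE (ring created with IORING_SETUP_SQE128) whose payload
 * area carries struct ublksrv_io_cmd. Field names are from the uapi headers;
 * the surrounding ring/setup code is assumed, not shown:
 *
 *	sqe->opcode = IORING_OP_URING_CMD;
 *	sqe->fd     = ublkc_fd;			// /dev/ublkcN char device
 *	sqe->cmd_op = UBLK_IO_FETCH_REQ;	// or UBLK_IO_COMMIT_AND_FETCH_REQ
 *	io_cmd = (struct ublksrv_io_cmd *)sqe->cmd;
 *	io_cmd->q_id = q_id;
 *	io_cmd->tag  = tag;
 *	io_cmd->addr = (__u64)(uintptr_t)io_buffer;
 *
 * The command then stays in flight (-EIOCBQUEUED here) until the driver
 * completes it with UBLK_IO_RES_OK, UBLK_IO_RES_NEED_GET_DATA or
 * UBLK_IO_RES_ABORT.
 */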
1312
1313 static const struct file_operations ublk_ch_fops = {
1314         .owner = THIS_MODULE,
1315         .open = ublk_ch_open,
1316         .release = ublk_ch_release,
1317         .llseek = no_llseek,
1318         .uring_cmd = ublk_ch_uring_cmd,
1319         .mmap = ublk_ch_mmap,
1320 };
1321
1322 static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
1323 {
1324         int size = ublk_queue_cmd_buf_size(ub, q_id);
1325         struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
1326
1327         if (ubq->ubq_daemon)
1328                 put_task_struct(ubq->ubq_daemon);
1329         if (ubq->io_cmd_buf)
1330                 free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
1331 }
1332
1333 static int ublk_init_queue(struct ublk_device *ub, int q_id)
1334 {
1335         struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
1336         gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
1337         void *ptr;
1338         int size;
1339
1340         ubq->flags = ub->dev_info.flags;
1341         ubq->q_id = q_id;
1342         ubq->q_depth = ub->dev_info.queue_depth;
1343         size = ublk_queue_cmd_buf_size(ub, q_id);
1344
1345         ptr = (void *) __get_free_pages(gfp_flags, get_order(size));
1346         if (!ptr)
1347                 return -ENOMEM;
1348
1349         ubq->io_cmd_buf = ptr;
1350         ubq->dev = ub;
1351         return 0;
1352 }
1353
1354 static void ublk_deinit_queues(struct ublk_device *ub)
1355 {
1356         int nr_queues = ub->dev_info.nr_hw_queues;
1357         int i;
1358
1359         if (!ub->__queues)
1360                 return;
1361
1362         for (i = 0; i < nr_queues; i++)
1363                 ublk_deinit_queue(ub, i);
1364         kfree(ub->__queues);
1365 }
1366
1367 static int ublk_init_queues(struct ublk_device *ub)
1368 {
1369         int nr_queues = ub->dev_info.nr_hw_queues;
1370         int depth = ub->dev_info.queue_depth;
1371         int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io);
1372         int i, ret = -ENOMEM;
1373
1374         ub->queue_size = ubq_size;
1375         ub->__queues = kcalloc(nr_queues, ubq_size, GFP_KERNEL);
1376         if (!ub->__queues)
1377                 return ret;
1378
1379         for (i = 0; i < nr_queues; i++) {
1380                 if (ublk_init_queue(ub, i))
1381                         goto fail;
1382         }
1383
1384         init_completion(&ub->completion);
1385         return 0;
1386
1387  fail:
1388         ublk_deinit_queues(ub);
1389         return ret;
1390 }
1391
1392 static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
1393 {
1394         int i = idx;
1395         int err;
1396
1397         spin_lock(&ublk_idr_lock);
1398         /* allocate id; if @idx >= 0, we're requesting that specific id */
1399         if (i >= 0) {
1400                 err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
1401                 if (err == -ENOSPC)
1402                         err = -EEXIST;
1403         } else {
1404                 err = idr_alloc(&ublk_index_idr, ub, 0, 0, GFP_NOWAIT);
1405         }
1406         spin_unlock(&ublk_idr_lock);
1407
1408         if (err >= 0)
1409                 ub->ub_number = err;
1410
1411         return err;
1412 }
1413
1414 static void ublk_free_dev_number(struct ublk_device *ub)
1415 {
1416         spin_lock(&ublk_idr_lock);
1417         idr_remove(&ublk_index_idr, ub->ub_number);
1418         wake_up_all(&ublk_idr_wq);
1419         spin_unlock(&ublk_idr_lock);
1420 }
1421
1422 static void ublk_cdev_rel(struct device *dev)
1423 {
1424         struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);
1425
1426         blk_mq_free_tag_set(&ub->tag_set);
1427         ublk_deinit_queues(ub);
1428         ublk_free_dev_number(ub);
1429         mutex_destroy(&ub->mutex);
1430         kfree(ub);
1431 }
1432
1433 static int ublk_add_chdev(struct ublk_device *ub)
1434 {
1435         struct device *dev = &ub->cdev_dev;
1436         int minor = ub->ub_number;
1437         int ret;
1438
1439         dev->parent = ublk_misc.this_device;
1440         dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
1441         dev->class = ublk_chr_class;
1442         dev->release = ublk_cdev_rel;
1443         device_initialize(dev);
1444
1445         ret = dev_set_name(dev, "ublkc%d", minor);
1446         if (ret)
1447                 goto fail;
1448
1449         cdev_init(&ub->cdev, &ublk_ch_fops);
1450         ret = cdev_device_add(&ub->cdev, dev);
1451         if (ret)
1452                 goto fail;
1453
1454         ublks_added++;
1455         return 0;
1456  fail:
1457         put_device(dev);
1458         return ret;
1459 }
1460
1461 static void ublk_stop_work_fn(struct work_struct *work)
1462 {
1463         struct ublk_device *ub =
1464                 container_of(work, struct ublk_device, stop_work);
1465
1466         ublk_stop_dev(ub);
1467 }
1468
1469 /* align max io buffer size with PAGE_SIZE */
1470 static void ublk_align_max_io_size(struct ublk_device *ub)
1471 {
1472         unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
1473
1474         ub->dev_info.max_io_buf_bytes =
1475                 round_down(max_io_bytes, PAGE_SIZE);
1476 }
1477
1478 static int ublk_add_tag_set(struct ublk_device *ub)
1479 {
1480         ub->tag_set.ops = &ublk_mq_ops;
1481         ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
1482         ub->tag_set.queue_depth = ub->dev_info.queue_depth;
1483         ub->tag_set.numa_node = NUMA_NO_NODE;
1484         ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
1485         ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
1486         ub->tag_set.driver_data = ub;
1487         return blk_mq_alloc_tag_set(&ub->tag_set);
1488 }
1489
1490 static void ublk_remove(struct ublk_device *ub)
1491 {
1492         ublk_stop_dev(ub);
1493         cancel_work_sync(&ub->stop_work);
1494         cancel_work_sync(&ub->quiesce_work);
1495         cdev_device_del(&ub->cdev, &ub->cdev_dev);
1496         put_device(&ub->cdev_dev);
1497         ublks_added--;
1498 }
1499
1500 static struct ublk_device *ublk_get_device_from_id(int idx)
1501 {
1502         struct ublk_device *ub = NULL;
1503
1504         if (idx < 0)
1505                 return NULL;
1506
1507         spin_lock(&ublk_idr_lock);
1508         ub = idr_find(&ublk_index_idr, idx);
1509         if (ub)
1510                 ub = ublk_get_device(ub);
1511         spin_unlock(&ublk_idr_lock);
1512
1513         return ub;
1514 }
1515
1516 static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
1517 {
1518         struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1519         int ublksrv_pid = (int)header->data[0];
1520         struct gendisk *disk;
1521         int ret = -EINVAL;
1522
1523         if (ublksrv_pid <= 0)
1524                 return -EINVAL;
1525
1526         wait_for_completion_interruptible(&ub->completion);
1527
1528         schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD);
1529
1530         mutex_lock(&ub->mutex);
1531         if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
1532             test_bit(UB_STATE_USED, &ub->state)) {
1533                 ret = -EEXIST;
1534                 goto out_unlock;
1535         }
1536
1537         disk = blk_mq_alloc_disk(&ub->tag_set, ub);
1538         if (IS_ERR(disk)) {
1539                 ret = PTR_ERR(disk);
1540                 goto out_unlock;
1541         }
1542         sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
1543         disk->fops = &ub_fops;
1544         disk->private_data = ub;
1545
1546         ub->dev_info.ublksrv_pid = ublksrv_pid;
1547         ub->ub_disk = disk;
1548
1549         ret = ublk_apply_params(ub);
1550         if (ret)
1551                 goto out_put_disk;
1552
1553         /* don't probe partitions if any ubq daemon is untrusted */
1554         if (ub->nr_privileged_daemon != ub->nr_queues_ready)
1555                 set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
1556
1557         get_device(&ub->cdev_dev);
1558         ret = add_disk(disk);
1559         if (ret) {
1560                 /*
1561                  * We have to drop the reference since ->free_disk won't be
1562                  * called in case of add_disk() failure.
1563                  */
1564                 ublk_put_device(ub);
1565                 goto out_put_disk;
1566         }
1567         set_bit(UB_STATE_USED, &ub->state);
1568         ub->dev_info.state = UBLK_S_DEV_LIVE;
1569 out_put_disk:
1570         if (ret)
1571                 put_disk(disk);
1572 out_unlock:
1573         mutex_unlock(&ub->mutex);
1574         return ret;
1575 }
1576
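/*
 * GET_QUEUE_AFFINITY: report the cpumask of CPUs mapped to the requested
 * hardware queue, zero-filling whatever tail of the user buffer the mask
 * does not cover.
 */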
1577 static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
1578                 struct io_uring_cmd *cmd)
1579 {
1580         struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1581         void __user *argp = (void __user *)(unsigned long)header->addr;
1582         cpumask_var_t cpumask;
1583         unsigned long queue;
1584         unsigned int retlen;
1585         unsigned int i;
1586         int ret;
1587
1588         if (header->len * BITS_PER_BYTE < nr_cpu_ids)
1589                 return -EINVAL;
1590         if (header->len & (sizeof(unsigned long)-1))
1591                 return -EINVAL;
1592         if (!header->addr)
1593                 return -EINVAL;
1594
1595         queue = header->data[0];
1596         if (queue >= ub->dev_info.nr_hw_queues)
1597                 return -EINVAL;
1598
1599         if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
1600                 return -ENOMEM;
1601
1602         for_each_possible_cpu(i) {
1603                 if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
1604                         cpumask_set_cpu(i, cpumask);
1605         }
1606
1607         ret = -EFAULT;
1608         retlen = min_t(unsigned short, header->len, cpumask_size());
1609         if (copy_to_user(argp, cpumask, retlen))
1610                 goto out_free_cpumask;
1611         if (retlen != header->len &&
1612             clear_user(argp + retlen, header->len - retlen))
1613                 goto out_free_cpumask;
1614
1615         ret = 0;
1616 out_free_cpumask:
1617         free_cpumask_var(cpumask);
1618         return ret;
1619 }
1620
1621 static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
1622 {
1623         pr_devel("%s: dev id %d flags %llx\n", __func__,
1624                         info->dev_id, info->flags);
1625         pr_devel("\t nr_hw_queues %d queue_depth %d\n",
1626                         info->nr_hw_queues, info->queue_depth);
1627 }
1628
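/*
 * ADD_DEV: validate the dev_info passed from userspace, allocate a device
 * number, queues and tag set, negotiate the feature flags, and create the
 * /dev/ublkc<N> char dev the ublksrv daemon attaches to.  The request is
 * rejected with -EACCES once ublks_added reaches ublks_max.
 */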
1629 static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
1630 {
1631         struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1632         void __user *argp = (void __user *)(unsigned long)header->addr;
1633         struct ublksrv_ctrl_dev_info info;
1634         struct ublk_device *ub;
1635         int ret = -EINVAL;
1636
1637         if (header->len < sizeof(info) || !header->addr)
1638                 return -EINVAL;
1639         if (header->queue_id != (u16)-1) {
1640                 pr_warn("%s: queue_id is wrong %x\n",
1641                         __func__, header->queue_id);
1642                 return -EINVAL;
1643         }
1644         if (copy_from_user(&info, argp, sizeof(info)))
1645                 return -EFAULT;
1646         ublk_dump_dev_info(&info);
1647         if (header->dev_id != info.dev_id) {
1648                 pr_warn("%s: dev id doesn't match: %u %u\n",
1649                         __func__, header->dev_id, info.dev_id);
1650                 return -EINVAL;
1651         }
1652
1653         ret = mutex_lock_killable(&ublk_ctl_mutex);
1654         if (ret)
1655                 return ret;
1656
1657         ret = -EACCES;
1658         if (ublks_added >= ublks_max)
1659                 goto out_unlock;
1660
1661         ret = -ENOMEM;
1662         ub = kzalloc(sizeof(*ub), GFP_KERNEL);
1663         if (!ub)
1664                 goto out_unlock;
1665         mutex_init(&ub->mutex);
1666         spin_lock_init(&ub->mm_lock);
1667         INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn);
1668         INIT_WORK(&ub->stop_work, ublk_stop_work_fn);
1669         INIT_DELAYED_WORK(&ub->monitor_work, ublk_daemon_monitor_work);
1670
1671         ret = ublk_alloc_dev_number(ub, header->dev_id);
1672         if (ret < 0)
1673                 goto out_free_ub;
1674
1675         memcpy(&ub->dev_info, &info, sizeof(info));
1676
1677         /* update device id */
1678         ub->dev_info.dev_id = ub->ub_number;
1679
1680         /*
1681          * The 64-bit flags are copied back to userspace as the result of
1682          * feature negotiation, so clear any flags the driver doesn't
1683          * support yet; userspace then sees exactly the features it can
1684          * rely on.
1685          */
1686         ub->dev_info.flags &= UBLK_F_ALL;
1687
1688         if (!IS_BUILTIN(CONFIG_BLK_DEV_UBLK))
1689                 ub->dev_info.flags |= UBLK_F_URING_CMD_COMP_IN_TASK;
1690
1691         /* We are not ready to support zero copy */
1692         ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
1693
1694         ub->dev_info.nr_hw_queues = min_t(unsigned int,
1695                         ub->dev_info.nr_hw_queues, nr_cpu_ids);
1696         ublk_align_max_io_size(ub);
1697
1698         ret = ublk_init_queues(ub);
1699         if (ret)
1700                 goto out_free_dev_number;
1701
1702         ret = ublk_add_tag_set(ub);
1703         if (ret)
1704                 goto out_deinit_queues;
1705
1706         ret = -EFAULT;
1707         if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
1708                 goto out_free_tag_set;
1709
1710         /*
1711          * Add the char dev so that the ublksrv daemon can be set up.
1712          * ublk_add_chdev() will clean up everything if it fails.
1713          */
1714         ret = ublk_add_chdev(ub);
1715         goto out_unlock;
1716
1717 out_free_tag_set:
1718         blk_mq_free_tag_set(&ub->tag_set);
1719 out_deinit_queues:
1720         ublk_deinit_queues(ub);
1721 out_free_dev_number:
1722         ublk_free_dev_number(ub);
1723 out_free_ub:
1724         mutex_destroy(&ub->mutex);
1725         kfree(ub);
1726 out_unlock:
1727         mutex_unlock(&ublk_ctl_mutex);
1728         return ret;
1729 }
1730
1731 static inline bool ublk_idr_freed(int id)
1732 {
1733         void *ptr;
1734
1735         spin_lock(&ublk_idr_lock);
1736         ptr = idr_find(&ublk_index_idr, id);
1737         spin_unlock(&ublk_idr_lock);
1738
1739         return ptr == NULL;
1740 }
1741
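/*
 * DEL_DEV: remove the device, drop the caller's reference and wait until
 * the idr entry is gone so the index can be reused right after the
 * command returns.
 */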
1742 static int ublk_ctrl_del_dev(struct ublk_device **p_ub)
1743 {
1744         struct ublk_device *ub = *p_ub;
1745         int idx = ub->ub_number;
1746         int ret;
1747
1748         ret = mutex_lock_killable(&ublk_ctl_mutex);
1749         if (ret)
1750                 return ret;
1751
1752         ublk_remove(ub);
1753
1754         /* Mark the reference as consumed */
1755         *p_ub = NULL;
1756         ublk_put_device(ub);
1757
1758         /*
1759          * Wait until the idr entry is removed, so the index can be reused
1760          * once the DEL_DEV command returns.
1761          */
1762         wait_event(ublk_idr_wq, ublk_idr_freed(idx));
1763         mutex_unlock(&ublk_ctl_mutex);
1764
1765         return ret;
1766 }
1767
1768 static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd)
1769 {
1770         struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1771
1772         pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
1773                         __func__, cmd->cmd_op, header->dev_id, header->queue_id,
1774                         header->data[0], header->addr, header->len);
1775 }
1776
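/* STOP_DEV: stop the disk and make sure stop/quiesce work has finished */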
1777 static int ublk_ctrl_stop_dev(struct ublk_device *ub)
1778 {
1779         ublk_stop_dev(ub);
1780         cancel_work_sync(&ub->stop_work);
1781         cancel_work_sync(&ub->quiesce_work);
1782
1783         return 0;
1784 }
1785
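/* GET_DEV_INFO: copy the current ublksrv_ctrl_dev_info back to userspace */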
1786 static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
1787                 struct io_uring_cmd *cmd)
1788 {
1789         struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1790         void __user *argp = (void __user *)(unsigned long)header->addr;
1791
1792         if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
1793                 return -EINVAL;
1794
1795         if (copy_to_user(argp, &ub->dev_info, sizeof(ub->dev_info)))
1796                 return -EFAULT;
1797
1798         return 0;
1799 }
1800
1801 /* TYPE_DEVT is read-only, so fill it in before returning to userspace */
1802 static void ublk_ctrl_fill_params_devt(struct ublk_device *ub)
1803 {
1804         ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt);
1805         ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt);
1806
1807         if (ub->ub_disk) {
1808                 ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk));
1809                 ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk));
1810         } else {
1811                 ub->params.devt.disk_major = 0;
1812                 ub->params.devt.disk_minor = 0;
1813         }
1814         ub->params.types |= UBLK_PARAM_TYPE_DEVT;
1815 }
1816
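/*
 * GET_PARAMS: fill in the read-only DEVT parameters, then copy at most
 * sizeof(struct ublk_params) bytes back to userspace.
 */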
1817 static int ublk_ctrl_get_params(struct ublk_device *ub,
1818                 struct io_uring_cmd *cmd)
1819 {
1820         struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1821         void __user *argp = (void __user *)(unsigned long)header->addr;
1822         struct ublk_params_header ph;
1823         int ret;
1824
1825         if (header->len <= sizeof(ph) || !header->addr)
1826                 return -EINVAL;
1827
1828         if (copy_from_user(&ph, argp, sizeof(ph)))
1829                 return -EFAULT;
1830
1831         if (ph.len > header->len || !ph.len)
1832                 return -EINVAL;
1833
1834         if (ph.len > sizeof(struct ublk_params))
1835                 ph.len = sizeof(struct ublk_params);
1836
1837         mutex_lock(&ub->mutex);
1838         ublk_ctrl_fill_params_devt(ub);
1839         if (copy_to_user(argp, &ub->params, ph.len))
1840                 ret = -EFAULT;
1841         else
1842                 ret = 0;
1843         mutex_unlock(&ub->mutex);
1844
1845         return ret;
1846 }
1847
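/*
 * SET_PARAMS: stage new parameters from userspace; only allowed while the
 * device isn't live, and unsupported parameter types are masked off before
 * validation.
 */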
1848 static int ublk_ctrl_set_params(struct ublk_device *ub,
1849                 struct io_uring_cmd *cmd)
1850 {
1851         struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1852         void __user *argp = (void __user *)(unsigned long)header->addr;
1853         struct ublk_params_header ph;
1854         int ret = -EFAULT;
1855
1856         if (header->len <= sizeof(ph) || !header->addr)
1857                 return -EINVAL;
1858
1859         if (copy_from_user(&ph, argp, sizeof(ph)))
1860                 return -EFAULT;
1861
1862         if (ph.len > header->len || !ph.len || !ph.types)
1863                 return -EINVAL;
1864
1865         if (ph.len > sizeof(struct ublk_params))
1866                 ph.len = sizeof(struct ublk_params);
1867
1868         /* parameters can only be changed when device isn't live */
1869         mutex_lock(&ub->mutex);
1870         if (ub->dev_info.state == UBLK_S_DEV_LIVE) {
1871                 ret = -EACCES;
1872         } else if (copy_from_user(&ub->params, argp, ph.len)) {
1873                 ret = -EFAULT;
1874         } else {
1875                 /* clear all we don't support yet */
1876                 ub->params.types &= UBLK_PARAM_TYPE_ALL;
1877                 ret = ublk_validate_params(ub);
1878         }
1879         mutex_unlock(&ub->mutex);
1880
1881         return ret;
1882 }
1883
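/*
 * Reset a queue for user recovery: drop the reference to the dying daemon
 * and clear every io slot so a new daemon can issue fresh FETCH_REQs.
 */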
1884 static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
1885 {
1886         int i;
1887
1888         WARN_ON_ONCE(!(ubq->ubq_daemon && ubq_daemon_is_dying(ubq)));
1889         /* All old ioucmds have to be completed */
1890         WARN_ON_ONCE(ubq->nr_io_ready);
1891         /* old daemon is PF_EXITING, put it now */
1892         put_task_struct(ubq->ubq_daemon);
1893         /* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */
1894         ubq->ubq_daemon = NULL;
1895
1896         for (i = 0; i < ubq->q_depth; i++) {
1897                 struct ublk_io *io = &ubq->ios[i];
1898
1899                 /* forget everything now and be ready for new FETCH_REQ */
1900                 io->flags = 0;
1901                 io->cmd = NULL;
1902                 io->addr = 0;
1903         }
1904 }
1905
1906 static int ublk_ctrl_start_recovery(struct ublk_device *ub,
1907                 struct io_uring_cmd *cmd)
1908 {
1909         struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1910         int ret = -EINVAL;
1911         int i;
1912
1913         mutex_lock(&ub->mutex);
1914         if (!ublk_can_use_recovery(ub))
1915                 goto out_unlock;
1916         /*
1917          * START_RECOVERY is only allowed after:
1918          *
1919          * (1) UB_STATE_OPEN is not set, which means the dying process has exited
1920          *     and the related io_uring ctx is freed, so the file struct of
1921          *     /dev/ublkcX is released.
1922          *
1923          * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
1924          *     (a) has quiesced the request queue,
1925          *     (b) has requeued every inflight rq whose io_flags is ACTIVE,
1926          *     (c) has requeued/aborted every inflight rq whose io_flags is NOT ACTIVE,
1927          *     (d) has completed/canceled all ioucmds owned by the dying process.
1928          */
1929         if (test_bit(UB_STATE_OPEN, &ub->state) ||
1930                         ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
1931                 ret = -EBUSY;
1932                 goto out_unlock;
1933         }
1934         pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id);
1935         for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1936                 ublk_queue_reinit(ub, ublk_get_queue(ub, i));
1937         /* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */
1938         ub->mm = NULL;
1939         ub->nr_queues_ready = 0;
1940         ub->nr_privileged_daemon = 0;
1941         init_completion(&ub->completion);
1942         ret = 0;
1943  out_unlock:
1944         mutex_unlock(&ub->mutex);
1945         return ret;
1946 }
1947
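/*
 * END_USER_RECOVERY: wait for the new daemons to fetch all io commands,
 * record the new ublksrv pid, unquiesce the queue and mark the device
 * live again.
 */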
1948 static int ublk_ctrl_end_recovery(struct ublk_device *ub,
1949                 struct io_uring_cmd *cmd)
1950 {
1951         struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1952         int ublksrv_pid = (int)header->data[0];
1953         int ret = -EINVAL;
1954
1955         pr_devel("%s: Waiting for new ubq_daemons(nr: %d) to be ready, dev id %d...\n",
1956                         __func__, ub->dev_info.nr_hw_queues, header->dev_id);
1957         /* wait until the new ubq_daemons have sent all FETCH_REQs */
1958         wait_for_completion_interruptible(&ub->completion);
1959         pr_devel("%s: All new ubq_daemons(nr: %d) are ready, dev id %d\n",
1960                         __func__, ub->dev_info.nr_hw_queues, header->dev_id);
1961
1962         mutex_lock(&ub->mutex);
1963         if (!ublk_can_use_recovery(ub))
1964                 goto out_unlock;
1965
1966         if (ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
1967                 ret = -EBUSY;
1968                 goto out_unlock;
1969         }
1970         ub->dev_info.ublksrv_pid = ublksrv_pid;
1971         pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
1972                         __func__, ublksrv_pid, header->dev_id);
1973         blk_mq_unquiesce_queue(ub->ub_disk->queue);
1974         pr_devel("%s: queue unquiesced, dev id %d.\n",
1975                         __func__, header->dev_id);
1976         blk_mq_kick_requeue_list(ub->ub_disk->queue);
1977         ub->dev_info.state = UBLK_S_DEV_LIVE;
1978         schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD);
1979         ret = 0;
1980  out_unlock:
1981         mutex_unlock(&ub->mutex);
1982         return ret;
1983 }
1984
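/*
 * Dispatcher for control commands issued on /dev/ublk-control through
 * io_uring; every command except ADD_DEV operates on an existing device
 * looked up by dev_id.
 */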
1985 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
1986                 unsigned int issue_flags)
1987 {
1988         struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1989         struct ublk_device *ub = NULL;
1990         int ret = -EINVAL;
1991
1992         if (issue_flags & IO_URING_F_NONBLOCK)
1993                 return -EAGAIN;
1994
1995         ublk_ctrl_cmd_dump(cmd);
1996
1997         if (!(issue_flags & IO_URING_F_SQE128))
1998                 goto out;
1999
2000         ret = -EPERM;
2001         if (!capable(CAP_SYS_ADMIN))
2002                 goto out;
2003
2004         if (cmd->cmd_op != UBLK_CMD_ADD_DEV) {
2005                 ret = -ENODEV;
2006                 ub = ublk_get_device_from_id(header->dev_id);
2007                 if (!ub)
2008                         goto out;
2009         }
2010
2011         switch (cmd->cmd_op) {
2012         case UBLK_CMD_START_DEV:
2013                 ret = ublk_ctrl_start_dev(ub, cmd);
2014                 break;
2015         case UBLK_CMD_STOP_DEV:
2016                 ret = ublk_ctrl_stop_dev(ub);
2017                 break;
2018         case UBLK_CMD_GET_DEV_INFO:
2019                 ret = ublk_ctrl_get_dev_info(ub, cmd);
2020                 break;
2021         case UBLK_CMD_ADD_DEV:
2022                 ret = ublk_ctrl_add_dev(cmd);
2023                 break;
2024         case UBLK_CMD_DEL_DEV:
2025                 ret = ublk_ctrl_del_dev(&ub);
2026                 break;
2027         case UBLK_CMD_GET_QUEUE_AFFINITY:
2028                 ret = ublk_ctrl_get_queue_affinity(ub, cmd);
2029                 break;
2030         case UBLK_CMD_GET_PARAMS:
2031                 ret = ublk_ctrl_get_params(ub, cmd);
2032                 break;
2033         case UBLK_CMD_SET_PARAMS:
2034                 ret = ublk_ctrl_set_params(ub, cmd);
2035                 break;
2036         case UBLK_CMD_START_USER_RECOVERY:
2037                 ret = ublk_ctrl_start_recovery(ub, cmd);
2038                 break;
2039         case UBLK_CMD_END_USER_RECOVERY:
2040                 ret = ublk_ctrl_end_recovery(ub, cmd);
2041                 break;
2042         default:
2043                 ret = -ENOTSUPP;
2044                 break;
2045         }
2046         if (ub)
2047                 ublk_put_device(ub);
2048  out:
2049         io_uring_cmd_done(cmd, ret, 0);
2050         pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
2051                         __func__, ret, cmd->cmd_op, header->dev_id, header->queue_id);
2052         return -EIOCBQUEUED;
2053 }
2054
2055 static const struct file_operations ublk_ctl_fops = {
2056         .open           = nonseekable_open,
2057         .uring_cmd      = ublk_ctrl_uring_cmd,
2058         .owner          = THIS_MODULE,
2059         .llseek         = noop_llseek,
2060 };
2061
2062 static struct miscdevice ublk_misc = {
2063         .minor          = MISC_DYNAMIC_MINOR,
2064         .name           = "ublk-control",
2065         .fops           = &ublk_ctl_fops,
2066 };
2067
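/*
 * Register the /dev/ublk-control misc device and reserve the char dev
 * region and class used for the per-device /dev/ublkc<N> nodes.
 */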
2068 static int __init ublk_init(void)
2069 {
2070         int ret;
2071
2072         init_waitqueue_head(&ublk_idr_wq);
2073
2074         ret = misc_register(&ublk_misc);
2075         if (ret)
2076                 return ret;
2077
2078         ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
2079         if (ret)
2080                 goto unregister_mis;
2081
2082         ublk_chr_class = class_create(THIS_MODULE, "ublk-char");
2083         if (IS_ERR(ublk_chr_class)) {
2084                 ret = PTR_ERR(ublk_chr_class);
2085                 goto free_chrdev_region;
2086         }
2087         return 0;
2088
2089 free_chrdev_region:
2090         unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
2091 unregister_mis:
2092         misc_deregister(&ublk_misc);
2093         return ret;
2094 }
2095
2096 static void __exit ublk_exit(void)
2097 {
2098         struct ublk_device *ub;
2099         int id;
2100
2101         idr_for_each_entry(&ublk_index_idr, ub, id)
2102                 ublk_remove(ub);
2103
2104         class_destroy(ublk_chr_class);
2105         misc_deregister(&ublk_misc);
2106
2107         idr_destroy(&ublk_index_idr);
2108         unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
2109 }
2110
2111 module_init(ublk_init);
2112 module_exit(ublk_exit);
2113
2114 module_param(ublks_max, int, 0444);
2115 MODULE_PARM_DESC(ublks_max, "max number of ublk devices allowed to be added (default: 64)");
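/*
 * Example usage (a sketch; assumes the driver is built as the ublk_drv
 * module):
 *
 *   modprobe ublk_drv ublks_max=128
 *
 * or, when the driver is built in, via the kernel command line:
 *
 *   ublk_drv.ublks_max=128
 */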
2116
2117 MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
2118 MODULE_LICENSE("GPL");