ublk_drv: support to complete io command via task_work_add
[linux-block.git] / drivers / block / ublk_drv.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Userspace block device - block device whose IO is handled from userspace
4  *
5  * Make full use of io_uring passthrough commands for communicating with the
6  * ublk userspace daemon (ublksrvd) for handling basic IO requests.
7  *
8  * Copyright 2022 Ming Lei <ming.lei@redhat.com>
9  *
10  * (part of code stolen from loop.c)
11  */
12 #include <linux/module.h>
13 #include <linux/moduleparam.h>
14 #include <linux/sched.h>
15 #include <linux/fs.h>
16 #include <linux/pagemap.h>
17 #include <linux/file.h>
18 #include <linux/stat.h>
19 #include <linux/errno.h>
20 #include <linux/major.h>
21 #include <linux/wait.h>
22 #include <linux/blkdev.h>
23 #include <linux/init.h>
24 #include <linux/swap.h>
25 #include <linux/slab.h>
26 #include <linux/compat.h>
27 #include <linux/mutex.h>
28 #include <linux/writeback.h>
29 #include <linux/completion.h>
30 #include <linux/highmem.h>
31 #include <linux/sysfs.h>
32 #include <linux/miscdevice.h>
33 #include <linux/falloc.h>
34 #include <linux/uio.h>
35 #include <linux/ioprio.h>
36 #include <linux/sched/mm.h>
37 #include <linux/uaccess.h>
38 #include <linux/cdev.h>
39 #include <linux/io_uring.h>
40 #include <linux/blk-mq.h>
41 #include <linux/delay.h>
42 #include <linux/mm.h>
43 #include <asm/page.h>
44 #include <linux/task_work.h>
45 #include <uapi/linux/ublk_cmd.h>
46
47 #define UBLK_MINORS             (1U << MINORBITS)
48
49 struct ublk_rq_data {
50         struct callback_head work;
51 };
52
53 struct ublk_uring_cmd_pdu {
54         struct request *req;
55 };
56
57 /*
58  * io command is active: sqe cmd is received, and its cqe isn't done
59  *
60  * If the flag is set, the io command is owned by the ublk driver, and is
61  * waiting for an incoming blk-mq request from the ublk block device.
62  *
63  * If the flag is cleared, the io command is completed and owned by the
64  * ublk server.
65  */
66 #define UBLK_IO_FLAG_ACTIVE     0x01
67
68 /*
69  * IO command is completed via cqe and is being handled by ublksrv; the
70  * result has not been committed yet
71  *
72  * Basically mutually exclusive with UBLK_IO_FLAG_ACTIVE, so it can be used
73  * for cross verification
74  */
75 #define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
76
77 /*
78  * IO command is aborted; this flag is only set in the
79  * !UBLK_IO_FLAG_ACTIVE case.
80  *
81  * After this flag is observed, any pending or new incoming request
82  * associated with this io command will be failed immediately
83  */
84 #define UBLK_IO_FLAG_ABORTED 0x04
85
86 struct ublk_io {
87         /* userspace buffer address from io cmd */
88         __u64   addr;
89         unsigned int flags;
90         int res;
91
92         struct io_uring_cmd *cmd;
93 };
94
95 struct ublk_queue {
96         int q_id;
97         int q_depth;
98
99         unsigned long flags;
100         struct task_struct      *ubq_daemon;
101         char *io_cmd_buf;
102
103         unsigned long io_addr;  /* mapped vm address */
104         unsigned int max_io_sz;
105         bool abort_work_pending;
106         unsigned short nr_io_ready;     /* how many ios setup */
107         struct ublk_device *dev;
108         struct ublk_io ios[];
109 };
110
111 #define UBLK_DAEMON_MONITOR_PERIOD      (5 * HZ)
112
113 struct ublk_device {
114         struct gendisk          *ub_disk;
115         struct request_queue    *ub_queue;
116
117         char    *__queues;
118
119         unsigned short  queue_size;
120         unsigned short  bs_shift;
121         struct ublksrv_ctrl_dev_info    dev_info;
122
123         struct blk_mq_tag_set   tag_set;
124
125         struct cdev             cdev;
126         struct device           cdev_dev;
127
128         atomic_t                ch_open_cnt;
129         int                     ub_number;
130
131         struct mutex            mutex;
132
133         struct mm_struct        *mm;
134
135         struct completion       completion;
136         unsigned int            nr_queues_ready;
137         atomic_t                nr_aborted_queues;
138
139         /*
140          * Our ubq->ubq_daemon may be killed without any notification, so
141          * monitor each queue's daemon periodically
142          */
143         struct delayed_work     monitor_work;
144         struct work_struct      stop_work;
145 };
146
147 static dev_t ublk_chr_devt;
148 static struct class *ublk_chr_class;
149
150 static DEFINE_IDR(ublk_index_idr);
151 static DEFINE_SPINLOCK(ublk_idr_lock);
152 static wait_queue_head_t ublk_idr_wq;   /* wait until one idr is freed */
153
154 static DEFINE_MUTEX(ublk_ctl_mutex);
155
156 static struct miscdevice ublk_misc;
157
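/*
 * Editorial note (not part of the original patch): task_work_add() is not
 * exported to modules, so the task_work based completion path can only be
 * taken when the driver is built in.  Modular builds, or devices created
 * with UBLK_F_URING_CMD_COMP_IN_TASK set, fall back to
 * io_uring_cmd_complete_in_task() in ublk_queue_rq().
 */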
158 static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq)
159 {
160         if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) &&
161                         !(ubq->flags & UBLK_F_URING_CMD_COMP_IN_TASK))
162                 return true;
163         return false;
164 }
165
166 static struct ublk_device *ublk_get_device(struct ublk_device *ub)
167 {
168         if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
169                 return ub;
170         return NULL;
171 }
172
173 static void ublk_put_device(struct ublk_device *ub)
174 {
175         put_device(&ub->cdev_dev);
176 }
177
178 static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
179                 int qid)
180 {
181         return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
182 }
183
184 static inline bool ublk_rq_has_data(const struct request *rq)
185 {
186         return rq->bio && bio_has_data(rq->bio);
187 }
188
189 static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
190                 int tag)
191 {
192         return (struct ublksrv_io_desc *)
193                 &(ubq->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]);
194 }
195
196 static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
197 {
198         return ublk_get_queue(ub, q_id)->io_cmd_buf;
199 }
200
201 static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id)
202 {
203         struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
204
205         return round_up(ubq->q_depth * sizeof(struct ublksrv_io_desc),
206                         PAGE_SIZE);
207 }
208
209 static int ublk_open(struct block_device *bdev, fmode_t mode)
210 {
211         return 0;
212 }
213
214 static void ublk_release(struct gendisk *disk, fmode_t mode)
215 {
216 }
217
218 static const struct block_device_operations ub_fops = {
219         .owner =        THIS_MODULE,
220         .open =         ublk_open,
221         .release =      ublk_release,
222 };
223
224 #define UBLK_MAX_PIN_PAGES      32
225
226 struct ublk_map_data {
227         const struct ublk_queue *ubq;
228         const struct request *rq;
229         const struct ublk_io *io;
230         unsigned max_bytes;
231 };
232
233 struct ublk_io_iter {
234         struct page *pages[UBLK_MAX_PIN_PAGES];
235         unsigned pg_off;        /* offset in the 1st page in pages */
236         int nr_pages;           /* how many page pointers in pages */
237         struct bio *bio;
238         struct bvec_iter iter;
239 };
240
241 static inline unsigned ublk_copy_io_pages(struct ublk_io_iter *data,
242                 unsigned max_bytes, bool to_vm)
243 {
244         const unsigned total = min_t(unsigned, max_bytes,
245                         PAGE_SIZE - data->pg_off +
246                         ((data->nr_pages - 1) << PAGE_SHIFT));
247         unsigned done = 0;
248         unsigned pg_idx = 0;
249
250         while (done < total) {
251                 struct bio_vec bv = bio_iter_iovec(data->bio, data->iter);
252                 const unsigned int bytes = min3(bv.bv_len, total - done,
253                                 (unsigned)(PAGE_SIZE - data->pg_off));
254                 void *bv_buf = bvec_kmap_local(&bv);
255                 void *pg_buf = kmap_local_page(data->pages[pg_idx]);
256
257                 if (to_vm)
258                         memcpy(pg_buf + data->pg_off, bv_buf, bytes);
259                 else
260                         memcpy(bv_buf, pg_buf + data->pg_off, bytes);
261
262                 kunmap_local(pg_buf);
263                 kunmap_local(bv_buf);
264
265                 /* advance page array */
266                 data->pg_off += bytes;
267                 if (data->pg_off == PAGE_SIZE) {
268                         pg_idx += 1;
269                         data->pg_off = 0;
270                 }
271
272                 done += bytes;
273
274                 /* advance bio */
275                 bio_advance_iter_single(data->bio, &data->iter, bytes);
276                 if (!data->iter.bi_size) {
277                         data->bio = data->bio->bi_next;
278                         if (data->bio == NULL)
279                                 break;
280                         data->iter = data->bio->bi_iter;
281                 }
282         }
283
284         return done;
285 }
286
287 static inline int ublk_copy_user_pages(struct ublk_map_data *data,
288                 bool to_vm)
289 {
290         const unsigned int gup_flags = to_vm ? FOLL_WRITE : 0;
291         const unsigned long start_vm = data->io->addr;
292         unsigned int done = 0;
293         struct ublk_io_iter iter = {
294                 .pg_off = start_vm & (PAGE_SIZE - 1),
295                 .bio    = data->rq->bio,
296                 .iter   = data->rq->bio->bi_iter,
297         };
298         const unsigned int nr_pages = round_up(data->max_bytes +
299                         (start_vm & (PAGE_SIZE - 1)), PAGE_SIZE) >> PAGE_SHIFT;
300
301         while (done < nr_pages) {
302                 const unsigned to_pin = min_t(unsigned, UBLK_MAX_PIN_PAGES,
303                                 nr_pages - done);
304                 unsigned i, len;
305
306                 iter.nr_pages = get_user_pages_fast(start_vm +
307                                 (done << PAGE_SHIFT), to_pin, gup_flags,
308                                 iter.pages);
309                 if (iter.nr_pages <= 0)
310                         return done == 0 ? iter.nr_pages : done;
311                 len = ublk_copy_io_pages(&iter, data->max_bytes, to_vm);
312                 for (i = 0; i < iter.nr_pages; i++) {
313                         if (to_vm)
314                                 set_page_dirty(iter.pages[i]);
315                         put_page(iter.pages[i]);
316                 }
317                 data->max_bytes -= len;
318                 done += iter.nr_pages;
319         }
320
321         return done;
322 }
323
324 static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
325                 struct ublk_io *io)
326 {
327         const unsigned int rq_bytes = blk_rq_bytes(req);
328         /*
329          * No zero copy: we delay copying WRITE request data until we are in
330          * the ublksrv daemon context; the big benefit is that pinning pages
331          * in the current context is pretty fast, see ublk_copy_user_pages
332          */
333         if (req_op(req) != REQ_OP_WRITE && req_op(req) != REQ_OP_FLUSH)
334                 return rq_bytes;
335
336         if (ublk_rq_has_data(req)) {
337                 struct ublk_map_data data = {
338                         .ubq    =       ubq,
339                         .rq     =       req,
340                         .io     =       io,
341                         .max_bytes =    rq_bytes,
342                 };
343
344                 ublk_copy_user_pages(&data, true);
345
346                 return rq_bytes - data.max_bytes;
347         }
348         return rq_bytes;
349 }
350
351 static int ublk_unmap_io(const struct ublk_queue *ubq,
352                 const struct request *req,
353                 struct ublk_io *io)
354 {
355         const unsigned int rq_bytes = blk_rq_bytes(req);
356
357         if (req_op(req) == REQ_OP_READ && ublk_rq_has_data(req)) {
358                 struct ublk_map_data data = {
359                         .ubq    =       ubq,
360                         .rq     =       req,
361                         .io     =       io,
362                         .max_bytes =    io->res,
363                 };
364
365                 WARN_ON_ONCE(io->res > rq_bytes);
366
367                 ublk_copy_user_pages(&data, false);
368
369                 return io->res - data.max_bytes;
370         }
371         return rq_bytes;
372 }
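
/*
 * Editorial note on the copy helpers above: for a WRITE, ublk_map_io() pins
 * the ublksrv pages at io->addr and copies the request's bio data into them
 * before the io command is completed to the daemon; for a READ,
 * ublk_unmap_io() copies io->res bytes back from that buffer into the bio
 * pages once ublksrv commits the result.  FLUSH/DISCARD/WRITE_ZEROES carry
 * no data payload and skip both steps.
 */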
373
374 static inline unsigned int ublk_req_build_flags(struct request *req)
375 {
376         unsigned flags = 0;
377
378         if (req->cmd_flags & REQ_FAILFAST_DEV)
379                 flags |= UBLK_IO_F_FAILFAST_DEV;
380
381         if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
382                 flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
383
384         if (req->cmd_flags & REQ_FAILFAST_DRIVER)
385                 flags |= UBLK_IO_F_FAILFAST_DRIVER;
386
387         if (req->cmd_flags & REQ_META)
388                 flags |= UBLK_IO_F_META;
389
390         if (req->cmd_flags & REQ_INTEGRITY)
391                 flags |= UBLK_IO_F_INTEGRITY;
392
393         if (req->cmd_flags & REQ_FUA)
394                 flags |= UBLK_IO_F_FUA;
395
396         if (req->cmd_flags & REQ_PREFLUSH)
397                 flags |= UBLK_IO_F_PREFLUSH;
398
399         if (req->cmd_flags & REQ_NOUNMAP)
400                 flags |= UBLK_IO_F_NOUNMAP;
401
402         if (req->cmd_flags & REQ_SWAP)
403                 flags |= UBLK_IO_F_SWAP;
404
405         return flags;
406 }
407
408 static int ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
409 {
410         struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
411         struct ublk_io *io = &ubq->ios[req->tag];
412         u32 ublk_op;
413
414         switch (req_op(req)) {
415         case REQ_OP_READ:
416                 ublk_op = UBLK_IO_OP_READ;
417                 break;
418         case REQ_OP_WRITE:
419                 ublk_op = UBLK_IO_OP_WRITE;
420                 break;
421         case REQ_OP_FLUSH:
422                 ublk_op = UBLK_IO_OP_FLUSH;
423                 break;
424         case REQ_OP_DISCARD:
425                 ublk_op = UBLK_IO_OP_DISCARD;
426                 break;
427         case REQ_OP_WRITE_ZEROES:
428                 ublk_op = UBLK_IO_OP_WRITE_ZEROES;
429                 break;
430         default:
431                 return BLK_STS_IOERR;
432         }
433
434         /* translate to the ublk ABI since kernel-internal values may change */
435         iod->op_flags = ublk_op | ublk_req_build_flags(req);
436         iod->nr_sectors = blk_rq_sectors(req);
437         iod->start_sector = blk_rq_pos(req);
438         iod->addr = io->addr;
439
440         return BLK_STS_OK;
441 }
442
443 static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
444                 struct io_uring_cmd *ioucmd)
445 {
446         return (struct ublk_uring_cmd_pdu *)&ioucmd->pdu;
447 }
448
449 static bool ubq_daemon_is_dying(struct ublk_queue *ubq)
450 {
451         return ubq->ubq_daemon->flags & PF_EXITING;
452 }
453
454 /* todo: handle partial completion */
455 static void ublk_complete_rq(struct request *req)
456 {
457         struct ublk_queue *ubq = req->mq_hctx->driver_data;
458         struct ublk_io *io = &ubq->ios[req->tag];
459         unsigned int unmapped_bytes;
460
461         /* fail read IO if nothing was read */
462         if (!io->res && req_op(req) == REQ_OP_READ)
463                 io->res = -EIO;
464
465         if (io->res < 0) {
466                 blk_mq_end_request(req, errno_to_blk_status(io->res));
467                 return;
468         }
469
470         /*
471          * FLUSH or DISCARD usually won't return valid data bytes, so end them
472          * directly.
473          *
474          * Neither of them needs unmapping.
475          */
476         if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) {
477                 blk_mq_end_request(req, BLK_STS_OK);
478                 return;
479         }
480
481         /* for a READ request, copy the data at iod->addr back into the rq buffers */
482         unmapped_bytes = ublk_unmap_io(ubq, req, io);
483
484         /*
485          * Extremely unlikely since ublksrv just filled in the data above.
486          *
487          * Simply clamp the result and let the remainder be re-read.
488          */
489         if (unlikely(unmapped_bytes < io->res))
490                 io->res = unmapped_bytes;
491
492         if (blk_update_request(req, BLK_STS_OK, io->res))
493                 blk_mq_requeue_request(req, true);
494         else
495                 __blk_mq_end_request(req, BLK_STS_OK);
496 }
497
498 /*
499  * __ublk_fail_req() may be called from the abort context or from the
500  * ->ubq_daemon context during exiting, so locking is required.
501  *
502  * Also, aborting may not have started yet; keep in mind that a failed
503  * request may be issued by the block layer again.
504  */
505 static void __ublk_fail_req(struct ublk_io *io, struct request *req)
506 {
507         WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
508
509         if (!(io->flags & UBLK_IO_FLAG_ABORTED)) {
510                 io->flags |= UBLK_IO_FLAG_ABORTED;
511                 blk_mq_end_request(req, BLK_STS_IOERR);
512         }
513 }
514
515 #define UBLK_REQUEUE_DELAY_MS   3
516
517 static inline void __ublk_rq_task_work(struct request *req)
518 {
519         struct ublk_queue *ubq = req->mq_hctx->driver_data;
520         struct ublk_device *ub = ubq->dev;
521         int tag = req->tag;
522         struct ublk_io *io = &ubq->ios[tag];
523         bool task_exiting = current != ubq->ubq_daemon ||
524                 (current->flags & PF_EXITING);
525         unsigned int mapped_bytes;
526
527         pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n",
528                         __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
529                         ublk_get_iod(ubq, req->tag)->addr);
530
531         if (unlikely(task_exiting)) {
532                 blk_mq_end_request(req, BLK_STS_IOERR);
533                 mod_delayed_work(system_wq, &ub->monitor_work, 0);
534                 return;
535         }
536
537         mapped_bytes = ublk_map_io(ubq, req, io);
538
539         /* partially mapped, update io descriptor */
540         if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
541                 /*
542                  * Nothing mapped, retry until we succeed.
543                  *
544                  * We may never succeed in mapping any bytes here because
545                  * of OOM. TODO: reserve one buffer with single page pinned
546                  * for providing forward progress guarantee.
547                  */
548                 if (unlikely(!mapped_bytes)) {
549                         blk_mq_requeue_request(req, false);
550                         blk_mq_delay_kick_requeue_list(req->q,
551                                         UBLK_REQUEUE_DELAY_MS);
552                         return;
553                 }
554
555                 ublk_get_iod(ubq, req->tag)->nr_sectors =
556                         mapped_bytes >> 9;
557         }
558
559         /* mark this cmd owned by ublksrv */
560         io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
561
562         /*
563          * Clear ACTIVE since we are done with this sqe/cmd slot.
564          * We can only accept a new io cmd when the slot is not active.
565          */
566         io->flags &= ~UBLK_IO_FLAG_ACTIVE;
567
568         /* tell ublksrv one io request is coming */
569         io_uring_cmd_done(io->cmd, UBLK_IO_RES_OK, 0);
570 }
571
572 static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd)
573 {
574         struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
575
576         __ublk_rq_task_work(pdu->req);
577 }
578
579 static void ublk_rq_task_work_fn(struct callback_head *work)
580 {
581         struct ublk_rq_data *data = container_of(work,
582                         struct ublk_rq_data, work);
583         struct request *req = blk_mq_rq_from_pdu(data);
584
585         __ublk_rq_task_work(req);
586 }
587
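/*
 * Editorial note: both callbacks above are thin wrappers that run
 * __ublk_rq_task_work() in the ubq_daemon context.  ublk_rq_task_work_fn()
 * is the task_work_add() path (the callback_head lives in the request pdu),
 * while ublk_rq_task_work_cb() is the io_uring_cmd_complete_in_task()
 * fallback; ublk_queue_rq() below picks one via ublk_can_use_task_work().
 */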
588 static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
589                 const struct blk_mq_queue_data *bd)
590 {
591         struct ublk_queue *ubq = hctx->driver_data;
592         struct request *rq = bd->rq;
593         blk_status_t res;
594
595         /* fill iod to slot in io cmd buffer */
596         res = ublk_setup_iod(ubq, rq);
597         if (unlikely(res != BLK_STS_OK))
598                 return BLK_STS_IOERR;
599
600         blk_mq_start_request(bd->rq);
601
602         if (unlikely(ubq_daemon_is_dying(ubq))) {
603  fail:
604                 mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0);
605                 return BLK_STS_IOERR;
606         }
607
608         if (ublk_can_use_task_work(ubq)) {
609                 struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
610                 enum task_work_notify_mode notify_mode = bd->last ?
611                         TWA_SIGNAL_NO_IPI : TWA_NONE;
612
613                 if (task_work_add(ubq->ubq_daemon, &data->work, notify_mode))
614                         goto fail;
615         } else {
616                 struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
617                 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
618
619                 pdu->req = rq;
620                 io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
621         }
622
623         return BLK_STS_OK;
624 }
625
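/*
 * Editorial note on batching: with the task_work path, ublk_queue_rq() only
 * requests a notification (TWA_SIGNAL_NO_IPI) for the last request of a
 * plugged batch; earlier requests are queued with TWA_NONE, and the daemon
 * is kicked from ublk_commit_rqs() below via __set_notify_signal().
 */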
626 static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx)
627 {
628         struct ublk_queue *ubq = hctx->driver_data;
629
630         if (ublk_can_use_task_work(ubq))
631                 __set_notify_signal(ubq->ubq_daemon);
632 }
633
634 static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
635                 unsigned int hctx_idx)
636 {
637         struct ublk_device *ub = hctx->queue->queuedata;
638         struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
639
640         hctx->driver_data = ubq;
641         return 0;
642 }
643
644 static int ublk_init_rq(struct blk_mq_tag_set *set, struct request *req,
645                 unsigned int hctx_idx, unsigned int numa_node)
646 {
647         struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
648
649         init_task_work(&data->work, ublk_rq_task_work_fn);
650         return 0;
651 }
652
653 static const struct blk_mq_ops ublk_mq_ops = {
654         .queue_rq       = ublk_queue_rq,
655         .commit_rqs     = ublk_commit_rqs,
656         .init_hctx      = ublk_init_hctx,
657         .init_request   = ublk_init_rq,
658 };
659
660 static int ublk_ch_open(struct inode *inode, struct file *filp)
661 {
662         struct ublk_device *ub = container_of(inode->i_cdev,
663                         struct ublk_device, cdev);
664
665         if (atomic_cmpxchg(&ub->ch_open_cnt, 0, 1) == 0) {
666                 filp->private_data = ub;
667                 return 0;
668         }
669         return -EBUSY;
670 }
671
672 static int ublk_ch_release(struct inode *inode, struct file *filp)
673 {
674         struct ublk_device *ub = filp->private_data;
675
676         while (atomic_cmpxchg(&ub->ch_open_cnt, 1, 0) != 1)
677                 cpu_relax();
678
679         filp->private_data = NULL;
680         return 0;
681 }
682
683 /* map pre-allocated per-queue cmd buffer to ublksrv daemon */
684 static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
685 {
686         struct ublk_device *ub = filp->private_data;
687         size_t sz = vma->vm_end - vma->vm_start;
688         unsigned max_sz = UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc);
689         unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
690         int q_id, ret = 0;
691
692         mutex_lock(&ub->mutex);
693         if (!ub->mm)
694                 ub->mm = current->mm;
695         if (current->mm != ub->mm)
696                 ret = -EINVAL;
697         mutex_unlock(&ub->mutex);
698
699         if (ret)
700                 return ret;
701
702         if (vma->vm_flags & VM_WRITE)
703                 return -EPERM;
704
705         end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
706         if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
707                 return -EINVAL;
708
709         q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
710         pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
711                         __func__, q_id, current->pid, vma->vm_start,
712                         phys_off, (unsigned long)sz);
713
714         if (sz != ublk_queue_cmd_buf_size(ub, q_id))
715                 return -EINVAL;
716
717         pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
718         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
719 }
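
/*
 * Editorial note: ublksrv is expected to mmap each queue's descriptor array
 * read-only through /dev/ublkcN, roughly as follows (illustrative sketch,
 * mirroring the offset decoding above):
 *
 *	off = UBLKSRV_CMD_BUF_OFFSET +
 *		q_id * UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc);
 *	iods = mmap(NULL, cmd_buf_sz, PROT_READ, MAP_SHARED, ublkc_fd, off);
 *
 * where cmd_buf_sz is the page-rounded q_depth * sizeof(struct
 * ublksrv_io_desc).  Writable mappings are rejected by the VM_WRITE check.
 */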
720
721 static void ublk_commit_completion(struct ublk_device *ub,
722                 struct ublksrv_io_cmd *ub_cmd)
723 {
724         u32 qid = ub_cmd->q_id, tag = ub_cmd->tag;
725         struct ublk_queue *ubq = ublk_get_queue(ub, qid);
726         struct ublk_io *io = &ubq->ios[tag];
727         struct request *req;
728
729         /* now this cmd slot is owned by the ublk driver */
730         io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
731         io->res = ub_cmd->result;
732
733         /* find the io request and complete */
734         req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
735
736         if (req && likely(!blk_should_fake_timeout(req->q)))
737                 ublk_complete_rq(req);
738 }
739
740 /*
741  * When ->ubq_daemon is exiting, either new requests are ended immediately,
742  * or any queued io commands are drained, so it is safe to abort the queue
743  * locklessly
744  */
745 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
746 {
747         int i;
748
749         if (!ublk_get_device(ub))
750                 return;
751
752         for (i = 0; i < ubq->q_depth; i++) {
753                 struct ublk_io *io = &ubq->ios[i];
754
755                 if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
756                         struct request *rq;
757
758                         /*
759                          * Either we fail the request or ublk_rq_task_work_fn
760                          * will do it
761                          */
762                         rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
763                         if (rq)
764                                 __ublk_fail_req(io, rq);
765                 }
766         }
767         ublk_put_device(ub);
768 }
769
770 static void ublk_daemon_monitor_work(struct work_struct *work)
771 {
772         struct ublk_device *ub =
773                 container_of(work, struct ublk_device, monitor_work.work);
774         int i;
775
776         for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
777                 struct ublk_queue *ubq = ublk_get_queue(ub, i);
778
779                 if (ubq_daemon_is_dying(ubq)) {
780                         schedule_work(&ub->stop_work);
781
782                         /* aborting the queue is for making forward progress */
783                         ublk_abort_queue(ub, ubq);
784                 }
785         }
786
787         /*
788          * We can't schedule monitor work after ublk_remove() is started.
789          *
790          * No need for ub->mutex: the monitor work is canceled after the state
791          * is marked as DEAD, so the DEAD state is observed reliably.
792          */
793         if (ub->dev_info.state != UBLK_S_DEV_DEAD)
794                 schedule_delayed_work(&ub->monitor_work,
795                                 UBLK_DAEMON_MONITOR_PERIOD);
796 }
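
/*
 * Editorial note: the monitor work above runs every
 * UBLK_DAEMON_MONITOR_PERIOD (5 seconds).  When a queue's daemon is found
 * to be exiting, it schedules stop_work and aborts that queue so in-flight
 * requests are failed instead of hanging, which provides forward progress
 * when ublksrv is killed without notice.
 */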
797
798 static void ublk_cancel_queue(struct ublk_queue *ubq)
799 {
800         int i;
801
802         for (i = 0; i < ubq->q_depth; i++) {
803                 struct ublk_io *io = &ubq->ios[i];
804
805                 if (io->flags & UBLK_IO_FLAG_ACTIVE)
806                         io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0);
807         }
808 }
809
810 /* Cancel all pending commands, must be called after del_gendisk() returns */
811 static void ublk_cancel_dev(struct ublk_device *ub)
812 {
813         int i;
814
815         for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
816                 ublk_cancel_queue(ublk_get_queue(ub, i));
817 }
818
819 static void ublk_stop_dev(struct ublk_device *ub)
820 {
821         mutex_lock(&ub->mutex);
822         if (!disk_live(ub->ub_disk))
823                 goto unlock;
824
825         del_gendisk(ub->ub_disk);
826         ub->dev_info.state = UBLK_S_DEV_DEAD;
827         ub->dev_info.ublksrv_pid = -1;
828         ublk_cancel_dev(ub);
829  unlock:
830         mutex_unlock(&ub->mutex);
831         cancel_delayed_work_sync(&ub->monitor_work);
832 }
833
834 static int ublk_ctrl_stop_dev(struct ublk_device *ub)
835 {
836         ublk_stop_dev(ub);
837         cancel_work_sync(&ub->stop_work);
838         return 0;
839 }
840
841 static inline bool ublk_queue_ready(struct ublk_queue *ubq)
842 {
843         return ubq->nr_io_ready == ubq->q_depth;
844 }
845
846 /* device can only be started after all IOs are ready */
847 static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
848 {
849         mutex_lock(&ub->mutex);
850         ubq->nr_io_ready++;
851         if (ublk_queue_ready(ubq)) {
852                 ubq->ubq_daemon = current;
853                 get_task_struct(ubq->ubq_daemon);
854                 ub->nr_queues_ready++;
855         }
856         if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues)
857                 complete_all(&ub->completion);
858         mutex_unlock(&ub->mutex);
859 }
860
861 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
862 {
863         struct ublksrv_io_cmd *ub_cmd = (struct ublksrv_io_cmd *)cmd->cmd;
864         struct ublk_device *ub = cmd->file->private_data;
865         struct ublk_queue *ubq;
866         struct ublk_io *io = NULL;
867         u32 cmd_op = cmd->cmd_op;
868         unsigned tag = ub_cmd->tag;
869         int ret = -EINVAL;
870
871         pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
872                         __func__, cmd->cmd_op, ub_cmd->q_id, tag,
873                         ub_cmd->result);
874
875         if (!(issue_flags & IO_URING_F_SQE128))
876                 goto out;
877
878         if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
879                 goto out;
880
881         ubq = ublk_get_queue(ub, ub_cmd->q_id);
882         if (!ubq || ub_cmd->q_id != ubq->q_id)
883                 goto out;
884
885         if (ubq->ubq_daemon && ubq->ubq_daemon != current)
886                 goto out;
887
888         if (tag >= ubq->q_depth)
889                 goto out;
890
891         io = &ubq->ios[tag];
892
893         /* there is pending io cmd, something must be wrong */
894         if (io->flags & UBLK_IO_FLAG_ACTIVE) {
895                 ret = -EBUSY;
896                 goto out;
897         }
898
899         switch (cmd_op) {
900         case UBLK_IO_FETCH_REQ:
901                 /* UBLK_IO_FETCH_REQ is only allowed before the queue is set up */
902                 if (ublk_queue_ready(ubq)) {
903                         ret = -EBUSY;
904                         goto out;
905                 }
906                 /*
907                  * The io is being handled by the server, so COMMIT_AND_FETCH_REQ
908                  * is expected instead of FETCH_REQ
909                  */
910                 if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
911                         goto out;
912                 /* FETCH_REQ has to provide an IO buffer */
913                 if (!ub_cmd->addr)
914                         goto out;
915                 io->cmd = cmd;
916                 io->flags |= UBLK_IO_FLAG_ACTIVE;
917                 io->addr = ub_cmd->addr;
918
919                 ublk_mark_io_ready(ub, ubq);
920                 break;
921         case UBLK_IO_COMMIT_AND_FETCH_REQ:
922                 /* COMMIT_AND_FETCH_REQ has to provide an IO buffer */
923                 if (!ub_cmd->addr)
924                         goto out;
925                 if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
926                         goto out;
927                 io->addr = ub_cmd->addr;
928                 io->flags |= UBLK_IO_FLAG_ACTIVE;
929                 io->cmd = cmd;
930                 ublk_commit_completion(ub, ub_cmd);
931                 break;
932         default:
933                 goto out;
934         }
935         return -EIOCBQUEUED;
936
937  out:
938         if (io)
                    io->flags &= ~UBLK_IO_FLAG_ACTIVE;
939         io_uring_cmd_done(cmd, ret, 0);
940         pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
941                         __func__, cmd_op, tag, ret, io ? io->flags : 0);
942         return -EIOCBQUEUED;
943 }
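
/*
 * Editorial note: the handler above implements a simple per-tag state
 * machine.  FETCH_REQ arms a tag slot (records the io buffer address and
 * sets ACTIVE) and is only allowed while the queue is still being set up;
 * COMMIT_AND_FETCH_REQ commits the result of the previous request on that
 * tag and re-arms the slot in one round trip.  Both return -EIOCBQUEUED,
 * and the cqe is only posted later, either when a new request arrives in
 * __ublk_rq_task_work() or when the queue is canceled.
 */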
944
945 static const struct file_operations ublk_ch_fops = {
946         .owner = THIS_MODULE,
947         .open = ublk_ch_open,
948         .release = ublk_ch_release,
949         .llseek = no_llseek,
950         .uring_cmd = ublk_ch_uring_cmd,
951         .mmap = ublk_ch_mmap,
952 };
953
954 static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
955 {
956         int size = ublk_queue_cmd_buf_size(ub, q_id);
957         struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
958
959         if (ubq->ubq_daemon)
960                 put_task_struct(ubq->ubq_daemon);
961         if (ubq->io_cmd_buf)
962                 free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
963 }
964
965 static int ublk_init_queue(struct ublk_device *ub, int q_id)
966 {
967         struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
968         gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
969         void *ptr;
970         int size;
971
972         ubq->flags = ub->dev_info.flags[0];
973         ubq->q_id = q_id;
974         ubq->q_depth = ub->dev_info.queue_depth;
975         size = ublk_queue_cmd_buf_size(ub, q_id);
976
977         ptr = (void *) __get_free_pages(gfp_flags, get_order(size));
978         if (!ptr)
979                 return -ENOMEM;
980
981         ubq->io_cmd_buf = ptr;
982         ubq->dev = ub;
983         return 0;
984 }
985
986 static void ublk_deinit_queues(struct ublk_device *ub)
987 {
988         int nr_queues = ub->dev_info.nr_hw_queues;
989         int i;
990
991         if (!ub->__queues)
992                 return;
993
994         for (i = 0; i < nr_queues; i++)
995                 ublk_deinit_queue(ub, i);
996         kfree(ub->__queues);
997 }
998
999 static int ublk_init_queues(struct ublk_device *ub)
1000 {
1001         int nr_queues = ub->dev_info.nr_hw_queues;
1002         int depth = ub->dev_info.queue_depth;
1003         int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io);
1004         int i, ret = -ENOMEM;
1005
1006         ub->queue_size = ubq_size;
1007         ub->__queues = kcalloc(nr_queues, ubq_size, GFP_KERNEL);
1008         if (!ub->__queues)
1009                 return ret;
1010
1011         for (i = 0; i < nr_queues; i++) {
1012                 if (ublk_init_queue(ub, i))
1013                         goto fail;
1014         }
1015
1016         init_completion(&ub->completion);
1017         return 0;
1018
1019  fail:
1020         ublk_deinit_queues(ub);
1021         return ret;
1022 }
1023
1024 static int __ublk_alloc_dev_number(struct ublk_device *ub, int idx)
1025 {
1026         int i = idx;
1027         int err;
1028
1029         spin_lock(&ublk_idr_lock);
1030         /* allocate id, if @idx >= 0, we're requesting that specific id */
1031         if (i >= 0) {
1032                 err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
1033                 if (err == -ENOSPC)
1034                         err = -EEXIST;
1035         } else {
1036                 err = idr_alloc(&ublk_index_idr, ub, 0, 0, GFP_NOWAIT);
1037         }
1038         spin_unlock(&ublk_idr_lock);
1039
1040         if (err >= 0)
1041                 ub->ub_number = err;
1042
1043         return err;
1044 }
1045
1046 static struct ublk_device *__ublk_create_dev(int idx)
1047 {
1048         struct ublk_device *ub = NULL;
1049         int ret;
1050
1051         ub = kzalloc(sizeof(*ub), GFP_KERNEL);
1052         if (!ub)
1053                 return ERR_PTR(-ENOMEM);
1054
1055         ret = __ublk_alloc_dev_number(ub, idx);
1056         if (ret < 0) {
1057                 kfree(ub);
1058                 return ERR_PTR(ret);
1059         }
1060         return ub;
1061 }
1062
1063 static void __ublk_destroy_dev(struct ublk_device *ub)
1064 {
1065         spin_lock(&ublk_idr_lock);
1066         idr_remove(&ublk_index_idr, ub->ub_number);
1067         wake_up_all(&ublk_idr_wq);
1068         spin_unlock(&ublk_idr_lock);
1069
1070         mutex_destroy(&ub->mutex);
1071
1072         kfree(ub);
1073 }
1074
1075 static void ublk_cdev_rel(struct device *dev)
1076 {
1077         struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);
1078
1079         put_disk(ub->ub_disk);
1080
1081         blk_mq_free_tag_set(&ub->tag_set);
1082
1083         ublk_deinit_queues(ub);
1084
1085         __ublk_destroy_dev(ub);
1086 }
1087
1088 static int ublk_add_chdev(struct ublk_device *ub)
1089 {
1090         struct device *dev = &ub->cdev_dev;
1091         int minor = ub->ub_number;
1092         int ret;
1093
1094         dev->parent = ublk_misc.this_device;
1095         dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
1096         dev->class = ublk_chr_class;
1097         dev->release = ublk_cdev_rel;
1098         device_initialize(dev);
1099
1100         ret = dev_set_name(dev, "ublkc%d", minor);
1101         if (ret)
1102                 goto fail;
1103
1104         cdev_init(&ub->cdev, &ublk_ch_fops);
1105         ret = cdev_device_add(&ub->cdev, dev);
1106         if (ret)
1107                 goto fail;
1108         return 0;
1109  fail:
1110         put_device(dev);
1111         return ret;
1112 }
1113
1114 static void ublk_stop_work_fn(struct work_struct *work)
1115 {
1116         struct ublk_device *ub =
1117                 container_of(work, struct ublk_device, stop_work);
1118
1119         ublk_stop_dev(ub);
1120 }
1121
1122 static void ublk_update_capacity(struct ublk_device *ub)
1123 {
1124         unsigned int max_rq_bytes;
1125
1126         /* make max request buffer size aligned with PAGE_SIZE */
1127         max_rq_bytes = round_down(ub->dev_info.rq_max_blocks <<
1128                         ub->bs_shift, PAGE_SIZE);
1129         ub->dev_info.rq_max_blocks = max_rq_bytes >> ub->bs_shift;
1130
1131         set_capacity(ub->ub_disk, ub->dev_info.dev_blocks << (ub->bs_shift - 9));
1132 }
1133
1134 /* add disk & cdev, cleanup everything in case of failure */
1135 static int ublk_add_dev(struct ublk_device *ub)
1136 {
1137         struct gendisk *disk;
1138         int err = -ENOMEM;
1139         int bsize;
1140
1141         /* We are not ready to support zero copy */
1142         ub->dev_info.flags[0] &= ~UBLK_F_SUPPORT_ZERO_COPY;
1143
1144         bsize = ub->dev_info.block_size;
1145         ub->bs_shift = ilog2(bsize);
1146
1147         ub->dev_info.nr_hw_queues = min_t(unsigned int,
1148                         ub->dev_info.nr_hw_queues, nr_cpu_ids);
1149
1150         INIT_WORK(&ub->stop_work, ublk_stop_work_fn);
1151         INIT_DELAYED_WORK(&ub->monitor_work, ublk_daemon_monitor_work);
1152
1153         if (ublk_init_queues(ub))
1154                 goto out_destroy_dev;
1155
1156         ub->tag_set.ops = &ublk_mq_ops;
1157         ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
1158         ub->tag_set.queue_depth = ub->dev_info.queue_depth;
1159         ub->tag_set.numa_node = NUMA_NO_NODE;
1160         ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
1161         ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
1162         ub->tag_set.driver_data = ub;
1163
1164         err = blk_mq_alloc_tag_set(&ub->tag_set);
1165         if (err)
1166                 goto out_deinit_queues;
1167
1168         disk = ub->ub_disk = blk_mq_alloc_disk(&ub->tag_set, ub);
1169         if (IS_ERR(disk)) {
1170                 err = PTR_ERR(disk);
1171                 goto out_cleanup_tags;
1172         }
1173         ub->ub_queue = ub->ub_disk->queue;
1174
1175         ub->ub_queue->queuedata = ub;
1176
1177         blk_queue_logical_block_size(ub->ub_queue, bsize);
1178         blk_queue_physical_block_size(ub->ub_queue, bsize);
1179         blk_queue_io_min(ub->ub_queue, bsize);
1180
1181         blk_queue_max_hw_sectors(ub->ub_queue, ub->dev_info.rq_max_blocks <<
1182                         (ub->bs_shift - 9));
1183
1184         ub->ub_queue->limits.discard_granularity = PAGE_SIZE;
1185
1186         blk_queue_max_discard_sectors(ub->ub_queue, UINT_MAX >> 9);
1187         blk_queue_max_write_zeroes_sectors(ub->ub_queue, UINT_MAX >> 9);
1188
1189         ublk_update_capacity(ub);
1190
1191         disk->fops              = &ub_fops;
1192         disk->private_data      = ub;
1193         disk->queue             = ub->ub_queue;
1194         sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
1195
1196         mutex_init(&ub->mutex);
1197
1198         /* add the char dev so that the ublksrv daemon can be set up */
1199         err = ublk_add_chdev(ub);
1200         if (err)
1201                 return err;
1202
1203         /* don't expose the disk until we get the START_DEV control command */
1204
1205         return 0;
1206
1207 out_cleanup_tags:
1208         blk_mq_free_tag_set(&ub->tag_set);
1209 out_deinit_queues:
1210         ublk_deinit_queues(ub);
1211 out_destroy_dev:
1212         __ublk_destroy_dev(ub);
1213         return err;
1214 }
1215
1216 static void ublk_remove(struct ublk_device *ub)
1217 {
1218         ublk_ctrl_stop_dev(ub);
1219
1220         cdev_device_del(&ub->cdev, &ub->cdev_dev);
1221         put_device(&ub->cdev_dev);
1222 }
1223
1224 static struct ublk_device *ublk_get_device_from_id(int idx)
1225 {
1226         struct ublk_device *ub = NULL;
1227
1228         if (idx < 0)
1229                 return NULL;
1230
1231         spin_lock(&ublk_idr_lock);
1232         ub = idr_find(&ublk_index_idr, idx);
1233         if (ub)
1234                 ub = ublk_get_device(ub);
1235         spin_unlock(&ublk_idr_lock);
1236
1237         return ub;
1238 }
1239
1240 static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
1241 {
1242         struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1243         int ret = -EINVAL;
1244         int ublksrv_pid = (int)header->data[0];
1245         unsigned long dev_blocks = header->data[1];
1246
1247         if (ublksrv_pid <= 0)
1248                 return ret;
1249
1250         wait_for_completion_interruptible(&ub->completion);
1251
1252         schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD);
1253
1254         mutex_lock(&ub->mutex);
1255         if (!disk_live(ub->ub_disk)) {
1256                 /* the disk size may have been updated */
1257                 if (dev_blocks) {
1258                         ub->dev_info.dev_blocks = dev_blocks;
1259                         ublk_update_capacity(ub);
1260                 }
1261                 ub->dev_info.ublksrv_pid = ublksrv_pid;
1262                 ret = add_disk(ub->ub_disk);
1263                 if (!ret)
1264                         ub->dev_info.state = UBLK_S_DEV_LIVE;
1265         } else {
1266                 ret = -EEXIST;
1267         }
1268         mutex_unlock(&ub->mutex);
1269
1270         return ret;
1271 }
1272
1273 static struct blk_mq_hw_ctx *ublk_get_hw_queue(struct ublk_device *ub,
1274                 unsigned int index)
1275 {
1276         struct blk_mq_hw_ctx *hctx;
1277         unsigned long i;
1278
1279         queue_for_each_hw_ctx(ub->ub_queue, hctx, i)
1280                 if (hctx->queue_num == index)
1281                         return hctx;
1282         return NULL;
1283 }
1284
1285 static int ublk_ctrl_get_queue_affinity(struct io_uring_cmd *cmd)
1286 {
1287         struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1288         void __user *argp = (void __user *)(unsigned long)header->addr;
1289         struct blk_mq_hw_ctx *hctx;
1290         struct ublk_device *ub;
1291         unsigned long queue;
1292         unsigned int retlen;
1293         int ret;
1294
1295         ub = ublk_get_device_from_id(header->dev_id);
1296         if (!ub)
1297                 goto out;
1298
1299         ret = -EINVAL;
1300         queue = header->data[0];
1301         if (queue >= ub->dev_info.nr_hw_queues)
1302                 goto out;
1303         hctx = ublk_get_hw_queue(ub, queue);
1304         if (!hctx)
1305                 goto out;
1306
1307         retlen = min_t(unsigned short, header->len, cpumask_size());
1308         if (copy_to_user(argp, hctx->cpumask, retlen)) {
1309                 ret = -EFAULT;
1310                 goto out;
1311         }
1312         if (retlen != header->len) {
1313                 if (clear_user(argp + retlen, header->len - retlen)) {
1314                         ret = -EFAULT;
1315                         goto out;
1316                 }
1317         }
1318         ret = 0;
1319  out:
1320         if (ub)
1321                 ublk_put_device(ub);
1322         return ret;
1323 }
1324
1325 static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_dev_info *info,
1326                 void __user *argp, int idx)
1327 {
1328         struct ublk_device *ub;
1329         int ret;
1330
1331         ret = mutex_lock_killable(&ublk_ctl_mutex);
1332         if (ret)
1333                 return ret;
1334
1335         ub = __ublk_create_dev(idx);
1336         if (!IS_ERR_OR_NULL(ub)) {
1337                 memcpy(&ub->dev_info, info, sizeof(*info));
1338
1339                 /* update device id */
1340                 ub->dev_info.dev_id = ub->ub_number;
1341
1342                 ret = ublk_add_dev(ub);
1343                 if (!ret) {
1344                         if (copy_to_user(argp, &ub->dev_info, sizeof(*info))) {
1345                                 ublk_remove(ub);
1346                                 ret = -EFAULT;
1347                         }
1348                 }
1349         } else {
1350                 if (IS_ERR(ub))
1351                         ret = PTR_ERR(ub);
1352                 else
1353                         ret = -ENOMEM;
1354         }
1355         mutex_unlock(&ublk_ctl_mutex);
1356
1357         return ret;
1358 }
1359
1360 static inline bool ublk_idr_freed(int id)
1361 {
1362         void *ptr;
1363
1364         spin_lock(&ublk_idr_lock);
1365         ptr = idr_find(&ublk_index_idr, id);
1366         spin_unlock(&ublk_idr_lock);
1367
1368         return ptr == NULL;
1369 }
1370
1371 static int ublk_ctrl_del_dev(int idx)
1372 {
1373         struct ublk_device *ub;
1374         int ret;
1375
1376         ret = mutex_lock_killable(&ublk_ctl_mutex);
1377         if (ret)
1378                 return ret;
1379
1380         ub = ublk_get_device_from_id(idx);
1381         if (ub) {
1382                 ublk_remove(ub);
1383                 ublk_put_device(ub);
1384                 ret = 0;
1385         } else {
1386                 ret = -ENODEV;
1387         }
1388
1389         /*
1390          * Wait until the idr entry is removed, so the index can be reused
1391          * once the DEL_DEV command returns.
1392          */
1393         if (!ret)
1394                 wait_event(ublk_idr_wq, ublk_idr_freed(idx));
1395         mutex_unlock(&ublk_ctl_mutex);
1396
1397         return ret;
1398 }
1399
1400
1401 static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
1402 {
1403         pr_devel("%s: dev id %d flags %llx\n", __func__,
1404                         info->dev_id, info->flags[0]);
1405         pr_devel("\t nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n",
1406                         info->nr_hw_queues, info->queue_depth,
1407                         info->block_size, info->dev_blocks);
1408 }
1409
1410 static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd)
1411 {
1412         struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1413
1414         pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
1415                         __func__, cmd->cmd_op, header->dev_id, header->queue_id,
1416                         header->data[0], header->addr, header->len);
1417 }
1418
1419 static int ublk_ctrl_cmd_validate(struct io_uring_cmd *cmd,
1420                 struct ublksrv_ctrl_dev_info *info)
1421 {
1422         struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1423         u32 cmd_op = cmd->cmd_op;
1424         void __user *argp = (void __user *)(unsigned long)header->addr;
1425
1426         if (!capable(CAP_SYS_ADMIN))
1427                 return -EPERM;
1428
1429         switch (cmd_op) {
1430         case UBLK_CMD_GET_DEV_INFO:
1431                 if (header->len < sizeof(*info) || !header->addr)
1432                         return -EINVAL;
1433                 break;
1434         case UBLK_CMD_ADD_DEV:
1435                 if (header->len < sizeof(*info) || !header->addr)
1436                         return -EINVAL;
1437                 if (copy_from_user(info, argp, sizeof(*info)) != 0)
1438                         return -EFAULT;
1439                 ublk_dump_dev_info(info);
1440                 if (header->dev_id != info->dev_id) {
1441                         printk(KERN_WARNING "%s: cmd %x, dev id not match %u %u\n",
1442                                         __func__, cmd_op, header->dev_id,
1443                                         info->dev_id);
1444                         return -EINVAL;
1445                 }
1446                 if (header->queue_id != (u16)-1) {
1447                         printk(KERN_WARNING "%s: cmd %x queue_id is wrong %x\n",
1448                                         __func__, cmd_op, header->queue_id);
1449                         return -EINVAL;
1450                 }
1451                 break;
1452         case UBLK_CMD_GET_QUEUE_AFFINITY:
1453                 if ((header->len * BITS_PER_BYTE) < nr_cpu_ids)
1454                         return -EINVAL;
1455                 if (header->len & (sizeof(unsigned long)-1))
1456                         return -EINVAL;
1457                 if (!header->addr)
1458                         return -EINVAL;
1459         }
1460
1461         return 0;
1462 }
1463
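/*
 * Editorial note: a typical ublksrv lifecycle against this control
 * interface looks roughly like this (illustrative ordering, only partially
 * enforced by the checks below):
 *
 *	UBLK_CMD_ADD_DEV	-> /dev/ublkcN and the disk are allocated
 *	open /dev/ublkcN, mmap the per-queue descriptor buffers
 *	UBLK_IO_FETCH_REQ	-> one per tag, until every queue is ready
 *	UBLK_CMD_START_DEV	-> add_disk(), /dev/ublkbN goes live
 *	serve IO via UBLK_IO_COMMIT_AND_FETCH_REQ
 *	UBLK_CMD_STOP_DEV / UBLK_CMD_DEL_DEV on teardown
 */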
1464 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
1465                 unsigned int issue_flags)
1466 {
1467         struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
1468         void __user *argp = (void __user *)(unsigned long)header->addr;
1469         struct ublksrv_ctrl_dev_info info;
1470         u32 cmd_op = cmd->cmd_op;
1471         struct ublk_device *ub;
1472         int ret = -EINVAL;
1473
1474         ublk_ctrl_cmd_dump(cmd);
1475
1476         if (!(issue_flags & IO_URING_F_SQE128))
1477                 goto out;
1478
1479         ret = ublk_ctrl_cmd_validate(cmd, &info);
1480         if (ret)
1481                 goto out;
1482
1483         ret = -ENODEV;
1484         switch (cmd_op) {
1485         case UBLK_CMD_START_DEV:
1486                 ub = ublk_get_device_from_id(header->dev_id);
1487                 if (ub) {
1488                         ret = ublk_ctrl_start_dev(ub, cmd);
1489                         ublk_put_device(ub);
1490                 }
1491                 break;
1492         case UBLK_CMD_STOP_DEV:
1493                 ub = ublk_get_device_from_id(header->dev_id);
1494                 if (ub) {
1495                         ret = ublk_ctrl_stop_dev(ub);
1496                         ublk_put_device(ub);
1497                 }
1498                 break;
1499         case UBLK_CMD_GET_DEV_INFO:
1500                 ub = ublk_get_device_from_id(header->dev_id);
1501                 if (ub) {
1502                         if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
1503                                 ret = -EFAULT;
1504                         else
1505                                 ret = 0;
1506                         ublk_put_device(ub);
1507                 }
1508                 break;
1509         case UBLK_CMD_ADD_DEV:
1510                 ret = ublk_ctrl_add_dev(&info, argp, header->dev_id);
1511                 break;
1512         case UBLK_CMD_DEL_DEV:
1513                 ret = ublk_ctrl_del_dev(header->dev_id);
1514                 break;
1515         case UBLK_CMD_GET_QUEUE_AFFINITY:
1516                 ret = ublk_ctrl_get_queue_affinity(cmd);
1517                 break;
1518         default:
1519                 break;
1520         }
1521  out:
1522         io_uring_cmd_done(cmd, ret, 0);
1523         pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
1524                         __func__, ret, cmd->cmd_op, header->dev_id, header->queue_id);
1525         return -EIOCBQUEUED;
1526 }
1527
1528 static const struct file_operations ublk_ctl_fops = {
1529         .open           = nonseekable_open,
1530         .uring_cmd      = ublk_ctrl_uring_cmd,
1531         .owner          = THIS_MODULE,
1532         .llseek         = noop_llseek,
1533 };
1534
1535 static struct miscdevice ublk_misc = {
1536         .minor          = MISC_DYNAMIC_MINOR,
1537         .name           = "ublk-control",
1538         .fops           = &ublk_ctl_fops,
1539 };
1540
1541 static int __init ublk_init(void)
1542 {
1543         int ret;
1544
1545         init_waitqueue_head(&ublk_idr_wq);
1546
1547         ret = misc_register(&ublk_misc);
1548         if (ret)
1549                 return ret;
1550
1551         ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
1552         if (ret)
1553                 goto unregister_mis;
1554
1555         ublk_chr_class = class_create(THIS_MODULE, "ublk-char");
1556         if (IS_ERR(ublk_chr_class)) {
1557                 ret = PTR_ERR(ublk_chr_class);
1558                 goto free_chrdev_region;
1559         }
1560         return 0;
1561
1562 free_chrdev_region:
1563         unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
1564 unregister_mis:
1565         misc_deregister(&ublk_misc);
1566         return ret;
1567 }
1568
1569 static void __exit ublk_exit(void)
1570 {
1571         struct ublk_device *ub;
1572         int id;
1573
1574         class_destroy(ublk_chr_class);
1575
1576         misc_deregister(&ublk_misc);
1577
1578         idr_for_each_entry(&ublk_index_idr, ub, id)
1579                 ublk_remove(ub);
1580
1581         idr_destroy(&ublk_index_idr);
1582         unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
1583 }
1584
1585 module_init(ublk_init);
1586 module_exit(ublk_exit);
1587
1588 MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
1589 MODULE_LICENSE("GPL");