drivers/block/ublk_drv.c ([linux-block.git])
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Userspace block device - block device whose IO is handled from userspace
 *
 * Make full use of io_uring passthrough commands for communicating with
 * the ublk userspace daemon (ublksrvd) for handling basic IO requests.
 *
 * Copyright 2022 Ming Lei <ming.lei@redhat.com>
 *
 * (part of code stolen from loop.c)
 */
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/major.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/init.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/compat.h>
#include <linux/mutex.h>
#include <linux/writeback.h>
#include <linux/completion.h>
#include <linux/highmem.h>
#include <linux/sysfs.h>
#include <linux/miscdevice.h>
#include <linux/falloc.h>
#include <linux/uio.h>
#include <linux/ioprio.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/cdev.h>
#include <linux/io_uring.h>
#include <linux/blk-mq.h>
#include <linux/delay.h>
#include <linux/mm.h>
#include <asm/page.h>
#include <linux/task_work.h>
#include <uapi/linux/ublk_cmd.h>

#define UBLK_MINORS	(1U << MINORBITS)

struct ublk_rq_data {
	struct callback_head work;
};

struct ublk_uring_cmd_pdu {
	struct request *req;
};

/*
 * io command is active: sqe cmd is received, and its cqe isn't done
 *
 * If the flag is set, the io command is owned by the ublk driver, which
 * waits for an incoming blk-mq request from the ublk block device.
 *
 * If the flag is cleared, the io command will be completed, and owned by
 * the ublk server.
 */
#define UBLK_IO_FLAG_ACTIVE	0x01

/*
 * IO command is completed via cqe, and it is being handled by ublksrv, and
 * not committed yet
 *
 * Basically exclusive with UBLK_IO_FLAG_ACTIVE, so it can be used for
 * cross verification
 */
#define UBLK_IO_FLAG_OWNED_BY_SRV 0x02

/*
 * IO command is aborted, so this flag is set in case of
 * !UBLK_IO_FLAG_ACTIVE.
 *
 * After this flag is observed, any pending or new incoming request
 * associated with this io command will be failed immediately
 */
#define UBLK_IO_FLAG_ABORTED 0x04

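/*
 * Per-tag io command slot. Life cycle, as described by the flag comments
 * above: the daemon issues FETCH_REQ (ACTIVE set); a blk-mq request arrives
 * and is pushed to the daemon (ACTIVE cleared, OWNED_BY_SRV set); the daemon
 * commits the result via COMMIT_AND_FETCH_REQ, which completes the request
 * and re-arms the slot.
 */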
struct ublk_io {
	/* userspace buffer address from io cmd */
	__u64	addr;
	unsigned int flags;
	int res;

	struct io_uring_cmd *cmd;
};

struct ublk_queue {
	int q_id;
	int q_depth;

	unsigned long flags;
	struct task_struct *ubq_daemon;
	char *io_cmd_buf;

	unsigned long io_addr;	/* mapped vm address */
	unsigned int max_io_sz;
	bool abort_work_pending;
	unsigned short nr_io_ready;	/* how many ios setup */
	struct ublk_device *dev;
	struct ublk_io ios[0];
};

#define UBLK_DAEMON_MONITOR_PERIOD	(5 * HZ)

struct ublk_device {
	struct gendisk		*ub_disk;
	struct request_queue	*ub_queue;

	char			*__queues;

	unsigned short		queue_size;
	unsigned short		bs_shift;
	struct ublksrv_ctrl_dev_info	dev_info;

	struct blk_mq_tag_set	tag_set;

	struct cdev		cdev;
	struct device		cdev_dev;

	atomic_t		ch_open_cnt;
	int			ub_number;

	struct mutex		mutex;

	struct mm_struct	*mm;

	struct completion	completion;
	unsigned int		nr_queues_ready;
	atomic_t		nr_aborted_queues;

	/*
	 * Our ubq->daemon may be killed without any notification, so
	 * monitor each queue's daemon periodically
	 */
	struct delayed_work	monitor_work;
	struct work_struct	stop_work;
};

static dev_t ublk_chr_devt;
static struct class *ublk_chr_class;

static DEFINE_IDR(ublk_index_idr);
static DEFINE_SPINLOCK(ublk_idr_lock);
static wait_queue_head_t ublk_idr_wq;	/* wait until one idr is freed */

static DEFINE_MUTEX(ublk_ctl_mutex);

static struct miscdevice ublk_misc;

static struct lock_class_key ublk_bio_compl_lkclass;

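/*
 * task_work_add() is only usable when the driver is built in (it is not
 * exported to modules), hence the IS_BUILTIN() check below; otherwise IO
 * dispatch falls back to io_uring_cmd_complete_in_task().
 */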
static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq)
{
	if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) &&
			!(ubq->flags & UBLK_F_URING_CMD_COMP_IN_TASK))
		return true;
	return false;
}

static struct ublk_device *ublk_get_device(struct ublk_device *ub)
{
	if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
		return ub;
	return NULL;
}

static void ublk_put_device(struct ublk_device *ub)
{
	put_device(&ub->cdev_dev);
}

static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
		int qid)
{
	return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
}

static inline bool ublk_rq_has_data(const struct request *rq)
{
	return rq->bio && bio_has_data(rq->bio);
}

static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
		int tag)
{
	return (struct ublksrv_io_desc *)
		&(ubq->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]);
}

static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
{
	return ublk_get_queue(ub, q_id)->io_cmd_buf;
}

static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id)
{
	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);

	return round_up(ubq->q_depth * sizeof(struct ublksrv_io_desc),
			PAGE_SIZE);
}

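/*
 * The per-queue io_cmd_buf is a tag-indexed array of struct ublksrv_io_desc,
 * rounded up to a page boundary. ublksrv mmaps it read-only through the char
 * device (see ublk_ch_mmap()), so it can read the descriptor for a tag after
 * each fetch completion.
 */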
static int ublk_open(struct block_device *bdev, fmode_t mode)
{
	return 0;
}

static void ublk_release(struct gendisk *disk, fmode_t mode)
{
}

static const struct block_device_operations ub_fops = {
	.owner =	THIS_MODULE,
	.open =		ublk_open,
	.release =	ublk_release,
};

#define UBLK_MAX_PIN_PAGES	32

struct ublk_map_data {
	const struct ublk_queue *ubq;
	const struct request *rq;
	const struct ublk_io *io;
	unsigned max_bytes;
};

struct ublk_io_iter {
	struct page *pages[UBLK_MAX_PIN_PAGES];
	unsigned pg_off;	/* offset in the 1st page in pages */
	int nr_pages;		/* how many page pointers in pages */
	struct bio *bio;
	struct bvec_iter iter;
};

static inline unsigned ublk_copy_io_pages(struct ublk_io_iter *data,
		unsigned max_bytes, bool to_vm)
{
	const unsigned total = min_t(unsigned, max_bytes,
			PAGE_SIZE - data->pg_off +
			((data->nr_pages - 1) << PAGE_SHIFT));
	unsigned done = 0;
	unsigned pg_idx = 0;

	while (done < total) {
		struct bio_vec bv = bio_iter_iovec(data->bio, data->iter);
		const unsigned int bytes = min3(bv.bv_len, total - done,
				(unsigned)(PAGE_SIZE - data->pg_off));
		void *bv_buf = bvec_kmap_local(&bv);
		void *pg_buf = kmap_local_page(data->pages[pg_idx]);

		if (to_vm)
			memcpy(pg_buf + data->pg_off, bv_buf, bytes);
		else
			memcpy(bv_buf, pg_buf + data->pg_off, bytes);

		kunmap_local(pg_buf);
		kunmap_local(bv_buf);

		/* advance page array */
		data->pg_off += bytes;
		if (data->pg_off == PAGE_SIZE) {
			pg_idx += 1;
			data->pg_off = 0;
		}

		done += bytes;

		/* advance bio */
		bio_advance_iter_single(data->bio, &data->iter, bytes);
		if (!data->iter.bi_size) {
			data->bio = data->bio->bi_next;
			if (data->bio == NULL)
				break;
			data->iter = data->bio->bi_iter;
		}
	}

	return done;
}

static inline int ublk_copy_user_pages(struct ublk_map_data *data,
		bool to_vm)
{
	const unsigned int gup_flags = to_vm ? FOLL_WRITE : 0;
	const unsigned long start_vm = data->io->addr;
	unsigned int done = 0;
	struct ublk_io_iter iter = {
		.pg_off	= start_vm & (PAGE_SIZE - 1),
		.bio	= data->rq->bio,
		.iter	= data->rq->bio->bi_iter,
	};
	const unsigned int nr_pages = round_up(data->max_bytes +
			(start_vm & (PAGE_SIZE - 1)), PAGE_SIZE) >> PAGE_SHIFT;

	while (done < nr_pages) {
		const unsigned to_pin = min_t(unsigned, UBLK_MAX_PIN_PAGES,
				nr_pages - done);
		unsigned i, len;

		iter.nr_pages = get_user_pages_fast(start_vm +
				(done << PAGE_SHIFT), to_pin, gup_flags,
				iter.pages);
		if (iter.nr_pages <= 0)
			return done == 0 ? iter.nr_pages : done;
		len = ublk_copy_io_pages(&iter, data->max_bytes, to_vm);
		for (i = 0; i < iter.nr_pages; i++) {
			if (to_vm)
				set_page_dirty(iter.pages[i]);
			put_page(iter.pages[i]);
		}
		data->max_bytes -= len;
		done += iter.nr_pages;
	}

	return done;
}

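/*
 * Copy between the request bio pages and the daemon's userspace buffer
 * (io->addr): to_vm == true copies request data into the daemon's buffer
 * (WRITE path), to_vm == false copies the daemon's buffer back into the
 * request (READ completion path). User pages are pinned in batches of up to
 * UBLK_MAX_PIN_PAGES via get_user_pages_fast().
 */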
static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
		struct ublk_io *io)
{
	const unsigned int rq_bytes = blk_rq_bytes(req);
	/*
	 * No zero copy: we delay copying WRITE request data into the ublksrv
	 * context, and the big benefit is that pinning pages in the current
	 * context is pretty fast, see ublk_copy_user_pages()
	 */
	if (req_op(req) != REQ_OP_WRITE && req_op(req) != REQ_OP_FLUSH)
		return rq_bytes;

	if (ublk_rq_has_data(req)) {
		struct ublk_map_data data = {
			.ubq	=	ubq,
			.rq	=	req,
			.io	=	io,
			.max_bytes =	rq_bytes,
		};

		ublk_copy_user_pages(&data, true);

		return rq_bytes - data.max_bytes;
	}
	return rq_bytes;
}

static int ublk_unmap_io(const struct ublk_queue *ubq,
		const struct request *req,
		struct ublk_io *io)
{
	const unsigned int rq_bytes = blk_rq_bytes(req);

	if (req_op(req) == REQ_OP_READ && ublk_rq_has_data(req)) {
		struct ublk_map_data data = {
			.ubq	=	ubq,
			.rq	=	req,
			.io	=	io,
			.max_bytes =	io->res,
		};

		WARN_ON_ONCE(io->res > rq_bytes);

		ublk_copy_user_pages(&data, false);

		return io->res - data.max_bytes;
	}
	return rq_bytes;
}

static inline unsigned int ublk_req_build_flags(struct request *req)
{
	unsigned flags = 0;

	if (req->cmd_flags & REQ_FAILFAST_DEV)
		flags |= UBLK_IO_F_FAILFAST_DEV;

	if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
		flags |= UBLK_IO_F_FAILFAST_TRANSPORT;

	if (req->cmd_flags & REQ_FAILFAST_DRIVER)
		flags |= UBLK_IO_F_FAILFAST_DRIVER;

	if (req->cmd_flags & REQ_META)
		flags |= UBLK_IO_F_META;

	if (req->cmd_flags & REQ_INTEGRITY)
		flags |= UBLK_IO_F_INTEGRITY;

	if (req->cmd_flags & REQ_FUA)
		flags |= UBLK_IO_F_FUA;

	if (req->cmd_flags & REQ_PREFLUSH)
		flags |= UBLK_IO_F_PREFLUSH;

	if (req->cmd_flags & REQ_NOUNMAP)
		flags |= UBLK_IO_F_NOUNMAP;

	if (req->cmd_flags & REQ_SWAP)
		flags |= UBLK_IO_F_SWAP;

	return flags;
}

static int ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
{
	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
	struct ublk_io *io = &ubq->ios[req->tag];
	u32 ublk_op;

	switch (req_op(req)) {
	case REQ_OP_READ:
		ublk_op = UBLK_IO_OP_READ;
		break;
	case REQ_OP_WRITE:
		ublk_op = UBLK_IO_OP_WRITE;
		break;
	case REQ_OP_FLUSH:
		ublk_op = UBLK_IO_OP_FLUSH;
		break;
	case REQ_OP_DISCARD:
		ublk_op = UBLK_IO_OP_DISCARD;
		break;
	case REQ_OP_WRITE_ZEROES:
		ublk_op = UBLK_IO_OP_WRITE_ZEROES;
		break;
	default:
		return BLK_STS_IOERR;
	}

	/* need to translate since kernel REQ_OP_* numbering may change */
	iod->op_flags = ublk_op | ublk_req_build_flags(req);
	iod->nr_sectors = blk_rq_sectors(req);
	iod->start_sector = blk_rq_pos(req);
	iod->addr = io->addr;

	return BLK_STS_OK;
}

static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
		struct io_uring_cmd *ioucmd)
{
	return (struct ublk_uring_cmd_pdu *)&ioucmd->pdu;
}

static bool ubq_daemon_is_dying(struct ublk_queue *ubq)
{
	return ubq->ubq_daemon->flags & PF_EXITING;
}

/* todo: handle partial completion */
static void ublk_complete_rq(struct request *req)
{
	struct ublk_queue *ubq = req->mq_hctx->driver_data;
	struct ublk_io *io = &ubq->ios[req->tag];
	unsigned int unmapped_bytes;

	/* fail the read IO if nothing was read */
	if (!io->res && req_op(req) == REQ_OP_READ)
		io->res = -EIO;

	if (io->res < 0) {
		blk_mq_end_request(req, errno_to_blk_status(io->res));
		return;
	}

	/*
	 * FLUSH or DISCARD usually won't return any data bytes, so end them
	 * directly.
	 *
	 * Neither of the two needs unmapping.
	 */
	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) {
		blk_mq_end_request(req, BLK_STS_OK);
		return;
	}

	/* for a READ request, write the data in iod->addr to the rq buffers */
	unmapped_bytes = ublk_unmap_io(ubq, req, io);

	/*
	 * Extremely unlikely, since the data was filled in just before.
	 *
	 * Re-read simply for this unlikely case.
	 */
	if (unlikely(unmapped_bytes < io->res))
		io->res = unmapped_bytes;

	if (blk_update_request(req, BLK_STS_OK, io->res))
		blk_mq_requeue_request(req, true);
	else
		__blk_mq_end_request(req, BLK_STS_OK);
}

/*
 * __ublk_fail_req() may be called from abort context or ->ubq_daemon
 * context during exiting, so lock is required.
 *
 * Also aborting may not have started yet; keep in mind that one failed
 * request may be issued by the block layer again.
 */
static void __ublk_fail_req(struct ublk_io *io, struct request *req)
{
	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);

	if (!(io->flags & UBLK_IO_FLAG_ABORTED)) {
		io->flags |= UBLK_IO_FLAG_ABORTED;
		blk_mq_end_request(req, BLK_STS_IOERR);
	}
}

#define UBLK_REQUEUE_DELAY_MS	3

static inline void __ublk_rq_task_work(struct request *req)
{
	struct ublk_queue *ubq = req->mq_hctx->driver_data;
	struct ublk_device *ub = ubq->dev;
	int tag = req->tag;
	struct ublk_io *io = &ubq->ios[tag];
	bool task_exiting = current != ubq->ubq_daemon ||
		(current->flags & PF_EXITING);
	unsigned int mapped_bytes;

	pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n",
			__func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
			ublk_get_iod(ubq, req->tag)->addr);

	if (unlikely(task_exiting)) {
		blk_mq_end_request(req, BLK_STS_IOERR);
		mod_delayed_work(system_wq, &ub->monitor_work, 0);
		return;
	}

	mapped_bytes = ublk_map_io(ubq, req, io);

	/* partially mapped, update io descriptor */
	if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
		/*
		 * Nothing mapped, retry until we succeed.
		 *
		 * We may never succeed in mapping any bytes here because
		 * of OOM. TODO: reserve one buffer with single page pinned
		 * for providing forward progress guarantee.
		 */
		if (unlikely(!mapped_bytes)) {
			blk_mq_requeue_request(req, false);
			blk_mq_delay_kick_requeue_list(req->q,
					UBLK_REQUEUE_DELAY_MS);
			return;
		}

		ublk_get_iod(ubq, req->tag)->nr_sectors =
			mapped_bytes >> 9;
	}

	/* mark this cmd owned by ublksrv */
	io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;

	/*
	 * Clear ACTIVE since we are done with this sqe/cmd slot.
	 * We can only accept io cmd in case of being not active.
	 */
	io->flags &= ~UBLK_IO_FLAG_ACTIVE;

	/* tell ublksrv one io request is coming */
	io_uring_cmd_done(io->cmd, UBLK_IO_RES_OK, 0);
}

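/*
 * __ublk_rq_task_work() must run in the ubq daemon's task context. It is
 * reached through one of two paths: task_work_add() with the per-request
 * callback_head when the driver is built in, or
 * io_uring_cmd_complete_in_task() on the fetch command otherwise (see
 * ublk_queue_rq()).
 */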
static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd)
{
	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);

	__ublk_rq_task_work(pdu->req);
}

static void ublk_rq_task_work_fn(struct callback_head *work)
{
	struct ublk_rq_data *data = container_of(work,
			struct ublk_rq_data, work);
	struct request *req = blk_mq_rq_from_pdu(data);

	__ublk_rq_task_work(req);
}

static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
		const struct blk_mq_queue_data *bd)
{
	struct ublk_queue *ubq = hctx->driver_data;
	struct request *rq = bd->rq;
	blk_status_t res;

	/* fill iod to slot in io cmd buffer */
	res = ublk_setup_iod(ubq, rq);
	if (unlikely(res != BLK_STS_OK))
		return BLK_STS_IOERR;

	blk_mq_start_request(bd->rq);

	if (unlikely(ubq_daemon_is_dying(ubq))) {
 fail:
		mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0);
		return BLK_STS_IOERR;
	}

	if (ublk_can_use_task_work(ubq)) {
		struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
		enum task_work_notify_mode notify_mode = bd->last ?
			TWA_SIGNAL_NO_IPI : TWA_NONE;

		if (task_work_add(ubq->ubq_daemon, &data->work, notify_mode))
			goto fail;
	} else {
		struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
		struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);

		pdu->req = rq;
		io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
	}

	return BLK_STS_OK;
}

static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	struct ublk_queue *ubq = hctx->driver_data;

	if (ublk_can_use_task_work(ubq))
		__set_notify_signal(ubq->ubq_daemon);
}

static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
		unsigned int hctx_idx)
{
	struct ublk_device *ub = driver_data;
	struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);

	hctx->driver_data = ubq;
	return 0;
}

static int ublk_init_rq(struct blk_mq_tag_set *set, struct request *req,
		unsigned int hctx_idx, unsigned int numa_node)
{
	struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);

	init_task_work(&data->work, ublk_rq_task_work_fn);
	return 0;
}

static const struct blk_mq_ops ublk_mq_ops = {
	.queue_rq       = ublk_queue_rq,
	.commit_rqs     = ublk_commit_rqs,
	.init_hctx	= ublk_init_hctx,
	.init_request   = ublk_init_rq,
};

static int ublk_ch_open(struct inode *inode, struct file *filp)
{
	struct ublk_device *ub = container_of(inode->i_cdev,
			struct ublk_device, cdev);

	if (atomic_cmpxchg(&ub->ch_open_cnt, 0, 1) == 0) {
		filp->private_data = ub;
		return 0;
	}
	return -EBUSY;
}

static int ublk_ch_release(struct inode *inode, struct file *filp)
{
	struct ublk_device *ub = filp->private_data;

	while (atomic_cmpxchg(&ub->ch_open_cnt, 1, 0) != 1)
		cpu_relax();

	filp->private_data = NULL;
	return 0;
}

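/*
 * The mmap offset encodes the queue id: userspace maps at
 * UBLKSRV_CMD_BUF_OFFSET + q_id * (UBLK_MAX_QUEUE_DEPTH *
 * sizeof(struct ublksrv_io_desc)). The mapping is read-only and restricted
 * to a single mm, the first one that mmaps the char device.
 */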
/* map pre-allocated per-queue cmd buffer to ublksrv daemon */
static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct ublk_device *ub = filp->private_data;
	size_t sz = vma->vm_end - vma->vm_start;
	unsigned max_sz = UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc);
	unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
	int q_id, ret = 0;

	mutex_lock(&ub->mutex);
	if (!ub->mm)
		ub->mm = current->mm;
	if (current->mm != ub->mm)
		ret = -EINVAL;
	mutex_unlock(&ub->mutex);

	if (ret)
		return ret;

	if (vma->vm_flags & VM_WRITE)
		return -EPERM;

	end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
	if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
		return -EINVAL;

	q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
	pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
			__func__, q_id, current->pid, vma->vm_start,
			phys_off, (unsigned long)sz);

	if (sz != ublk_queue_cmd_buf_size(ub, q_id))
		return -EINVAL;

	pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}

static void ublk_commit_completion(struct ublk_device *ub,
		struct ublksrv_io_cmd *ub_cmd)
{
	u32 qid = ub_cmd->q_id, tag = ub_cmd->tag;
	struct ublk_queue *ubq = ublk_get_queue(ub, qid);
	struct ublk_io *io = &ubq->ios[tag];
	struct request *req;

	/* now this cmd slot is owned by the ublk driver */
	io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
	io->res = ub_cmd->result;

	/* find the io request and complete */
	req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);

	if (req && likely(!blk_should_fake_timeout(req->q)))
		ublk_complete_rq(req);
}

/*
 * When ->ubq_daemon is exiting, either new requests are ended immediately,
 * or any queued io command is drained, so it is safe to abort the queue
 * locklessly
 */
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
{
	int i;

	if (!ublk_get_device(ub))
		return;

	for (i = 0; i < ubq->q_depth; i++) {
		struct ublk_io *io = &ubq->ios[i];

		if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
			struct request *rq;

			/*
			 * Either we fail the request or ublk_rq_task_work_fn
			 * will do it
			 */
			rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
			if (rq)
				__ublk_fail_req(io, rq);
		}
	}
	ublk_put_device(ub);
}

static void ublk_daemon_monitor_work(struct work_struct *work)
{
	struct ublk_device *ub =
		container_of(work, struct ublk_device, monitor_work.work);
	int i;

	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
		struct ublk_queue *ubq = ublk_get_queue(ub, i);

		if (ubq_daemon_is_dying(ubq)) {
			schedule_work(&ub->stop_work);

			/* abort queue is for making forward progress */
			ublk_abort_queue(ub, ubq);
		}
	}

	/*
	 * We can't schedule monitor work after ublk_remove() is started.
	 *
	 * No need for ub->mutex: monitor work is canceled after state is
	 * marked as DEAD, so DEAD state is observed reliably.
	 */
	if (ub->dev_info.state != UBLK_S_DEV_DEAD)
		schedule_delayed_work(&ub->monitor_work,
				UBLK_DAEMON_MONITOR_PERIOD);
}

static void ublk_cancel_queue(struct ublk_queue *ubq)
{
	int i;

	for (i = 0; i < ubq->q_depth; i++) {
		struct ublk_io *io = &ubq->ios[i];

		if (io->flags & UBLK_IO_FLAG_ACTIVE)
			io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0);
	}
}

/* Cancel all pending commands, must be called after del_gendisk() returns */
static void ublk_cancel_dev(struct ublk_device *ub)
{
	int i;

	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
		ublk_cancel_queue(ublk_get_queue(ub, i));
}

static void ublk_stop_dev(struct ublk_device *ub)
{
	mutex_lock(&ub->mutex);
	if (!disk_live(ub->ub_disk))
		goto unlock;

	del_gendisk(ub->ub_disk);
	ub->dev_info.state = UBLK_S_DEV_DEAD;
	ub->dev_info.ublksrv_pid = -1;
	ublk_cancel_dev(ub);
 unlock:
	mutex_unlock(&ub->mutex);
	cancel_delayed_work_sync(&ub->monitor_work);
}

static int ublk_ctrl_stop_dev(struct ublk_device *ub)
{
	ublk_stop_dev(ub);
	cancel_work_sync(&ub->stop_work);
	return 0;
}

static inline bool ublk_queue_ready(struct ublk_queue *ubq)
{
	return ubq->nr_io_ready == ubq->q_depth;
}

/* device can only be started after all IOs are ready */
static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
{
	mutex_lock(&ub->mutex);
	ubq->nr_io_ready++;
	if (ublk_queue_ready(ubq)) {
		ubq->ubq_daemon = current;
		get_task_struct(ubq->ubq_daemon);
		ub->nr_queues_ready++;
	}
	if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues)
		complete_all(&ub->completion);
	mutex_unlock(&ub->mutex);
}

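/*
 * Per-io commands from ublksrv, issued as io_uring passthrough commands on
 * the char device: FETCH_REQ arms an idle slot (only allowed before the
 * queue is fully set up), and COMMIT_AND_FETCH_REQ both commits the result
 * of the previous request for this tag and re-arms the slot for the next
 * one.
 */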
static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
	struct ublksrv_io_cmd *ub_cmd = (struct ublksrv_io_cmd *)cmd->cmd;
	struct ublk_device *ub = cmd->file->private_data;
	struct ublk_queue *ubq;
	struct ublk_io *io;
	u32 cmd_op = cmd->cmd_op;
	unsigned tag = ub_cmd->tag;
	int ret = -EINVAL;

	pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
			__func__, cmd->cmd_op, ub_cmd->q_id, tag,
			ub_cmd->result);

	if (!(issue_flags & IO_URING_F_SQE128))
		goto out;

	if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
		goto out;

	ubq = ublk_get_queue(ub, ub_cmd->q_id);
	if (!ubq || ub_cmd->q_id != ubq->q_id)
		goto out;

	if (ubq->ubq_daemon && ubq->ubq_daemon != current)
		goto out;

	if (tag >= ubq->q_depth)
		goto out;

	io = &ubq->ios[tag];

	/* there is pending io cmd, something must be wrong */
	if (io->flags & UBLK_IO_FLAG_ACTIVE) {
		ret = -EBUSY;
		goto out;
	}

	switch (cmd_op) {
	case UBLK_IO_FETCH_REQ:
		/* UBLK_IO_FETCH_REQ is only allowed before the queue is set up */
		if (ublk_queue_ready(ubq)) {
			ret = -EBUSY;
			goto out;
		}
		/*
		 * The io is being handled by server, so COMMIT_RQ is expected
		 * instead of FETCH_REQ
		 */
		if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
			goto out;
		/* FETCH_REQ has to provide IO buffer */
		if (!ub_cmd->addr)
			goto out;
		io->cmd = cmd;
		io->flags |= UBLK_IO_FLAG_ACTIVE;
		io->addr = ub_cmd->addr;

		ublk_mark_io_ready(ub, ubq);
		break;
	case UBLK_IO_COMMIT_AND_FETCH_REQ:
		/* COMMIT_AND_FETCH_REQ has to provide IO buffer too */
		if (!ub_cmd->addr)
			goto out;
		if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
			goto out;
		io->addr = ub_cmd->addr;
		io->flags |= UBLK_IO_FLAG_ACTIVE;
		io->cmd = cmd;
		ublk_commit_completion(ub, ub_cmd);
		break;
	default:
		goto out;
	}
	return -EIOCBQUEUED;

 out:
	io->flags &= ~UBLK_IO_FLAG_ACTIVE;
	io_uring_cmd_done(cmd, ret, 0);
	pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
			__func__, cmd_op, tag, ret, io->flags);
	return -EIOCBQUEUED;
}

static const struct file_operations ublk_ch_fops = {
	.owner = THIS_MODULE,
	.open = ublk_ch_open,
	.release = ublk_ch_release,
	.llseek = no_llseek,
	.uring_cmd = ublk_ch_uring_cmd,
	.mmap = ublk_ch_mmap,
};

static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
{
	int size = ublk_queue_cmd_buf_size(ub, q_id);
	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);

	if (ubq->ubq_daemon)
		put_task_struct(ubq->ubq_daemon);
	if (ubq->io_cmd_buf)
		free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
}

static int ublk_init_queue(struct ublk_device *ub, int q_id)
{
	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
	void *ptr;
	int size;

	ubq->flags = ub->dev_info.flags[0];
	ubq->q_id = q_id;
	ubq->q_depth = ub->dev_info.queue_depth;
	size = ublk_queue_cmd_buf_size(ub, q_id);

	ptr = (void *) __get_free_pages(gfp_flags, get_order(size));
	if (!ptr)
		return -ENOMEM;

	ubq->io_cmd_buf = ptr;
	ubq->dev = ub;
	return 0;
}

static void ublk_deinit_queues(struct ublk_device *ub)
{
	int nr_queues = ub->dev_info.nr_hw_queues;
	int i;

	if (!ub->__queues)
		return;

	for (i = 0; i < nr_queues; i++)
		ublk_deinit_queue(ub, i);
	kfree(ub->__queues);
}

static int ublk_init_queues(struct ublk_device *ub)
{
	int nr_queues = ub->dev_info.nr_hw_queues;
	int depth = ub->dev_info.queue_depth;
	int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io);
	int i, ret = -ENOMEM;

	ub->queue_size = ubq_size;
	ub->__queues = kcalloc(nr_queues, ubq_size, GFP_KERNEL);
	if (!ub->__queues)
		return ret;

	for (i = 0; i < nr_queues; i++) {
		if (ublk_init_queue(ub, i))
			goto fail;
	}

	init_completion(&ub->completion);
	return 0;

 fail:
	ublk_deinit_queues(ub);
	return ret;
}

static int __ublk_alloc_dev_number(struct ublk_device *ub, int idx)
{
	int i = idx;
	int err;

	spin_lock(&ublk_idr_lock);
	/* allocate id, if @idx >= 0, we're requesting that specific id */
	if (i >= 0) {
		err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
		if (err == -ENOSPC)
			err = -EEXIST;
	} else {
		err = idr_alloc(&ublk_index_idr, ub, 0, 0, GFP_NOWAIT);
	}
	spin_unlock(&ublk_idr_lock);

	if (err >= 0)
		ub->ub_number = err;

	return err;
}

static struct ublk_device *__ublk_create_dev(int idx)
{
	struct ublk_device *ub = NULL;
	int ret;

	ub = kzalloc(sizeof(*ub), GFP_KERNEL);
	if (!ub)
		return ERR_PTR(-ENOMEM);

	ret = __ublk_alloc_dev_number(ub, idx);
	if (ret < 0) {
		kfree(ub);
		return ERR_PTR(ret);
	}
	return ub;
}

static void __ublk_destroy_dev(struct ublk_device *ub)
{
	spin_lock(&ublk_idr_lock);
	idr_remove(&ublk_index_idr, ub->ub_number);
	wake_up_all(&ublk_idr_wq);
	spin_unlock(&ublk_idr_lock);

	mutex_destroy(&ub->mutex);

	kfree(ub);
}

static void ublk_cdev_rel(struct device *dev)
{
	struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);

	blk_mq_destroy_queue(ub->ub_queue);

	put_disk(ub->ub_disk);

	blk_mq_free_tag_set(&ub->tag_set);

	ublk_deinit_queues(ub);

	__ublk_destroy_dev(ub);
}

static int ublk_add_chdev(struct ublk_device *ub)
{
	struct device *dev = &ub->cdev_dev;
	int minor = ub->ub_number;
	int ret;

	dev->parent = ublk_misc.this_device;
	dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
	dev->class = ublk_chr_class;
	dev->release = ublk_cdev_rel;
	device_initialize(dev);

	ret = dev_set_name(dev, "ublkc%d", minor);
	if (ret)
		goto fail;

	cdev_init(&ub->cdev, &ublk_ch_fops);
	ret = cdev_device_add(&ub->cdev, dev);
	if (ret)
		goto fail;
	return 0;
 fail:
	put_device(dev);
	return ret;
}

static void ublk_stop_work_fn(struct work_struct *work)
{
	struct ublk_device *ub =
		container_of(work, struct ublk_device, stop_work);

	ublk_stop_dev(ub);
}

static void ublk_update_capacity(struct ublk_device *ub)
{
	unsigned int max_rq_bytes;

	/* make max request buffer size aligned with PAGE_SIZE */
	max_rq_bytes = round_down(ub->dev_info.rq_max_blocks <<
			ub->bs_shift, PAGE_SIZE);
	ub->dev_info.rq_max_blocks = max_rq_bytes >> ub->bs_shift;

	set_capacity(ub->ub_disk, ub->dev_info.dev_blocks << (ub->bs_shift - 9));
}

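/*
 * Only the char device (ublkcN) is registered here; the block device
 * (ublkbN) is allocated but add_disk() is deferred until ublksrv sends
 * UBLK_CMD_START_DEV, once every queue's io slots have been fetched.
 */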
/* add disk & cdev, cleanup everything in case of failure */
static int ublk_add_dev(struct ublk_device *ub)
{
	struct gendisk *disk;
	int err = -ENOMEM;
	int bsize;

	/* We are not ready to support zero copy */
	ub->dev_info.flags[0] &= ~UBLK_F_SUPPORT_ZERO_COPY;

	bsize = ub->dev_info.block_size;
	ub->bs_shift = ilog2(bsize);

	ub->dev_info.nr_hw_queues = min_t(unsigned int,
			ub->dev_info.nr_hw_queues, nr_cpu_ids);

	INIT_WORK(&ub->stop_work, ublk_stop_work_fn);
	INIT_DELAYED_WORK(&ub->monitor_work, ublk_daemon_monitor_work);

	if (ublk_init_queues(ub))
		goto out_destroy_dev;

	ub->tag_set.ops = &ublk_mq_ops;
	ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
	ub->tag_set.queue_depth = ub->dev_info.queue_depth;
	ub->tag_set.numa_node = NUMA_NO_NODE;
	ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
	ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
	ub->tag_set.driver_data = ub;

	err = blk_mq_alloc_tag_set(&ub->tag_set);
	if (err)
		goto out_deinit_queues;

	ub->ub_queue = blk_mq_init_queue(&ub->tag_set);
	if (IS_ERR(ub->ub_queue))
		goto out_cleanup_tags;
	ub->ub_queue->queuedata = ub;

	disk = ub->ub_disk = blk_mq_alloc_disk_for_queue(ub->ub_queue,
						&ublk_bio_compl_lkclass);
	if (IS_ERR(disk)) {
		err = PTR_ERR(disk);
		goto out_free_request_queue;
	}

	blk_queue_logical_block_size(ub->ub_queue, bsize);
	blk_queue_physical_block_size(ub->ub_queue, bsize);
	blk_queue_io_min(ub->ub_queue, bsize);

	blk_queue_max_hw_sectors(ub->ub_queue, ub->dev_info.rq_max_blocks <<
			(ub->bs_shift - 9));

	ub->ub_queue->limits.discard_granularity = PAGE_SIZE;

	blk_queue_max_discard_sectors(ub->ub_queue, UINT_MAX >> 9);
	blk_queue_max_write_zeroes_sectors(ub->ub_queue, UINT_MAX >> 9);

	ublk_update_capacity(ub);

	disk->fops = &ub_fops;
	disk->private_data = ub;
	disk->queue = ub->ub_queue;
	sprintf(disk->disk_name, "ublkb%d", ub->ub_number);

	mutex_init(&ub->mutex);

	/* add char dev so that ublksrv daemon can be setup */
	err = ublk_add_chdev(ub);
	if (err)
		return err;

	/* don't expose the disk until we get the start command from cdev */

	return 0;

out_free_request_queue:
	blk_mq_destroy_queue(ub->ub_queue);
out_cleanup_tags:
	blk_mq_free_tag_set(&ub->tag_set);
out_deinit_queues:
	ublk_deinit_queues(ub);
out_destroy_dev:
	__ublk_destroy_dev(ub);
	return err;
}

static void ublk_remove(struct ublk_device *ub)
{
	ublk_ctrl_stop_dev(ub);

	cdev_device_del(&ub->cdev, &ub->cdev_dev);
	put_device(&ub->cdev_dev);
}

static struct ublk_device *ublk_get_device_from_id(int idx)
{
	struct ublk_device *ub = NULL;

	if (idx < 0)
		return NULL;

	spin_lock(&ublk_idr_lock);
	ub = idr_find(&ublk_index_idr, idx);
	if (ub)
		ub = ublk_get_device(ub);
	spin_unlock(&ublk_idr_lock);

	return ub;
}

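/*
 * START_DEV waits on ub->completion, which is completed by
 * ublk_mark_io_ready() once every io of every queue has been fetched by
 * the daemon; only then is the disk added and the state set to LIVE.
 */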
static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
{
	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
	int ret = -EINVAL;
	int ublksrv_pid = (int)header->data[0];
	unsigned long dev_blocks = header->data[1];

	if (ublksrv_pid <= 0)
		return ret;

	wait_for_completion_interruptible(&ub->completion);

	schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD);

	mutex_lock(&ub->mutex);
	if (!disk_live(ub->ub_disk)) {
		/* We may get the disk size updated */
		if (dev_blocks) {
			ub->dev_info.dev_blocks = dev_blocks;
			ublk_update_capacity(ub);
		}
		ub->dev_info.ublksrv_pid = ublksrv_pid;
		ret = add_disk(ub->ub_disk);
		if (!ret)
			ub->dev_info.state = UBLK_S_DEV_LIVE;
	} else {
		ret = -EEXIST;
	}
	mutex_unlock(&ub->mutex);

	return ret;
}

static struct blk_mq_hw_ctx *ublk_get_hw_queue(struct ublk_device *ub,
		unsigned int index)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned long i;

	queue_for_each_hw_ctx(ub->ub_queue, hctx, i)
		if (hctx->queue_num == index)
			return hctx;
	return NULL;
}

static int ublk_ctrl_get_queue_affinity(struct io_uring_cmd *cmd)
{
	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
	void __user *argp = (void __user *)(unsigned long)header->addr;
	struct blk_mq_hw_ctx *hctx;
	struct ublk_device *ub;
	unsigned long queue;
	unsigned int retlen;
	int ret = -EINVAL;

	ub = ublk_get_device_from_id(header->dev_id);
	if (!ub)
		goto out;

	ret = -EINVAL;
	queue = header->data[0];
	if (queue >= ub->dev_info.nr_hw_queues)
		goto out;
	hctx = ublk_get_hw_queue(ub, queue);
	if (!hctx)
		goto out;

	retlen = min_t(unsigned short, header->len, cpumask_size());
	if (copy_to_user(argp, hctx->cpumask, retlen)) {
		ret = -EFAULT;
		goto out;
	}
	if (retlen != header->len) {
		if (clear_user(argp + retlen, header->len - retlen)) {
			ret = -EFAULT;
			goto out;
		}
	}
	ret = 0;
 out:
	if (ub)
		ublk_put_device(ub);
	return ret;
}

static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_dev_info *info,
		void __user *argp, int idx)
{
	struct ublk_device *ub;
	int ret;

	ret = mutex_lock_killable(&ublk_ctl_mutex);
	if (ret)
		return ret;

	ub = __ublk_create_dev(idx);
	if (!IS_ERR_OR_NULL(ub)) {
		memcpy(&ub->dev_info, info, sizeof(*info));

		/* update device id */
		ub->dev_info.dev_id = ub->ub_number;

		ret = ublk_add_dev(ub);
		if (!ret) {
			if (copy_to_user(argp, &ub->dev_info, sizeof(*info))) {
				ublk_remove(ub);
				ret = -EFAULT;
			}
		}
	} else {
		if (IS_ERR(ub))
			ret = PTR_ERR(ub);
		else
			ret = -ENOMEM;
	}
	mutex_unlock(&ublk_ctl_mutex);

	return ret;
}

static inline bool ublk_idr_freed(int id)
{
	void *ptr;

	spin_lock(&ublk_idr_lock);
	ptr = idr_find(&ublk_index_idr, id);
	spin_unlock(&ublk_idr_lock);

	return ptr == NULL;
}

static int ublk_ctrl_del_dev(int idx)
{
	struct ublk_device *ub;
	int ret;

	ret = mutex_lock_killable(&ublk_ctl_mutex);
	if (ret)
		return ret;

	ub = ublk_get_device_from_id(idx);
	if (ub) {
		ublk_remove(ub);
		ublk_put_device(ub);
		ret = 0;
	} else {
		ret = -ENODEV;
	}

	/*
	 * Wait until the idr entry is removed, so the index can be reused
	 * once the DEL_DEV command returns.
	 */
	if (!ret)
		wait_event(ublk_idr_wq, ublk_idr_freed(idx));
	mutex_unlock(&ublk_ctl_mutex);

	return ret;
}

static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
{
	pr_devel("%s: dev id %d flags %llx\n", __func__,
			info->dev_id, info->flags[0]);
	pr_devel("\t nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n",
			info->nr_hw_queues, info->queue_depth,
			info->block_size, info->dev_blocks);
}

static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd)
{
	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;

	pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
			__func__, cmd->cmd_op, header->dev_id, header->queue_id,
			header->data[0], header->addr, header->len);
}

static int ublk_ctrl_cmd_validate(struct io_uring_cmd *cmd,
		struct ublksrv_ctrl_dev_info *info)
{
	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
	u32 cmd_op = cmd->cmd_op;
	void __user *argp = (void __user *)(unsigned long)header->addr;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd_op) {
	case UBLK_CMD_GET_DEV_INFO:
		if (header->len < sizeof(*info) || !header->addr)
			return -EINVAL;
		break;
	case UBLK_CMD_ADD_DEV:
		if (header->len < sizeof(*info) || !header->addr)
			return -EINVAL;
		if (copy_from_user(info, argp, sizeof(*info)) != 0)
			return -EFAULT;
		ublk_dump_dev_info(info);
		if (header->dev_id != info->dev_id) {
			printk(KERN_WARNING "%s: cmd %x, dev id not match %u %u\n",
					__func__, cmd_op, header->dev_id,
					info->dev_id);
			return -EINVAL;
		}
		if (header->queue_id != (u16)-1) {
			printk(KERN_WARNING "%s: cmd %x queue_id is wrong %x\n",
					__func__, cmd_op, header->queue_id);
			return -EINVAL;
		}
		break;
	case UBLK_CMD_GET_QUEUE_AFFINITY:
		if ((header->len * BITS_PER_BYTE) < nr_cpu_ids)
			return -EINVAL;
		if (header->len & (sizeof(unsigned long)-1))
			return -EINVAL;
		if (!header->addr)
			return -EINVAL;
	}

	return 0;
}

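/*
 * Control commands arrive on the ublk-control misc device as io_uring
 * passthrough commands (SQE128 required) and need CAP_SYS_ADMIN; each is
 * checked in ublk_ctrl_cmd_validate() before being dispatched on cmd_op
 * below.
 */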
static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
		unsigned int issue_flags)
{
	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
	void __user *argp = (void __user *)(unsigned long)header->addr;
	struct ublksrv_ctrl_dev_info info;
	u32 cmd_op = cmd->cmd_op;
	struct ublk_device *ub;
	int ret = -EINVAL;

	ublk_ctrl_cmd_dump(cmd);

	if (!(issue_flags & IO_URING_F_SQE128))
		goto out;

	ret = ublk_ctrl_cmd_validate(cmd, &info);
	if (ret)
		goto out;

	ret = -ENODEV;
	switch (cmd_op) {
	case UBLK_CMD_START_DEV:
		ub = ublk_get_device_from_id(header->dev_id);
		if (ub) {
			ret = ublk_ctrl_start_dev(ub, cmd);
			ublk_put_device(ub);
		}
		break;
	case UBLK_CMD_STOP_DEV:
		ub = ublk_get_device_from_id(header->dev_id);
		if (ub) {
			ret = ublk_ctrl_stop_dev(ub);
			ublk_put_device(ub);
		}
		break;
	case UBLK_CMD_GET_DEV_INFO:
		ub = ublk_get_device_from_id(header->dev_id);
		if (ub) {
			if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
				ret = -EFAULT;
			else
				ret = 0;
			ublk_put_device(ub);
		}
		break;
	case UBLK_CMD_ADD_DEV:
		ret = ublk_ctrl_add_dev(&info, argp, header->dev_id);
		break;
	case UBLK_CMD_DEL_DEV:
		ret = ublk_ctrl_del_dev(header->dev_id);
		break;
	case UBLK_CMD_GET_QUEUE_AFFINITY:
		ret = ublk_ctrl_get_queue_affinity(cmd);
		break;
	default:
		break;
	}
 out:
	io_uring_cmd_done(cmd, ret, 0);
	pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
			__func__, ret, cmd->cmd_op, header->dev_id, header->queue_id);
	return -EIOCBQUEUED;
}

static const struct file_operations ublk_ctl_fops = {
	.open		= nonseekable_open,
	.uring_cmd      = ublk_ctrl_uring_cmd,
	.owner		= THIS_MODULE,
	.llseek		= noop_llseek,
};

static struct miscdevice ublk_misc = {
	.minor		= MISC_DYNAMIC_MINOR,
	.name		= "ublk-control",
	.fops		= &ublk_ctl_fops,
};

static int __init ublk_init(void)
{
	int ret;

	init_waitqueue_head(&ublk_idr_wq);

	ret = misc_register(&ublk_misc);
	if (ret)
		return ret;

	ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
	if (ret)
		goto unregister_mis;

	ublk_chr_class = class_create(THIS_MODULE, "ublk-char");
	if (IS_ERR(ublk_chr_class)) {
		ret = PTR_ERR(ublk_chr_class);
		goto free_chrdev_region;
	}
	return 0;

free_chrdev_region:
	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
unregister_mis:
	misc_deregister(&ublk_misc);
	return ret;
}

static void __exit ublk_exit(void)
{
	struct ublk_device *ub;
	int id;

	class_destroy(ublk_chr_class);

	misc_deregister(&ublk_misc);

	idr_for_each_entry(&ublk_index_idr, ub, id)
		ublk_remove(ub);

	idr_destroy(&ublk_index_idr);
	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
}

module_init(ublk_init);
module_exit(ublk_exit);

MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
MODULE_LICENSE("GPL");