drivers/block/ublk_drv.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Userspace block device - a block device whose IO is handled from userspace
4  *
5  * Makes full use of the io_uring passthrough command for communicating with
6  * the ublk userspace daemon (ublksrvd) for handling basic IO requests.
7  *
8  * Copyright 2022 Ming Lei <ming.lei@redhat.com>
9  *
10  * (part of code stolen from loop.c)
11  */
12 #include <linux/module.h>
13 #include <linux/moduleparam.h>
14 #include <linux/sched.h>
15 #include <linux/fs.h>
16 #include <linux/pagemap.h>
17 #include <linux/file.h>
18 #include <linux/stat.h>
19 #include <linux/errno.h>
20 #include <linux/major.h>
21 #include <linux/wait.h>
22 #include <linux/blkdev.h>
23 #include <linux/init.h>
24 #include <linux/swap.h>
25 #include <linux/slab.h>
26 #include <linux/compat.h>
27 #include <linux/mutex.h>
28 #include <linux/writeback.h>
29 #include <linux/completion.h>
30 #include <linux/highmem.h>
31 #include <linux/sysfs.h>
32 #include <linux/miscdevice.h>
33 #include <linux/falloc.h>
34 #include <linux/uio.h>
35 #include <linux/ioprio.h>
36 #include <linux/sched/mm.h>
37 #include <linux/uaccess.h>
38 #include <linux/cdev.h>
39 #include <linux/io_uring/cmd.h>
40 #include <linux/blk-mq.h>
41 #include <linux/delay.h>
42 #include <linux/mm.h>
43 #include <asm/page.h>
44 #include <linux/task_work.h>
45 #include <linux/namei.h>
46 #include <linux/kref.h>
47 #include <uapi/linux/ublk_cmd.h>
48
49 #define UBLK_MINORS             (1U << MINORBITS)
50
51 /* All UBLK_F_* flags have to be included in UBLK_F_ALL */
52 #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
53                 | UBLK_F_URING_CMD_COMP_IN_TASK \
54                 | UBLK_F_NEED_GET_DATA \
55                 | UBLK_F_USER_RECOVERY \
56                 | UBLK_F_USER_RECOVERY_REISSUE \
57                 | UBLK_F_UNPRIVILEGED_DEV \
58                 | UBLK_F_CMD_IOCTL_ENCODE \
59                 | UBLK_F_USER_COPY \
60                 | UBLK_F_ZONED)
61
62 /* All UBLK_PARAM_TYPE_* should be included here */
63 #define UBLK_PARAM_TYPE_ALL                                \
64         (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
65          UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED)
66
67 struct ublk_rq_data {
68         struct llist_node node;
69
70         struct kref ref;
71         __u64 sector;
72         __u32 operation;
73         __u32 nr_zones;
74 };
75
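/*
 * Per-command driver data stored in io_uring_cmd->pdu, see
 * ublk_get_uring_cmd_pdu()
 */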
76 struct ublk_uring_cmd_pdu {
77         struct ublk_queue *ubq;
78         u16 tag;
79 };
80
81 /*
82  * io command is active: the sqe cmd has been received, and its cqe isn't done
83  *
84  * If the flag is set, the io command is owned by the ublk driver, and is
85  * waiting for an incoming blk-mq request from the ublk block device.
86  *
87  * If the flag is cleared, the io command has been completed and is owned by
88  * the ublk server.
89  */
90 #define UBLK_IO_FLAG_ACTIVE     0x01
91
92 /*
93  * The IO command has been completed via cqe, is being handled by ublksrv,
94  * and is not committed yet
95  *
96  * Basically mutually exclusive with UBLK_IO_FLAG_ACTIVE, so it can be used
97  * for cross verification
98  */
99 #define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
100
101 /*
102  * IO command is aborted, so this flag is set in case of
103  * !UBLK_IO_FLAG_ACTIVE.
104  *
105  * After this flag is observed, any pending or new incoming request
106  * associated with this io command will be failed immediately
107  */
108 #define UBLK_IO_FLAG_ABORTED 0x04
109
110 /*
111  * UBLK_IO_FLAG_NEED_GET_DATA is set because the IO command needs to get the
112  * data buffer address from ublksrv.
113  *
114  * Then the bio data can be copied into this data buffer for a WRITE request
115  * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
116  */
117 #define UBLK_IO_FLAG_NEED_GET_DATA 0x08
118
119 /* atomic RW with ubq->cancel_lock */
120 #define UBLK_IO_FLAG_CANCELED   0x80000000
121
122 struct ublk_io {
123         /* userspace buffer address from io cmd */
124         __u64   addr;
125         unsigned int flags;
126         int res;
127
128         struct io_uring_cmd *cmd;
129 };
130
131 struct ublk_queue {
132         int q_id;
133         int q_depth;
134
135         unsigned long flags;
136         struct task_struct      *ubq_daemon;
137         char *io_cmd_buf;
138
139         struct llist_head       io_cmds;
140
141         unsigned long io_addr;  /* mapped vm address */
142         unsigned int max_io_sz;
143         bool force_abort;
144         bool timeout;
145         bool canceling;
146         unsigned short nr_io_ready;     /* how many ios are set up */
147         spinlock_t              cancel_lock;
148         struct ublk_device *dev;
149         struct ublk_io ios[];
150 };
151
152 struct ublk_device {
153         struct gendisk          *ub_disk;
154
155         char    *__queues;
156
157         unsigned int    queue_size;
158         struct ublksrv_ctrl_dev_info    dev_info;
159
160         struct blk_mq_tag_set   tag_set;
161
162         struct cdev             cdev;
163         struct device           cdev_dev;
164
165 #define UB_STATE_OPEN           0
166 #define UB_STATE_USED           1
167 #define UB_STATE_DELETED        2
168         unsigned long           state;
169         int                     ub_number;
170
171         struct mutex            mutex;
172
173         spinlock_t              lock;
174         struct mm_struct        *mm;
175
176         struct ublk_params      params;
177
178         struct completion       completion;
179         unsigned int            nr_queues_ready;
180         unsigned int            nr_privileged_daemon;
181
182         struct work_struct      quiesce_work;
183         struct work_struct      stop_work;
184 };
185
186 /* header of ublk_params */
187 struct ublk_params_header {
188         __u32   len;
189         __u32   types;
190 };
191
192 static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq);
193
194 static inline unsigned int ublk_req_build_flags(struct request *req);
195 static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
196                                                    int tag);
197 static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub)
198 {
199         return ub->dev_info.flags & UBLK_F_USER_COPY;
200 }
201
202 static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
203 {
204         return ub->dev_info.flags & UBLK_F_ZONED;
205 }
206
207 static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq)
208 {
209         return ubq->flags & UBLK_F_ZONED;
210 }
211
212 #ifdef CONFIG_BLK_DEV_ZONED
213
214 static int ublk_get_nr_zones(const struct ublk_device *ub)
215 {
216         const struct ublk_param_basic *p = &ub->params.basic;
217
218         /* Zone size is a power of 2 */
219         return p->dev_sectors >> ilog2(p->chunk_sectors);
220 }
221
222 static int ublk_revalidate_disk_zones(struct ublk_device *ub)
223 {
224         return blk_revalidate_disk_zones(ub->ub_disk);
225 }
226
227 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
228 {
229         const struct ublk_param_zoned *p = &ub->params.zoned;
230         int nr_zones;
231
232         if (!ublk_dev_is_zoned(ub))
233                 return -EINVAL;
234
235         if (!p->max_zone_append_sectors)
236                 return -EINVAL;
237
238         nr_zones = ublk_get_nr_zones(ub);
239
240         if (p->max_active_zones > nr_zones)
241                 return -EINVAL;
242
243         if (p->max_open_zones > nr_zones)
244                 return -EINVAL;
245
246         return 0;
247 }
248
249 static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
250 {
251         blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue);
252
253         ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
254 }
255
256 /* Based on virtblk_alloc_report_buffer */
257 static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
258                                       unsigned int nr_zones, size_t *buflen)
259 {
260         struct request_queue *q = ublk->ub_disk->queue;
261         size_t bufsize;
262         void *buf;
263
264         nr_zones = min_t(unsigned int, nr_zones,
265                          ublk->ub_disk->nr_zones);
266
267         bufsize = nr_zones * sizeof(struct blk_zone);
268         bufsize =
269                 min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
270
271         while (bufsize >= sizeof(struct blk_zone)) {
272                 buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
273                 if (buf) {
274                         *buflen = bufsize;
275                         return buf;
276                 }
277                 bufsize >>= 1;
278         }
279
280         *buflen = 0;
281         return NULL;
282 }
283
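/*
 * Report zones by sending internal REQ_OP_DRV_IN requests carrying
 * UBLK_IO_OP_REPORT_ZONES to the ublk server, splitting the report into
 * chunks that fit in the allocated zone buffer
 */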
284 static int ublk_report_zones(struct gendisk *disk, sector_t sector,
285                       unsigned int nr_zones, report_zones_cb cb, void *data)
286 {
287         struct ublk_device *ub = disk->private_data;
288         unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
289         unsigned int first_zone = sector >> ilog2(zone_size_sectors);
290         unsigned int done_zones = 0;
291         unsigned int max_zones_per_request;
292         int ret;
293         struct blk_zone *buffer;
294         size_t buffer_length;
295
296         nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
297                          nr_zones);
298
299         buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
300         if (!buffer)
301                 return -ENOMEM;
302
303         max_zones_per_request = buffer_length / sizeof(struct blk_zone);
304
305         while (done_zones < nr_zones) {
306                 unsigned int remaining_zones = nr_zones - done_zones;
307                 unsigned int zones_in_request =
308                         min_t(unsigned int, remaining_zones, max_zones_per_request);
309                 struct request *req;
310                 struct ublk_rq_data *pdu;
311                 blk_status_t status;
312
313                 memset(buffer, 0, buffer_length);
314
315                 req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
316                 if (IS_ERR(req)) {
317                         ret = PTR_ERR(req);
318                         goto out;
319                 }
320
321                 pdu = blk_mq_rq_to_pdu(req);
322                 pdu->operation = UBLK_IO_OP_REPORT_ZONES;
323                 pdu->sector = sector;
324                 pdu->nr_zones = zones_in_request;
325
326                 ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
327                                         GFP_KERNEL);
328                 if (ret) {
329                         blk_mq_free_request(req);
330                         goto out;
331                 }
332
333                 status = blk_execute_rq(req, 0);
334                 ret = blk_status_to_errno(status);
335                 blk_mq_free_request(req);
336                 if (ret)
337                         goto out;
338
339                 for (unsigned int i = 0; i < zones_in_request; i++) {
340                         struct blk_zone *zone = buffer + i;
341
342                         /* A zero length zone means no more zones in this response */
343                         if (!zone->len)
344                                 break;
345
346                         ret = cb(zone, i, data);
347                         if (ret)
348                                 goto out;
349
350                         done_zones++;
351                         sector += zone_size_sectors;
352
353                 }
354         }
355
356         ret = done_zones;
357
358 out:
359         kvfree(buffer);
360         return ret;
361 }
362
363 static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
364                                          struct request *req)
365 {
366         struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
367         struct ublk_io *io = &ubq->ios[req->tag];
368         struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req);
369         u32 ublk_op;
370
371         switch (req_op(req)) {
372         case REQ_OP_ZONE_OPEN:
373                 ublk_op = UBLK_IO_OP_ZONE_OPEN;
374                 break;
375         case REQ_OP_ZONE_CLOSE:
376                 ublk_op = UBLK_IO_OP_ZONE_CLOSE;
377                 break;
378         case REQ_OP_ZONE_FINISH:
379                 ublk_op = UBLK_IO_OP_ZONE_FINISH;
380                 break;
381         case REQ_OP_ZONE_RESET:
382                 ublk_op = UBLK_IO_OP_ZONE_RESET;
383                 break;
384         case REQ_OP_ZONE_APPEND:
385                 ublk_op = UBLK_IO_OP_ZONE_APPEND;
386                 break;
387         case REQ_OP_ZONE_RESET_ALL:
388                 ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
389                 break;
390         case REQ_OP_DRV_IN:
391                 ublk_op = pdu->operation;
392                 switch (ublk_op) {
393                 case UBLK_IO_OP_REPORT_ZONES:
394                         iod->op_flags = ublk_op | ublk_req_build_flags(req);
395                         iod->nr_zones = pdu->nr_zones;
396                         iod->start_sector = pdu->sector;
397                         return BLK_STS_OK;
398                 default:
399                         return BLK_STS_IOERR;
400                 }
401         case REQ_OP_DRV_OUT:
402                 /* We do not support drv_out */
403                 return BLK_STS_NOTSUPP;
404         default:
405                 return BLK_STS_IOERR;
406         }
407
408         iod->op_flags = ublk_op | ublk_req_build_flags(req);
409         iod->nr_sectors = blk_rq_sectors(req);
410         iod->start_sector = blk_rq_pos(req);
411         iod->addr = io->addr;
412
413         return BLK_STS_OK;
414 }
415
416 #else
417
418 #define ublk_report_zones (NULL)
419
420 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
421 {
422         return -EOPNOTSUPP;
423 }
424
425 static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
426 {
427 }
428
429 static int ublk_revalidate_disk_zones(struct ublk_device *ub)
430 {
431         return 0;
432 }
433
434 static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
435                                          struct request *req)
436 {
437         return BLK_STS_NOTSUPP;
438 }
439
440 #endif
441
442 static inline void __ublk_complete_rq(struct request *req);
443 static void ublk_complete_rq(struct kref *ref);
444
445 static dev_t ublk_chr_devt;
446 static const struct class ublk_chr_class = {
447         .name = "ublk-char",
448 };
449
450 static DEFINE_IDR(ublk_index_idr);
451 static DEFINE_SPINLOCK(ublk_idr_lock);
452 static wait_queue_head_t ublk_idr_wq;   /* wait until one idr is freed */
453
454 static DEFINE_MUTEX(ublk_ctl_mutex);
455
456 /*
457  * Maximum number of ublk devices that can be added
458  *
459  * This could be extended to a per-user limit in the future, or even be
460  * controlled by cgroup.
461  */
462 #define UBLK_MAX_UBLKS UBLK_MINORS
463 static unsigned int ublks_max = 64;
464 static unsigned int ublks_added;        /* protected by ublk_ctl_mutex */
465
466 static struct miscdevice ublk_misc;
467
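/*
 * The pread()/pwrite() position on the ublk char device encodes the target
 * io: starting from UBLKSRV_IO_BUF_OFFSET, the queue id, tag and offset
 * within the io buffer are packed into the position and decoded by the
 * helpers below
 */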
468 static inline unsigned ublk_pos_to_hwq(loff_t pos)
469 {
470         return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) &
471                 UBLK_QID_BITS_MASK;
472 }
473
474 static inline unsigned ublk_pos_to_buf_off(loff_t pos)
475 {
476         return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK;
477 }
478
479 static inline unsigned ublk_pos_to_tag(loff_t pos)
480 {
481         return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) &
482                 UBLK_TAG_BITS_MASK;
483 }
484
485 static void ublk_dev_param_basic_apply(struct ublk_device *ub)
486 {
487         struct request_queue *q = ub->ub_disk->queue;
488         const struct ublk_param_basic *p = &ub->params.basic;
489
490         blk_queue_write_cache(q, p->attrs & UBLK_ATTR_VOLATILE_CACHE,
491                         p->attrs & UBLK_ATTR_FUA);
492         if (p->attrs & UBLK_ATTR_ROTATIONAL)
493                 blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
494         else
495                 blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
496
497         if (p->attrs & UBLK_ATTR_READ_ONLY)
498                 set_disk_ro(ub->ub_disk, true);
499
500         set_capacity(ub->ub_disk, p->dev_sectors);
501 }
502
503 static int ublk_validate_params(const struct ublk_device *ub)
504 {
505         /* basic param is the only one which must be set */
506         if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
507                 const struct ublk_param_basic *p = &ub->params.basic;
508
509                 if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9)
510                         return -EINVAL;
511
512                 if (p->logical_bs_shift > p->physical_bs_shift)
513                         return -EINVAL;
514
515                 if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
516                         return -EINVAL;
517
518                 if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
519                         return -EINVAL;
520         } else
521                 return -EINVAL;
522
523         if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
524                 const struct ublk_param_discard *p = &ub->params.discard;
525
526                 /* So far, only single-segment discard is supported */
527                 if (p->max_discard_sectors && p->max_discard_segments != 1)
528                         return -EINVAL;
529
530                 if (!p->discard_granularity)
531                         return -EINVAL;
532         }
533
534         /* dev_t is read-only */
535         if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
536                 return -EINVAL;
537
538         if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
539                 return ublk_dev_param_zoned_validate(ub);
540         else if (ublk_dev_is_zoned(ub))
541                 return -EINVAL;
542
543         return 0;
544 }
545
546 static void ublk_apply_params(struct ublk_device *ub)
547 {
548         ublk_dev_param_basic_apply(ub);
549
550         if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
551                 ublk_dev_param_zoned_apply(ub);
552 }
553
554 static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
555 {
556         return ubq->flags & UBLK_F_USER_COPY;
557 }
558
559 static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
560 {
561         /*
562          * read()/write() is involved in user copy, so a request reference
563          * has to be grabbed
564          */
565         return ublk_support_user_copy(ubq);
566 }
567
568 static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
569                 struct request *req)
570 {
571         if (ublk_need_req_ref(ubq)) {
572                 struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
573
574                 kref_init(&data->ref);
575         }
576 }
577
578 static inline bool ublk_get_req_ref(const struct ublk_queue *ubq,
579                 struct request *req)
580 {
581         if (ublk_need_req_ref(ubq)) {
582                 struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
583
584                 return kref_get_unless_zero(&data->ref);
585         }
586
587         return true;
588 }
589
590 static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
591                 struct request *req)
592 {
593         if (ublk_need_req_ref(ubq)) {
594                 struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
595
596                 kref_put(&data->ref, ublk_complete_rq);
597         } else {
598                 __ublk_complete_rq(req);
599         }
600 }
601
602 static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
603 {
604         return ubq->flags & UBLK_F_NEED_GET_DATA;
605 }
606
607 /* Called in slow path only, keep it noinline for tracing purposes */
608 static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
609 {
610         if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
611                 return ub;
612         return NULL;
613 }
614
615 /* Called in slow path only, keep it noinline for tracing purposes */
616 static noinline void ublk_put_device(struct ublk_device *ub)
617 {
618         put_device(&ub->cdev_dev);
619 }
620
621 static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
622                 int qid)
623 {
624        return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
625 }
626
627 static inline bool ublk_rq_has_data(const struct request *rq)
628 {
629         return bio_has_data(rq->bio);
630 }
631
632 static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
633                 int tag)
634 {
635         return (struct ublksrv_io_desc *)
636                 &(ubq->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]);
637 }
638
639 static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
640 {
641         return ublk_get_queue(ub, q_id)->io_cmd_buf;
642 }
643
644 static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id)
645 {
646         struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
647
648         return round_up(ubq->q_depth * sizeof(struct ublksrv_io_desc),
649                         PAGE_SIZE);
650 }
651
652 static inline bool ublk_queue_can_use_recovery_reissue(
653                 struct ublk_queue *ubq)
654 {
655         return (ubq->flags & UBLK_F_USER_RECOVERY) &&
656                         (ubq->flags & UBLK_F_USER_RECOVERY_REISSUE);
657 }
658
659 static inline bool ublk_queue_can_use_recovery(
660                 struct ublk_queue *ubq)
661 {
662         return ubq->flags & UBLK_F_USER_RECOVERY;
663 }
664
665 static inline bool ublk_can_use_recovery(struct ublk_device *ub)
666 {
667         return ub->dev_info.flags & UBLK_F_USER_RECOVERY;
668 }
669
670 static void ublk_free_disk(struct gendisk *disk)
671 {
672         struct ublk_device *ub = disk->private_data;
673
674         clear_bit(UB_STATE_USED, &ub->state);
675         ublk_put_device(ub);
676 }
677
678 static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
679                 unsigned int *owner_gid)
680 {
681         kuid_t uid;
682         kgid_t gid;
683
684         current_uid_gid(&uid, &gid);
685
686         *owner_uid = from_kuid(&init_user_ns, uid);
687         *owner_gid = from_kgid(&init_user_ns, gid);
688 }
689
690 static int ublk_open(struct gendisk *disk, blk_mode_t mode)
691 {
692         struct ublk_device *ub = disk->private_data;
693
694         if (capable(CAP_SYS_ADMIN))
695                 return 0;
696
697         /*
698          * If it is an unprivileged device, only the owner can open
699          * the disk. Otherwise it could be a trap set up by a malicious
700          * user who deliberately grants this disk's privileges to other
701          * users.
702          *
703          * This is also reasonable given that anyone can create an
704          * unprivileged device without needing anyone else's grant.
705          */
706         if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) {
707                 unsigned int curr_uid, curr_gid;
708
709                 ublk_store_owner_uid_gid(&curr_uid, &curr_gid);
710
711                 if (curr_uid != ub->dev_info.owner_uid || curr_gid !=
712                                 ub->dev_info.owner_gid)
713                         return -EPERM;
714         }
715
716         return 0;
717 }
718
719 static const struct block_device_operations ub_fops = {
720         .owner =        THIS_MODULE,
721         .open =         ublk_open,
722         .free_disk =    ublk_free_disk,
723         .report_zones = ublk_report_zones,
724 };
725
726 #define UBLK_MAX_PIN_PAGES      32
727
728 struct ublk_io_iter {
729         struct page *pages[UBLK_MAX_PIN_PAGES];
730         struct bio *bio;
731         struct bvec_iter iter;
732 };
733
734 /* copy 'total' bytes between the pinned pages and the request's bio vectors */
735 static void ublk_copy_io_pages(struct ublk_io_iter *data,
736                 size_t total, size_t pg_off, int dir)
737 {
738         unsigned done = 0;
739         unsigned pg_idx = 0;
740
741         while (done < total) {
742                 struct bio_vec bv = bio_iter_iovec(data->bio, data->iter);
743                 unsigned int bytes = min3(bv.bv_len, (unsigned)total - done,
744                                 (unsigned)(PAGE_SIZE - pg_off));
745                 void *bv_buf = bvec_kmap_local(&bv);
746                 void *pg_buf = kmap_local_page(data->pages[pg_idx]);
747
748                 if (dir == ITER_DEST)
749                         memcpy(pg_buf + pg_off, bv_buf, bytes);
750                 else
751                         memcpy(bv_buf, pg_buf + pg_off, bytes);
752
753                 kunmap_local(pg_buf);
754                 kunmap_local(bv_buf);
755
756                 /* advance page array */
757                 pg_off += bytes;
758                 if (pg_off == PAGE_SIZE) {
759                         pg_idx += 1;
760                         pg_off = 0;
761                 }
762
763                 done += bytes;
764
765                 /* advance bio */
766                 bio_advance_iter_single(data->bio, &data->iter, bytes);
767                 if (!data->iter.bi_size) {
768                         data->bio = data->bio->bi_next;
769                         if (data->bio == NULL)
770                                 break;
771                         data->iter = data->bio->bi_iter;
772                 }
773         }
774 }
775
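/* position 'iter' at 'offset' bytes into the request's bio chain */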
776 static bool ublk_advance_io_iter(const struct request *req,
777                 struct ublk_io_iter *iter, unsigned int offset)
778 {
779         struct bio *bio = req->bio;
780
781         for_each_bio(bio) {
782                 if (bio->bi_iter.bi_size > offset) {
783                         iter->bio = bio;
784                         iter->iter = bio->bi_iter;
785                         bio_advance_iter(iter->bio, &iter->iter, offset);
786                         return true;
787                 }
788                 offset -= bio->bi_iter.bi_size;
789         }
790         return false;
791 }
792
793 /*
794  * Copy data between the request pages and the io_iter; 'offset' is the
795  * linear byte offset into the request at which the copy starts.
796  */
797 static size_t ublk_copy_user_pages(const struct request *req,
798                 unsigned offset, struct iov_iter *uiter, int dir)
799 {
800         struct ublk_io_iter iter;
801         size_t done = 0;
802
803         if (!ublk_advance_io_iter(req, &iter, offset))
804                 return 0;
805
806         while (iov_iter_count(uiter) && iter.bio) {
807                 unsigned nr_pages;
808                 ssize_t len;
809                 size_t off;
810                 int i;
811
812                 len = iov_iter_get_pages2(uiter, iter.pages,
813                                 iov_iter_count(uiter),
814                                 UBLK_MAX_PIN_PAGES, &off);
815                 if (len <= 0)
816                         return done;
817
818                 ublk_copy_io_pages(&iter, len, off, dir);
819                 nr_pages = DIV_ROUND_UP(len + off, PAGE_SIZE);
820                 for (i = 0; i < nr_pages; i++) {
821                         if (dir == ITER_DEST)
822                                 set_page_dirty(iter.pages[i]);
823                         put_page(iter.pages[i]);
824                 }
825                 done += len;
826         }
827
828         return done;
829 }
830
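/*
 * WRITE data has to be copied into the ublk server's buffer before the io
 * is handled, while READ (and REQ_OP_DRV_IN) data is copied back after the
 * server completes the io
 */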
831 static inline bool ublk_need_map_req(const struct request *req)
832 {
833         return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;
834 }
835
836 static inline bool ublk_need_unmap_req(const struct request *req)
837 {
838         return ublk_rq_has_data(req) &&
839                (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
840 }
841
842 static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
843                 struct ublk_io *io)
844 {
845         const unsigned int rq_bytes = blk_rq_bytes(req);
846
847         if (ublk_support_user_copy(ubq))
848                 return rq_bytes;
849
850         /*
851          * no zero copy: copying of WRITE request data is delayed until we
852          * run in the ublksrv context, and the big benefit is that pinning
853          * pages in the current context is pretty fast, see ublk_copy_user_pages
854          */
855         if (ublk_need_map_req(req)) {
856                 struct iov_iter iter;
857                 const int dir = ITER_DEST;
858
859                 import_ubuf(dir, u64_to_user_ptr(io->addr), rq_bytes, &iter);
860                 return ublk_copy_user_pages(req, 0, &iter, dir);
861         }
862         return rq_bytes;
863 }
864
865 static int ublk_unmap_io(const struct ublk_queue *ubq,
866                 const struct request *req,
867                 struct ublk_io *io)
868 {
869         const unsigned int rq_bytes = blk_rq_bytes(req);
870
871         if (ublk_support_user_copy(ubq))
872                 return rq_bytes;
873
874         if (ublk_need_unmap_req(req)) {
875                 struct iov_iter iter;
876                 const int dir = ITER_SOURCE;
877
878                 WARN_ON_ONCE(io->res > rq_bytes);
879
880                 import_ubuf(dir, u64_to_user_ptr(io->addr), io->res, &iter);
881                 return ublk_copy_user_pages(req, 0, &iter, dir);
882         }
883         return rq_bytes;
884 }
885
886 static inline unsigned int ublk_req_build_flags(struct request *req)
887 {
888         unsigned flags = 0;
889
890         if (req->cmd_flags & REQ_FAILFAST_DEV)
891                 flags |= UBLK_IO_F_FAILFAST_DEV;
892
893         if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
894                 flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
895
896         if (req->cmd_flags & REQ_FAILFAST_DRIVER)
897                 flags |= UBLK_IO_F_FAILFAST_DRIVER;
898
899         if (req->cmd_flags & REQ_META)
900                 flags |= UBLK_IO_F_META;
901
902         if (req->cmd_flags & REQ_FUA)
903                 flags |= UBLK_IO_F_FUA;
904
905         if (req->cmd_flags & REQ_NOUNMAP)
906                 flags |= UBLK_IO_F_NOUNMAP;
907
908         if (req->cmd_flags & REQ_SWAP)
909                 flags |= UBLK_IO_F_SWAP;
910
911         return flags;
912 }
913
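/* build the io descriptor that describes this request to the ublk server */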
914 static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
915 {
916         struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
917         struct ublk_io *io = &ubq->ios[req->tag];
918         enum req_op op = req_op(req);
919         u32 ublk_op;
920
921         if (!ublk_queue_is_zoned(ubq) &&
922             (op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND))
923                 return BLK_STS_IOERR;
924
925         switch (req_op(req)) {
926         case REQ_OP_READ:
927                 ublk_op = UBLK_IO_OP_READ;
928                 break;
929         case REQ_OP_WRITE:
930                 ublk_op = UBLK_IO_OP_WRITE;
931                 break;
932         case REQ_OP_FLUSH:
933                 ublk_op = UBLK_IO_OP_FLUSH;
934                 break;
935         case REQ_OP_DISCARD:
936                 ublk_op = UBLK_IO_OP_DISCARD;
937                 break;
938         case REQ_OP_WRITE_ZEROES:
939                 ublk_op = UBLK_IO_OP_WRITE_ZEROES;
940                 break;
941         default:
942                 if (ublk_queue_is_zoned(ubq))
943                         return ublk_setup_iod_zoned(ubq, req);
944                 return BLK_STS_IOERR;
945         }
946
947         /* need to translate since in-kernel op/flag values may change */
948         iod->op_flags = ublk_op | ublk_req_build_flags(req);
949         iod->nr_sectors = blk_rq_sectors(req);
950         iod->start_sector = blk_rq_pos(req);
951         iod->addr = io->addr;
952
953         return BLK_STS_OK;
954 }
955
956 static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
957                 struct io_uring_cmd *ioucmd)
958 {
959         return (struct ublk_uring_cmd_pdu *)&ioucmd->pdu;
960 }
961
962 static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq)
963 {
964         return ubq->ubq_daemon->flags & PF_EXITING;
965 }
966
967 /* todo: handle partial completion */
968 static inline void __ublk_complete_rq(struct request *req)
969 {
970         struct ublk_queue *ubq = req->mq_hctx->driver_data;
971         struct ublk_io *io = &ubq->ios[req->tag];
972         unsigned int unmapped_bytes;
973         blk_status_t res = BLK_STS_OK;
974
975         /* called from ublk_abort_queue() code path */
976         if (io->flags & UBLK_IO_FLAG_ABORTED) {
977                 res = BLK_STS_IOERR;
978                 goto exit;
979         }
980
981         /* fail the READ IO if nothing has been read */
982         if (!io->res && req_op(req) == REQ_OP_READ)
983                 io->res = -EIO;
984
985         if (io->res < 0) {
986                 res = errno_to_blk_status(io->res);
987                 goto exit;
988         }
989
990         /*
991          * FLUSH, DISCARD or WRITE_ZEROES usually won't return a byte count,
992          * so end them directly.
993          *
994          * None of them needs unmapping.
995          */
996         if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
997             req_op(req) != REQ_OP_DRV_IN)
998                 goto exit;
999
1000         /* for a READ request, copy the data at iod->addr into the rq buffers */
1001         unmapped_bytes = ublk_unmap_io(ubq, req, io);
1002
1003         /*
1004          * Extremely unlikely, since the data was filled in just before.
1005          *
1006          * Simply truncate io->res for this unlikely case.
1007          */
1008         if (unlikely(unmapped_bytes < io->res))
1009                 io->res = unmapped_bytes;
1010
1011         if (blk_update_request(req, BLK_STS_OK, io->res))
1012                 blk_mq_requeue_request(req, true);
1013         else
1014                 __blk_mq_end_request(req, BLK_STS_OK);
1015
1016         return;
1017 exit:
1018         blk_mq_end_request(req, res);
1019 }
1020
1021 static void ublk_complete_rq(struct kref *ref)
1022 {
1023         struct ublk_rq_data *data = container_of(ref, struct ublk_rq_data,
1024                         ref);
1025         struct request *req = blk_mq_rq_from_pdu(data);
1026
1027         __ublk_complete_rq(req);
1028 }
1029
1030 /*
1031  * Since __ublk_rq_task_work always fails requests immediately during
1032  * exiting, __ublk_fail_req() is only called from abort context during
1033  * exiting. So lock is unnecessary.
1034  *
1035  * Also aborting may not be started yet, keep in mind that one failed
1036  * request may be issued by block layer again.
1037  */
1038 static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
1039                 struct request *req)
1040 {
1041         WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
1042
1043         if (ublk_queue_can_use_recovery_reissue(ubq))
1044                 blk_mq_requeue_request(req, false);
1045         else
1046                 ublk_put_req_ref(ubq, req);
1047 }
1048
1049 static void ubq_complete_io_cmd(struct ublk_io *io, int res,
1050                                 unsigned issue_flags)
1051 {
1052         /* mark this cmd owned by ublksrv */
1053         io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
1054
1055         /*
1056          * clear ACTIVE since we are done with this sqe/cmd slot
1057          * A new io cmd can only be accepted while the slot is not active.
1058          */
1059         io->flags &= ~UBLK_IO_FLAG_ACTIVE;
1060
1061         /* tell ublksrv one io request is coming */
1062         io_uring_cmd_done(io->cmd, res, 0, issue_flags);
1063 }
1064
1065 #define UBLK_REQUEUE_DELAY_MS   3
1066
1067 static inline void __ublk_abort_rq(struct ublk_queue *ubq,
1068                 struct request *rq)
1069 {
1070         /* We cannot process this rq: requeue it with recovery enabled, otherwise fail it. */
1071         if (ublk_queue_can_use_recovery(ubq))
1072                 blk_mq_requeue_request(rq, false);
1073         else
1074                 blk_mq_end_request(rq, BLK_STS_IOERR);
1075 }
1076
1077 static inline void __ublk_rq_task_work(struct request *req,
1078                                        unsigned issue_flags)
1079 {
1080         struct ublk_queue *ubq = req->mq_hctx->driver_data;
1081         int tag = req->tag;
1082         struct ublk_io *io = &ubq->ios[tag];
1083         unsigned int mapped_bytes;
1084
1085         pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n",
1086                         __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
1087                         ublk_get_iod(ubq, req->tag)->addr);
1088
1089         /*
1090          * Task is exiting if either:
1091          *
1092          * (1) current != ubq_daemon.
1093          * io_uring_cmd_complete_in_task() tries to run task_work
1094          * in a workqueue if ubq_daemon(cmd's task) is PF_EXITING.
1095          *
1096          * (2) current->flags & PF_EXITING.
1097          */
1098         if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) {
1099                 __ublk_abort_rq(ubq, req);
1100                 return;
1101         }
1102
1103         if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
1104                 /*
1105                  * We have not handled UBLK_IO_NEED_GET_DATA command yet,
1106                  * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
1107                  * and notify it.
1108                  */
1109                 if (!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) {
1110                         io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
1111                         pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n",
1112                                         __func__, io->cmd->cmd_op, ubq->q_id,
1113                                         req->tag, io->flags);
1114                         ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA, issue_flags);
1115                         return;
1116                 }
1117                 /*
1118                  * We have handled UBLK_IO_NEED_GET_DATA command,
1119                  * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
1120                  * do the copy work.
1121                  */
1122                 io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
1123                 /* update iod->addr because ublksrv may have passed a new io buffer */
1124                 ublk_get_iod(ubq, req->tag)->addr = io->addr;
1125                 pr_devel("%s: update iod->addr: op %d, qid %d tag %d io_flags %x addr %llx\n",
1126                                 __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
1127                                 ublk_get_iod(ubq, req->tag)->addr);
1128         }
1129
1130         mapped_bytes = ublk_map_io(ubq, req, io);
1131
1132         /* partially mapped, update io descriptor */
1133         if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
1134                 /*
1135                  * Nothing mapped, retry until we succeed.
1136                  *
1137                  * We may never succeed in mapping any bytes here because
1138                  * of OOM. TODO: reserve a buffer with a single pinned page
1139                  * to guarantee forward progress.
1140                  */
1141                 if (unlikely(!mapped_bytes)) {
1142                         blk_mq_requeue_request(req, false);
1143                         blk_mq_delay_kick_requeue_list(req->q,
1144                                         UBLK_REQUEUE_DELAY_MS);
1145                         return;
1146                 }
1147
1148                 ublk_get_iod(ubq, req->tag)->nr_sectors =
1149                         mapped_bytes >> 9;
1150         }
1151
1152         ublk_init_req_ref(ubq, req);
1153         ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags);
1154 }
1155
1156 static inline void ublk_forward_io_cmds(struct ublk_queue *ubq,
1157                                         unsigned issue_flags)
1158 {
1159         struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds);
1160         struct ublk_rq_data *data, *tmp;
1161
1162         io_cmds = llist_reverse_order(io_cmds);
1163         llist_for_each_entry_safe(data, tmp, io_cmds, node)
1164                 __ublk_rq_task_work(blk_mq_rq_from_pdu(data), issue_flags);
1165 }
1166
1167 static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd, unsigned issue_flags)
1168 {
1169         struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1170         struct ublk_queue *ubq = pdu->ubq;
1171
1172         ublk_forward_io_cmds(ubq, issue_flags);
1173 }
1174
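/*
 * Queue the request on the per-queue llist; only the caller that finds the
 * list empty schedules task work in the daemon context, which then forwards
 * the whole batch via ublk_forward_io_cmds()
 */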
1175 static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
1176 {
1177         struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
1178
1179         if (llist_add(&data->node, &ubq->io_cmds)) {
1180                 struct ublk_io *io = &ubq->ios[rq->tag];
1181
1182                 io_uring_cmd_complete_in_task(io->cmd, ublk_rq_task_work_cb);
1183         }
1184 }
1185
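/*
 * Request timeout handler: an unprivileged device gets its daemon killed on
 * the first timeout; otherwise, if the daemon is dying and every io command
 * is in flight, abort the queue so forward progress can be made
 */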
1186 static enum blk_eh_timer_return ublk_timeout(struct request *rq)
1187 {
1188         struct ublk_queue *ubq = rq->mq_hctx->driver_data;
1189         unsigned int nr_inflight = 0;
1190         int i;
1191
1192         if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) {
1193                 if (!ubq->timeout) {
1194                         send_sig(SIGKILL, ubq->ubq_daemon, 0);
1195                         ubq->timeout = true;
1196                 }
1197
1198                 return BLK_EH_DONE;
1199         }
1200
1201         if (!ubq_daemon_is_dying(ubq))
1202                 return BLK_EH_RESET_TIMER;
1203
1204         for (i = 0; i < ubq->q_depth; i++) {
1205                 struct ublk_io *io = &ubq->ios[i];
1206
1207                 if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
1208                         nr_inflight++;
1209         }
1210
1211         /* cancelable uring_cmd can't help us if all commands are in-flight */
1212         if (nr_inflight == ubq->q_depth) {
1213                 struct ublk_device *ub = ubq->dev;
1214
1215                 if (ublk_abort_requests(ub, ubq)) {
1216                         if (ublk_can_use_recovery(ub))
1217                                 schedule_work(&ub->quiesce_work);
1218                         else
1219                                 schedule_work(&ub->stop_work);
1220                 }
1221                 return BLK_EH_DONE;
1222         }
1223
1224         return BLK_EH_RESET_TIMER;
1225 }
1226
1227 static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
1228                 const struct blk_mq_queue_data *bd)
1229 {
1230         struct ublk_queue *ubq = hctx->driver_data;
1231         struct request *rq = bd->rq;
1232         blk_status_t res;
1233
1234         /* fill the iod slot in the io cmd buffer */
1235         res = ublk_setup_iod(ubq, rq);
1236         if (unlikely(res != BLK_STS_OK))
1237                 return BLK_STS_IOERR;
1238
1239         /* With recovery feature enabled, force_abort is set in
1240          * ublk_stop_dev() before calling del_gendisk(). We have to
1241          * abort all requeued and new rqs here to let del_gendisk()
1242          * move on. Besides, we must not call io_uring_cmd_complete_in_task()
1243          * here, to avoid a UAF on the io_uring ctx.
1244          *
1245          * Note: force_abort is guaranteed to be seen because it is set
1246          * before the request queue is unquiesced.
1247          */
1248         if (ublk_queue_can_use_recovery(ubq) && unlikely(ubq->force_abort))
1249                 return BLK_STS_IOERR;
1250
1251         if (unlikely(ubq->canceling)) {
1252                 __ublk_abort_rq(ubq, rq);
1253                 return BLK_STS_OK;
1254         }
1255
1256         blk_mq_start_request(bd->rq);
1257         ublk_queue_cmd(ubq, rq);
1258
1259         return BLK_STS_OK;
1260 }
1261
1262 static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
1263                 unsigned int hctx_idx)
1264 {
1265         struct ublk_device *ub = driver_data;
1266         struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
1267
1268         hctx->driver_data = ubq;
1269         return 0;
1270 }
1271
1272 static const struct blk_mq_ops ublk_mq_ops = {
1273         .queue_rq       = ublk_queue_rq,
1274         .init_hctx      = ublk_init_hctx,
1275         .timeout        = ublk_timeout,
1276 };
1277
1278 static int ublk_ch_open(struct inode *inode, struct file *filp)
1279 {
1280         struct ublk_device *ub = container_of(inode->i_cdev,
1281                         struct ublk_device, cdev);
1282
1283         if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
1284                 return -EBUSY;
1285         filp->private_data = ub;
1286         return 0;
1287 }
1288
1289 static int ublk_ch_release(struct inode *inode, struct file *filp)
1290 {
1291         struct ublk_device *ub = filp->private_data;
1292
1293         clear_bit(UB_STATE_OPEN, &ub->state);
1294         return 0;
1295 }
1296
1297 /* map pre-allocated per-queue cmd buffer to ublksrv daemon */
1298 static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
1299 {
1300         struct ublk_device *ub = filp->private_data;
1301         size_t sz = vma->vm_end - vma->vm_start;
1302         unsigned max_sz = UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc);
1303         unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
1304         int q_id, ret = 0;
1305
1306         spin_lock(&ub->lock);
1307         if (!ub->mm)
1308                 ub->mm = current->mm;
1309         if (current->mm != ub->mm)
1310                 ret = -EINVAL;
1311         spin_unlock(&ub->lock);
1312
1313         if (ret)
1314                 return ret;
1315
1316         if (vma->vm_flags & VM_WRITE)
1317                 return -EPERM;
1318
1319         end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
1320         if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
1321                 return -EINVAL;
1322
1323         q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
1324         pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
1325                         __func__, q_id, current->pid, vma->vm_start,
1326                         phys_off, (unsigned long)sz);
1327
1328         if (sz != ublk_queue_cmd_buf_size(ub, q_id))
1329                 return -EINVAL;
1330
1331         pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
1332         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
1333 }
1334
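/*
 * Record the result committed by the ublk server for this io and drop the
 * request reference, which completes the corresponding blk-mq request
 */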
1335 static void ublk_commit_completion(struct ublk_device *ub,
1336                 const struct ublksrv_io_cmd *ub_cmd)
1337 {
1338         u32 qid = ub_cmd->q_id, tag = ub_cmd->tag;
1339         struct ublk_queue *ubq = ublk_get_queue(ub, qid);
1340         struct ublk_io *io = &ubq->ios[tag];
1341         struct request *req;
1342
1343         /* now this cmd slot is owned by the ublk driver */
1344         io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
1345         io->res = ub_cmd->result;
1346
1347         /* find the io request and complete it */
1348         req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
1349         if (WARN_ON_ONCE(unlikely(!req)))
1350                 return;
1351
1352         if (req_op(req) == REQ_OP_ZONE_APPEND)
1353                 req->__sector = ub_cmd->zone_append_lba;
1354
1355         if (likely(!blk_should_fake_timeout(req->q)))
1356                 ublk_put_req_ref(ubq, req);
1357 }
1358
1359 /*
1360  * Called from the ubq_daemon context via the cancel fn, while the ublk
1361  * blk-mq queue is quiesced, so we run exclusively with respect to both
1362  * blk-mq and the ubq_daemon context; everything is serialized.
1363  */
1364 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
1365 {
1366         int i;
1367
1368         for (i = 0; i < ubq->q_depth; i++) {
1369                 struct ublk_io *io = &ubq->ios[i];
1370
1371                 if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
1372                         struct request *rq;
1373
1374                         /*
1375                          * Either we fail the request here or
1376                          * __ublk_rq_task_work() will do it
1377                          */
1378                         rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
1379                         if (rq && blk_mq_request_started(rq)) {
1380                                 io->flags |= UBLK_IO_FLAG_ABORTED;
1381                                 __ublk_fail_req(ubq, io, rq);
1382                         }
1383                 }
1384         }
1385 }
1386
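/*
 * Mark the queue as canceling and, with the blk-mq queue quiesced, fail any
 * started request whose io command is owned by the server. Returns false if
 * canceling was already in progress or the disk is gone.
 */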
1387 static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq)
1388 {
1389         struct gendisk *disk;
1390
1391         spin_lock(&ubq->cancel_lock);
1392         if (ubq->canceling) {
1393                 spin_unlock(&ubq->cancel_lock);
1394                 return false;
1395         }
1396         ubq->canceling = true;
1397         spin_unlock(&ubq->cancel_lock);
1398
1399         spin_lock(&ub->lock);
1400         disk = ub->ub_disk;
1401         if (disk)
1402                 get_device(disk_to_dev(disk));
1403         spin_unlock(&ub->lock);
1404
1405         /* The disk is already dead */
1406         if (!disk)
1407                 return false;
1408
1409         /* Now we are serialized with ublk_queue_rq() */
1410         blk_mq_quiesce_queue(disk->queue);
1411         /* aborting the queue is for making forward progress */
1412         ublk_abort_queue(ub, ubq);
1413         blk_mq_unquiesce_queue(disk->queue);
1414         put_device(disk_to_dev(disk));
1415
1416         return true;
1417 }
1418
1419 static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io,
1420                 unsigned int issue_flags)
1421 {
1422         bool done;
1423
1424         if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
1425                 return;
1426
1427         spin_lock(&ubq->cancel_lock);
1428         done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
1429         if (!done)
1430                 io->flags |= UBLK_IO_FLAG_CANCELED;
1431         spin_unlock(&ubq->cancel_lock);
1432
1433         if (!done)
1434                 io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0, issue_flags);
1435 }
1436
1437 /*
1438  * The ublk char device won't be closed when the cancel fn is called, so
1439  * both the ublk device and the queue are guaranteed to be live
1440  */
1441 static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
1442                 unsigned int issue_flags)
1443 {
1444         struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1445         struct ublk_queue *ubq = pdu->ubq;
1446         struct task_struct *task;
1447         struct ublk_device *ub;
1448         bool need_schedule;
1449         struct ublk_io *io;
1450
1451         if (WARN_ON_ONCE(!ubq))
1452                 return;
1453
1454         if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth))
1455                 return;
1456
1457         task = io_uring_cmd_get_task(cmd);
1458         if (WARN_ON_ONCE(task && task != ubq->ubq_daemon))
1459                 return;
1460
1461         ub = ubq->dev;
1462         need_schedule = ublk_abort_requests(ub, ubq);
1463
1464         io = &ubq->ios[pdu->tag];
1465         WARN_ON_ONCE(io->cmd != cmd);
1466         ublk_cancel_cmd(ubq, io, issue_flags);
1467
1468         if (need_schedule) {
1469                 if (ublk_can_use_recovery(ub))
1470                         schedule_work(&ub->quiesce_work);
1471                 else
1472                         schedule_work(&ub->stop_work);
1473         }
1474 }
1475
1476 static inline bool ublk_queue_ready(struct ublk_queue *ubq)
1477 {
1478         return ubq->nr_io_ready == ubq->q_depth;
1479 }
1480
1481 static void ublk_cancel_queue(struct ublk_queue *ubq)
1482 {
1483         int i;
1484
1485         for (i = 0; i < ubq->q_depth; i++)
1486                 ublk_cancel_cmd(ubq, &ubq->ios[i], IO_URING_F_UNLOCKED);
1487 }
1488
1489 /* Cancel all pending commands, must be called after del_gendisk() returns */
1490 static void ublk_cancel_dev(struct ublk_device *ub)
1491 {
1492         int i;
1493
1494         for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1495                 ublk_cancel_queue(ublk_get_queue(ub, i));
1496 }
1497
1498 static bool ublk_check_inflight_rq(struct request *rq, void *data)
1499 {
1500         bool *idle = data;
1501
1502         if (blk_mq_request_started(rq)) {
1503                 *idle = false;
1504                 return false;
1505         }
1506         return true;
1507 }
1508
1509 static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
1510 {
1511         bool idle;
1512
1513         WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
1514         while (true) {
1515                 idle = true;
1516                 blk_mq_tagset_busy_iter(&ub->tag_set,
1517                                 ublk_check_inflight_rq, &idle);
1518                 if (idle)
1519                         break;
1520                 msleep(UBLK_REQUEUE_DELAY_MS);
1521         }
1522 }
1523
1524 static void __ublk_quiesce_dev(struct ublk_device *ub)
1525 {
1526         pr_devel("%s: quiesce ub: dev_id %d state %s\n",
1527                         __func__, ub->dev_info.dev_id,
1528                         ub->dev_info.state == UBLK_S_DEV_LIVE ?
1529                         "LIVE" : "QUIESCED");
1530         blk_mq_quiesce_queue(ub->ub_disk->queue);
1531         ublk_wait_tagset_rqs_idle(ub);
1532         ub->dev_info.state = UBLK_S_DEV_QUIESCED;
1533 }
1534
1535 static void ublk_quiesce_work_fn(struct work_struct *work)
1536 {
1537         struct ublk_device *ub =
1538                 container_of(work, struct ublk_device, quiesce_work);
1539
1540         mutex_lock(&ub->mutex);
1541         if (ub->dev_info.state != UBLK_S_DEV_LIVE)
1542                 goto unlock;
1543         __ublk_quiesce_dev(ub);
1544  unlock:
1545         mutex_unlock(&ub->mutex);
1546         ublk_cancel_dev(ub);
1547 }
1548
1549 static void ublk_unquiesce_dev(struct ublk_device *ub)
1550 {
1551         int i;
1552
1553         pr_devel("%s: unquiesce ub: dev_id %d state %s\n",
1554                         __func__, ub->dev_info.dev_id,
1555                         ub->dev_info.state == UBLK_S_DEV_LIVE ?
1556                         "LIVE" : "QUIESCED");
1557         /* quiesce_work has run. We let requeued rqs be aborted
1558          * before running fallback_wq. "force_abort" must be seen
1559          * after the request queue is unquiesced. Then del_gendisk()
1560          * can move on.
1561          */
1562         for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1563                 ublk_get_queue(ub, i)->force_abort = true;
1564
1565         blk_mq_unquiesce_queue(ub->ub_disk->queue);
1566         /* We may have requeued some rqs in ublk_quiesce_queue() */
1567         blk_mq_kick_requeue_list(ub->ub_disk->queue);
1568 }
1569
1570 static void ublk_stop_dev(struct ublk_device *ub)
1571 {
1572         struct gendisk *disk;
1573
1574         mutex_lock(&ub->mutex);
1575         if (ub->dev_info.state == UBLK_S_DEV_DEAD)
1576                 goto unlock;
1577         if (ublk_can_use_recovery(ub)) {
1578                 if (ub->dev_info.state == UBLK_S_DEV_LIVE)
1579                         __ublk_quiesce_dev(ub);
1580                 ublk_unquiesce_dev(ub);
1581         }
1582         del_gendisk(ub->ub_disk);
1583
1584         /* Sync with ublk_abort_queue() by holding the lock */
1585         spin_lock(&ub->lock);
1586         disk = ub->ub_disk;
1587         ub->dev_info.state = UBLK_S_DEV_DEAD;
1588         ub->dev_info.ublksrv_pid = -1;
1589         ub->ub_disk = NULL;
1590         spin_unlock(&ub->lock);
1591         put_disk(disk);
1592  unlock:
1593         mutex_unlock(&ub->mutex);
1594         ublk_cancel_dev(ub);
1595 }
1596
1597 /* device can only be started after all IOs are ready */
1598 static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
1599 {
1600         mutex_lock(&ub->mutex);
1601         ubq->nr_io_ready++;
1602         if (ublk_queue_ready(ubq)) {
1603                 ubq->ubq_daemon = current;
1604                 get_task_struct(ubq->ubq_daemon);
1605                 ub->nr_queues_ready++;
1606
1607                 if (capable(CAP_SYS_ADMIN))
1608                         ub->nr_privileged_daemon++;
1609         }
1610         if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues)
1611                 complete_all(&ub->completion);
1612         mutex_unlock(&ub->mutex);
1613 }
1614
1615 static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id,
1616                 int tag)
1617 {
1618         struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
1619         struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
1620
1621         ublk_queue_cmd(ubq, req);
1622 }
1623
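/*
 * Two opcode encodings are accepted: the ioctl-style commands whose
 * _IOC_TYPE is 'u' (UBLK_U_*), and, only when
 * CONFIG_BLKDEV_UBLK_LEGACY_OPCODES is enabled, the legacy plain
 * numbers whose _IOC_TYPE is 0.
 */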
1624 static inline int ublk_check_cmd_op(u32 cmd_op)
1625 {
1626         u32 ioc_type = _IOC_TYPE(cmd_op);
1627
1628         if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u')
1629                 return -EOPNOTSUPP;
1630
1631         if (ioc_type != 'u' && ioc_type != 0)
1632                 return -EOPNOTSUPP;
1633
1634         return 0;
1635 }
1636
1637 static inline void ublk_fill_io_cmd(struct ublk_io *io,
1638                 struct io_uring_cmd *cmd, unsigned long buf_addr)
1639 {
1640         io->cmd = cmd;
1641         io->flags |= UBLK_IO_FLAG_ACTIVE;
1642         io->addr = buf_addr;
1643 }
1644
1645 static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
1646                                     unsigned int issue_flags,
1647                                     struct ublk_queue *ubq, unsigned int tag)
1648 {
1649         struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1650
1651         /*
1652          * Safe to refer to @ubq since the ublk_queue won't go away until its
1653          * commands are completed
1654          */
1655         pdu->ubq = ubq;
1656         pdu->tag = tag;
1657         io_uring_cmd_mark_cancelable(cmd, issue_flags);
1658 }
1659
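/*
 * Handle one io command sent by the ublk server on /dev/ublkcN.  The
 * 16-byte struct ublksrv_io_cmd travels in the SQE's cmd area (see
 * io_uring_sqe_cmd() in ublk_ch_uring_cmd_local()).
 *
 * A hedged sketch of the expected server side, using raw io_uring SQE
 * fields; "ublkc_fd", "q_id", "tag" and "io_buf" are illustrative names:
 *
 *	struct ublksrv_io_cmd *ic = (struct ublksrv_io_cmd *)sqe->cmd;
 *
 *	sqe->opcode = IORING_OP_URING_CMD;
 *	sqe->fd     = ublkc_fd;                  // /dev/ublkcN
 *	sqe->cmd_op = UBLK_U_IO_FETCH_REQ;       // or COMMIT_AND_FETCH_REQ
 *	ic->q_id    = q_id;
 *	ic->tag     = tag;
 *	ic->addr    = (__u64)(uintptr_t)io_buf;  // unless UBLK_F_USER_COPY
 *
 * One inflight command per (queue, tag) marks the queue ready; each CQE
 * hands the server one blk-mq request, and the following
 * COMMIT_AND_FETCH_REQ both commits the result and re-arms the tag.
 */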
1660 static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
1661                                unsigned int issue_flags,
1662                                const struct ublksrv_io_cmd *ub_cmd)
1663 {
1664         struct ublk_device *ub = cmd->file->private_data;
1665         struct ublk_queue *ubq;
1666         struct ublk_io *io;
1667         u32 cmd_op = cmd->cmd_op;
1668         unsigned tag = ub_cmd->tag;
1669         int ret = -EINVAL;
1670         struct request *req;
1671
1672         pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
1673                         __func__, cmd->cmd_op, ub_cmd->q_id, tag,
1674                         ub_cmd->result);
1675
1676         if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
1677                 goto out;
1678
1679         ubq = ublk_get_queue(ub, ub_cmd->q_id);
1680         if (!ubq || ub_cmd->q_id != ubq->q_id)
1681                 goto out;
1682
1683         if (ubq->ubq_daemon && ubq->ubq_daemon != current)
1684                 goto out;
1685
1686         if (tag >= ubq->q_depth)
1687                 goto out;
1688
1689         io = &ubq->ios[tag];
1690
1691         /* there is a pending io cmd, so something must be wrong */
1692         if (io->flags & UBLK_IO_FLAG_ACTIVE) {
1693                 ret = -EBUSY;
1694                 goto out;
1695         }
1696
1697         /*
1698          * ensure that the user issues UBLK_IO_NEED_GET_DATA
1699          * iff the driver has set UBLK_IO_FLAG_NEED_GET_DATA.
1700          */
1701         if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
1702                         ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
1703                 goto out;
1704
1705         ret = ublk_check_cmd_op(cmd_op);
1706         if (ret)
1707                 goto out;
1708
1709         ret = -EINVAL;
1710         switch (_IOC_NR(cmd_op)) {
1711         case UBLK_IO_FETCH_REQ:
1712                 /* UBLK_IO_FETCH_REQ is only allowed before the queue is set up */
1713                 if (ublk_queue_ready(ubq)) {
1714                         ret = -EBUSY;
1715                         goto out;
1716                 }
1717                 /*
1718                  * The io is being handled by the server, so COMMIT_AND_FETCH_REQ
1719                  * is expected instead of FETCH_REQ
1720                  */
1721                 if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
1722                         goto out;
1723
1724                 if (!ublk_support_user_copy(ubq)) {
1725                         /*
1726                          * FETCH_REQ has to provide an IO buffer if
1727                          * NEED_GET_DATA is not enabled
1728                          */
1729                         if (!ub_cmd->addr && !ublk_need_get_data(ubq))
1730                                 goto out;
1731                 } else if (ub_cmd->addr) {
1732                         /* User copy requires addr to be unset */
1733                         ret = -EINVAL;
1734                         goto out;
1735                 }
1736
1737                 ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
1738                 ublk_mark_io_ready(ub, ubq);
1739                 break;
1740         case UBLK_IO_COMMIT_AND_FETCH_REQ:
1741                 req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag);
1742
1743                 if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
1744                         goto out;
1745
1746                 if (!ublk_support_user_copy(ubq)) {
1747                         /*
1748                          * COMMIT_AND_FETCH_REQ has to provide an IO buffer if
1749                          * NEED_GET_DATA is not enabled or it is a READ IO.
1750                          */
1751                         if (!ub_cmd->addr && (!ublk_need_get_data(ubq) ||
1752                                                 req_op(req) == REQ_OP_READ))
1753                                 goto out;
1754                 } else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) {
1755                         /*
1756                          * User copy requires addr to be unset when the command is
1757                          * not zone append
1758                          */
1759                         ret = -EINVAL;
1760                         goto out;
1761                 }
1762
1763                 ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
1764                 ublk_commit_completion(ub, ub_cmd);
1765                 break;
1766         case UBLK_IO_NEED_GET_DATA:
1767                 if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
1768                         goto out;
1769                 ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
1770                 ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag);
1771                 break;
1772         default:
1773                 goto out;
1774         }
1775         ublk_prep_cancel(cmd, issue_flags, ubq, tag);
1776         return -EIOCBQUEUED;
1777
1778  out:
1779         io_uring_cmd_done(cmd, ret, 0, issue_flags);
1780         pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
1781                         __func__, cmd_op, tag, ret, io->flags);
1782         return -EIOCBQUEUED;
1783 }
1784
1785 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
1786                 struct ublk_queue *ubq, int tag, size_t offset)
1787 {
1788         struct request *req;
1789
1790         if (!ublk_need_req_ref(ubq))
1791                 return NULL;
1792
1793         req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
1794         if (!req)
1795                 return NULL;
1796
1797         if (!ublk_get_req_ref(ubq, req))
1798                 return NULL;
1799
1800         if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
1801                 goto fail_put;
1802
1803         if (!ublk_rq_has_data(req))
1804                 goto fail_put;
1805
1806         if (offset > blk_rq_bytes(req))
1807                 goto fail_put;
1808
1809         return req;
1810 fail_put:
1811         ublk_put_req_ref(ubq, req);
1812         return NULL;
1813 }
1814
1815 static inline int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
1816                 unsigned int issue_flags)
1817 {
1818         /*
1819          * Not necessary for async retry, but let's keep it simple and always
1820          * copy the values to avoid any potential reuse.
1821          */
1822         const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe);
1823         const struct ublksrv_io_cmd ub_cmd = {
1824                 .q_id = READ_ONCE(ub_src->q_id),
1825                 .tag = READ_ONCE(ub_src->tag),
1826                 .result = READ_ONCE(ub_src->result),
1827                 .addr = READ_ONCE(ub_src->addr)
1828         };
1829
1830         WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
1831
1832         return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd);
1833 }
1834
1835 static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd,
1836                 unsigned int issue_flags)
1837 {
1838         ublk_ch_uring_cmd_local(cmd, issue_flags);
1839 }
1840
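/*
 * ->uring_cmd() handler of /dev/ublkcN: cancel requests are routed to
 * ublk_uring_cmd_cancel_fn(), and the unlikely unlocked issue path is
 * bounced to task work so that the command is always handled with the
 * io_uring context locked.
 */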
1841 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
1842 {
1843         if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
1844                 ublk_uring_cmd_cancel_fn(cmd, issue_flags);
1845                 return 0;
1846         }
1847
1848         /* a well-implemented server won't run into the unlocked path */
1849         if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
1850                 io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb);
1851                 return -EIOCBQUEUED;
1852         }
1853
1854         return ublk_ch_uring_cmd_local(cmd, issue_flags);
1855 }
1856
1857 static inline bool ublk_check_ubuf_dir(const struct request *req,
1858                 int ubuf_dir)
1859 {
1860         /* copy ubuf to request pages */
1861         if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
1862             ubuf_dir == ITER_SOURCE)
1863                 return true;
1864
1865         /* copy request pages to ubuf */
1866         if ((req_op(req) == REQ_OP_WRITE ||
1867              req_op(req) == REQ_OP_ZONE_APPEND) &&
1868             ubuf_dir == ITER_DEST)
1869                 return true;
1870
1871         return false;
1872 }
1873
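/*
 * With UBLK_F_USER_COPY the server moves data by plain pread()/pwrite()
 * (or io_uring reads/writes) on /dev/ublkcN: the file offset, based at
 * UBLKSRV_IO_BUF_OFFSET, encodes the hw queue, the tag and the byte
 * offset inside that request, and is decoded here via ublk_pos_to_hwq(),
 * ublk_pos_to_tag() and ublk_pos_to_buf_off().
 */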
1874 static struct request *ublk_check_and_get_req(struct kiocb *iocb,
1875                 struct iov_iter *iter, size_t *off, int dir)
1876 {
1877         struct ublk_device *ub = iocb->ki_filp->private_data;
1878         struct ublk_queue *ubq;
1879         struct request *req;
1880         size_t buf_off;
1881         u16 tag, q_id;
1882
1883         if (!ub)
1884                 return ERR_PTR(-EACCES);
1885
1886         if (!user_backed_iter(iter))
1887                 return ERR_PTR(-EACCES);
1888
1889         if (ub->dev_info.state == UBLK_S_DEV_DEAD)
1890                 return ERR_PTR(-EACCES);
1891
1892         tag = ublk_pos_to_tag(iocb->ki_pos);
1893         q_id = ublk_pos_to_hwq(iocb->ki_pos);
1894         buf_off = ublk_pos_to_buf_off(iocb->ki_pos);
1895
1896         if (q_id >= ub->dev_info.nr_hw_queues)
1897                 return ERR_PTR(-EINVAL);
1898
1899         ubq = ublk_get_queue(ub, q_id);
1900         if (!ubq)
1901                 return ERR_PTR(-EINVAL);
1902
1903         if (tag >= ubq->q_depth)
1904                 return ERR_PTR(-EINVAL);
1905
1906         req = __ublk_check_and_get_req(ub, ubq, tag, buf_off);
1907         if (!req)
1908                 return ERR_PTR(-EINVAL);
1909
1910         if (!req->mq_hctx || !req->mq_hctx->driver_data)
1911                 goto fail;
1912
1913         if (!ublk_check_ubuf_dir(req, dir))
1914                 goto fail;
1915
1916         *off = buf_off;
1917         return req;
1918 fail:
1919         ublk_put_req_ref(ubq, req);
1920         return ERR_PTR(-EACCES);
1921 }
1922
1923 static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
1924 {
1925         struct ublk_queue *ubq;
1926         struct request *req;
1927         size_t buf_off;
1928         size_t ret;
1929
1930         req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST);
1931         if (IS_ERR(req))
1932                 return PTR_ERR(req);
1933
1934         ret = ublk_copy_user_pages(req, buf_off, to, ITER_DEST);
1935         ubq = req->mq_hctx->driver_data;
1936         ublk_put_req_ref(ubq, req);
1937
1938         return ret;
1939 }
1940
1941 static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
1942 {
1943         struct ublk_queue *ubq;
1944         struct request *req;
1945         size_t buf_off;
1946         size_t ret;
1947
1948         req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE);
1949         if (IS_ERR(req))
1950                 return PTR_ERR(req);
1951
1952         ret = ublk_copy_user_pages(req, buf_off, from, ITER_SOURCE);
1953         ubq = req->mq_hctx->driver_data;
1954         ublk_put_req_ref(ubq, req);
1955
1956         return ret;
1957 }
1958
1959 static const struct file_operations ublk_ch_fops = {
1960         .owner = THIS_MODULE,
1961         .open = ublk_ch_open,
1962         .release = ublk_ch_release,
1963         .llseek = no_llseek,
1964         .read_iter = ublk_ch_read_iter,
1965         .write_iter = ublk_ch_write_iter,
1966         .uring_cmd = ublk_ch_uring_cmd,
1967         .mmap = ublk_ch_mmap,
1968 };
1969
1970 static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
1971 {
1972         int size = ublk_queue_cmd_buf_size(ub, q_id);
1973         struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
1974
1975         if (ubq->ubq_daemon)
1976                 put_task_struct(ubq->ubq_daemon);
1977         if (ubq->io_cmd_buf)
1978                 free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
1979 }
1980
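/*
 * Allocate the per-queue command buffer, which is expected to hold one
 * struct ublksrv_io_desc per tag; the queue's daemon mmap()s it through
 * /dev/ublkcN (see ublk_ch_mmap()) to read the descriptor of every
 * incoming request.
 */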
1981 static int ublk_init_queue(struct ublk_device *ub, int q_id)
1982 {
1983         struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
1984         gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
1985         void *ptr;
1986         int size;
1987
1988         spin_lock_init(&ubq->cancel_lock);
1989         ubq->flags = ub->dev_info.flags;
1990         ubq->q_id = q_id;
1991         ubq->q_depth = ub->dev_info.queue_depth;
1992         size = ublk_queue_cmd_buf_size(ub, q_id);
1993
1994         ptr = (void *) __get_free_pages(gfp_flags, get_order(size));
1995         if (!ptr)
1996                 return -ENOMEM;
1997
1998         ubq->io_cmd_buf = ptr;
1999         ubq->dev = ub;
2000         return 0;
2001 }
2002
2003 static void ublk_deinit_queues(struct ublk_device *ub)
2004 {
2005         int nr_queues = ub->dev_info.nr_hw_queues;
2006         int i;
2007
2008         if (!ub->__queues)
2009                 return;
2010
2011         for (i = 0; i < nr_queues; i++)
2012                 ublk_deinit_queue(ub, i);
2013         kfree(ub->__queues);
2014 }
2015
2016 static int ublk_init_queues(struct ublk_device *ub)
2017 {
2018         int nr_queues = ub->dev_info.nr_hw_queues;
2019         int depth = ub->dev_info.queue_depth;
2020         int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io);
2021         int i, ret = -ENOMEM;
2022
2023         ub->queue_size = ubq_size;
2024         ub->__queues = kcalloc(nr_queues, ubq_size, GFP_KERNEL);
2025         if (!ub->__queues)
2026                 return ret;
2027
2028         for (i = 0; i < nr_queues; i++) {
2029                 if (ublk_init_queue(ub, i))
2030                         goto fail;
2031         }
2032
2033         init_completion(&ub->completion);
2034         return 0;
2035
2036  fail:
2037         ublk_deinit_queues(ub);
2038         return ret;
2039 }
2040
2041 static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
2042 {
2043         int i = idx;
2044         int err;
2045
2046         spin_lock(&ublk_idr_lock);
2047         /* allocate id; if @idx >= 0, we're requesting that specific id */
2048         if (i >= 0) {
2049                 err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
2050                 if (err == -ENOSPC)
2051                         err = -EEXIST;
2052         } else {
2053                 err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS,
2054                                 GFP_NOWAIT);
2055         }
2056         spin_unlock(&ublk_idr_lock);
2057
2058         if (err >= 0)
2059                 ub->ub_number = err;
2060
2061         return err;
2062 }
2063
2064 static void ublk_free_dev_number(struct ublk_device *ub)
2065 {
2066         spin_lock(&ublk_idr_lock);
2067         idr_remove(&ublk_index_idr, ub->ub_number);
2068         wake_up_all(&ublk_idr_wq);
2069         spin_unlock(&ublk_idr_lock);
2070 }
2071
2072 static void ublk_cdev_rel(struct device *dev)
2073 {
2074         struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);
2075
2076         blk_mq_free_tag_set(&ub->tag_set);
2077         ublk_deinit_queues(ub);
2078         ublk_free_dev_number(ub);
2079         mutex_destroy(&ub->mutex);
2080         kfree(ub);
2081 }
2082
2083 static int ublk_add_chdev(struct ublk_device *ub)
2084 {
2085         struct device *dev = &ub->cdev_dev;
2086         int minor = ub->ub_number;
2087         int ret;
2088
2089         dev->parent = ublk_misc.this_device;
2090         dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
2091         dev->class = &ublk_chr_class;
2092         dev->release = ublk_cdev_rel;
2093         device_initialize(dev);
2094
2095         ret = dev_set_name(dev, "ublkc%d", minor);
2096         if (ret)
2097                 goto fail;
2098
2099         cdev_init(&ub->cdev, &ublk_ch_fops);
2100         ret = cdev_device_add(&ub->cdev, dev);
2101         if (ret)
2102                 goto fail;
2103
2104         ublks_added++;
2105         return 0;
2106  fail:
2107         put_device(dev);
2108         return ret;
2109 }
2110
2111 static void ublk_stop_work_fn(struct work_struct *work)
2112 {
2113         struct ublk_device *ub =
2114                 container_of(work, struct ublk_device, stop_work);
2115
2116         ublk_stop_dev(ub);
2117 }
2118
2119 /* align max io buffer size with PAGE_SIZE */
2120 static void ublk_align_max_io_size(struct ublk_device *ub)
2121 {
2122         unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
2123
2124         ub->dev_info.max_io_buf_bytes =
2125                 round_down(max_io_bytes, PAGE_SIZE);
2126 }
2127
2128 static int ublk_add_tag_set(struct ublk_device *ub)
2129 {
2130         ub->tag_set.ops = &ublk_mq_ops;
2131         ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
2132         ub->tag_set.queue_depth = ub->dev_info.queue_depth;
2133         ub->tag_set.numa_node = NUMA_NO_NODE;
2134         ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
2135         ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
2136         ub->tag_set.driver_data = ub;
2137         return blk_mq_alloc_tag_set(&ub->tag_set);
2138 }
2139
2140 static void ublk_remove(struct ublk_device *ub)
2141 {
2142         ublk_stop_dev(ub);
2143         cancel_work_sync(&ub->stop_work);
2144         cancel_work_sync(&ub->quiesce_work);
2145         cdev_device_del(&ub->cdev, &ub->cdev_dev);
2146         ublk_put_device(ub);
2147         ublks_added--;
2148 }
2149
2150 static struct ublk_device *ublk_get_device_from_id(int idx)
2151 {
2152         struct ublk_device *ub = NULL;
2153
2154         if (idx < 0)
2155                 return NULL;
2156
2157         spin_lock(&ublk_idr_lock);
2158         ub = idr_find(&ublk_index_idr, idx);
2159         if (ub)
2160                 ub = ublk_get_device(ub);
2161         spin_unlock(&ublk_idr_lock);
2162
2163         return ub;
2164 }
2165
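/*
 * START_DEV: block until every queue has fetched all of its tags (see
 * ublk_mark_io_ready()), translate the negotiated ublk_params into
 * queue_limits and expose the disk as /dev/ublkbN.
 *
 * A hedged sketch of the usual control sequence driven by the server:
 *
 *	ADD_DEV -> SET_PARAMS -> start one daemon per queue, each issuing
 *	FETCH_REQ for every tag -> START_DEV with data[0] = ublksrv pid
 */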
2166 static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
2167 {
2168         const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
2169         const struct ublk_param_basic *p = &ub->params.basic;
2170         int ublksrv_pid = (int)header->data[0];
2171         struct queue_limits lim = {
2172                 .logical_block_size     = 1 << p->logical_bs_shift,
2173                 .physical_block_size    = 1 << p->physical_bs_shift,
2174                 .io_min                 = 1 << p->io_min_shift,
2175                 .io_opt                 = 1 << p->io_opt_shift,
2176                 .max_hw_sectors         = p->max_sectors,
2177                 .chunk_sectors          = p->chunk_sectors,
2178                 .virt_boundary_mask     = p->virt_boundary_mask,
2179                 .max_segments           = USHRT_MAX,
2180                 .max_segment_size       = UINT_MAX,
2181         };
2182         struct gendisk *disk;
2183         int ret = -EINVAL;
2184
2185         if (ublksrv_pid <= 0)
2186                 return -EINVAL;
2187         if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
2188                 return -EINVAL;
2189
2190         if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
2191                 const struct ublk_param_discard *pd = &ub->params.discard;
2192
2193                 lim.discard_alignment = pd->discard_alignment;
2194                 lim.discard_granularity = pd->discard_granularity;
2195                 lim.max_hw_discard_sectors = pd->max_discard_sectors;
2196                 lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors;
2197                 lim.max_discard_segments = pd->max_discard_segments;
2198         }
2199
2200         if (ub->params.types & UBLK_PARAM_TYPE_ZONED) {
2201                 const struct ublk_param_zoned *p = &ub->params.zoned;
2202
2203                 if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
2204                         return -EOPNOTSUPP;
2205
2206                 lim.zoned = true;
2207                 lim.max_active_zones = p->max_active_zones;
2208                 lim.max_open_zones = p->max_open_zones;
2209                 lim.max_zone_append_sectors = p->max_zone_append_sectors;
2210         }
2211
2212         if (wait_for_completion_interruptible(&ub->completion) != 0)
2213                 return -EINTR;
2214
2215         mutex_lock(&ub->mutex);
2216         if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
2217             test_bit(UB_STATE_USED, &ub->state)) {
2218                 ret = -EEXIST;
2219                 goto out_unlock;
2220         }
2221
2222         disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL);
2223         if (IS_ERR(disk)) {
2224                 ret = PTR_ERR(disk);
2225                 goto out_unlock;
2226         }
2227         sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
2228         disk->fops = &ub_fops;
2229         disk->private_data = ub;
2230
2231         ub->dev_info.ublksrv_pid = ublksrv_pid;
2232         ub->ub_disk = disk;
2233
2234         ublk_apply_params(ub);
2235
2236         /* don't probe partitions if any ubq daemon is untrusted */
2237         if (ub->nr_privileged_daemon != ub->nr_queues_ready)
2238                 set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
2239
2240         ublk_get_device(ub);
2241         ub->dev_info.state = UBLK_S_DEV_LIVE;
2242
2243         if (ublk_dev_is_zoned(ub)) {
2244                 ret = ublk_revalidate_disk_zones(ub);
2245                 if (ret)
2246                         goto out_put_cdev;
2247         }
2248
2249         ret = add_disk(disk);
2250         if (ret)
2251                 goto out_put_cdev;
2252
2253         set_bit(UB_STATE_USED, &ub->state);
2254
2255 out_put_cdev:
2256         if (ret) {
2257                 ub->dev_info.state = UBLK_S_DEV_DEAD;
2258                 ublk_put_device(ub);
2259         }
2260         if (ret)
2261                 put_disk(disk);
2262 out_unlock:
2263         mutex_unlock(&ub->mutex);
2264         return ret;
2265 }
2266
2267 static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
2268                 struct io_uring_cmd *cmd)
2269 {
2270         const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
2271         void __user *argp = (void __user *)(unsigned long)header->addr;
2272         cpumask_var_t cpumask;
2273         unsigned long queue;
2274         unsigned int retlen;
2275         unsigned int i;
2276         int ret;
2277
2278         if (header->len * BITS_PER_BYTE < nr_cpu_ids)
2279                 return -EINVAL;
2280         if (header->len & (sizeof(unsigned long)-1))
2281                 return -EINVAL;
2282         if (!header->addr)
2283                 return -EINVAL;
2284
2285         queue = header->data[0];
2286         if (queue >= ub->dev_info.nr_hw_queues)
2287                 return -EINVAL;
2288
2289         if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
2290                 return -ENOMEM;
2291
2292         for_each_possible_cpu(i) {
2293                 if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
2294                         cpumask_set_cpu(i, cpumask);
2295         }
2296
2297         ret = -EFAULT;
2298         retlen = min_t(unsigned short, header->len, cpumask_size());
2299         if (copy_to_user(argp, cpumask, retlen))
2300                 goto out_free_cpumask;
2301         if (retlen != header->len &&
2302             clear_user(argp + retlen, header->len - retlen))
2303                 goto out_free_cpumask;
2304
2305         ret = 0;
2306 out_free_cpumask:
2307         free_cpumask_var(cpumask);
2308         return ret;
2309 }
2310
2311 static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
2312 {
2313         pr_devel("%s: dev id %d flags %llx\n", __func__,
2314                         info->dev_id, info->flags);
2315         pr_devel("\t nr_hw_queues %d queue_depth %d\n",
2316                         info->nr_hw_queues, info->queue_depth);
2317 }
2318
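/*
 * ADD_DEV: copy in struct ublksrv_ctrl_dev_info, negotiate features by
 * masking dev_info.flags with UBLK_F_ALL, allocate the queues and the
 * tag set, and create the /dev/ublkcN char device.  The adjusted
 * dev_info, including the allocated dev_id, is copied back to the buffer
 * at header->addr so the server learns the negotiated result.
 */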
2319 static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
2320 {
2321         const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
2322         void __user *argp = (void __user *)(unsigned long)header->addr;
2323         struct ublksrv_ctrl_dev_info info;
2324         struct ublk_device *ub;
2325         int ret = -EINVAL;
2326
2327         if (header->len < sizeof(info) || !header->addr)
2328                 return -EINVAL;
2329         if (header->queue_id != (u16)-1) {
2330                 pr_warn("%s: queue_id is wrong %x\n",
2331                         __func__, header->queue_id);
2332                 return -EINVAL;
2333         }
2334
2335         if (copy_from_user(&info, argp, sizeof(info)))
2336                 return -EFAULT;
2337
2338         if (capable(CAP_SYS_ADMIN))
2339                 info.flags &= ~UBLK_F_UNPRIVILEGED_DEV;
2340         else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
2341                 return -EPERM;
2342
2343         /*
2344          * An unprivileged device can't be trusted, but RECOVERY and
2345          * RECOVERY_REISSUE may still hang error handling, so the recovery
2346          * features can't be supported for unprivileged ublk for now.
2347          *
2348          * TODO: provide forward progress for the RECOVERY handler, so that
2349          * unprivileged devices can benefit from it.
2350          */
2351         if (info.flags & UBLK_F_UNPRIVILEGED_DEV)
2352                 info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE |
2353                                 UBLK_F_USER_RECOVERY);
2354
2355         /* the created device is always owned by the current user */
2356         ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid);
2357
2358         if (header->dev_id != info.dev_id) {
2359                 pr_warn("%s: dev id doesn't match: %u %u\n",
2360                         __func__, header->dev_id, info.dev_id);
2361                 return -EINVAL;
2362         }
2363
2364         if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) {
2365                 pr_warn("%s: dev id is too large. Max supported is %d\n",
2366                         __func__, UBLK_MAX_UBLKS - 1);
2367                 return -EINVAL;
2368         }
2369
2370         ublk_dump_dev_info(&info);
2371
2372         ret = mutex_lock_killable(&ublk_ctl_mutex);
2373         if (ret)
2374                 return ret;
2375
2376         ret = -EACCES;
2377         if (ublks_added >= ublks_max)
2378                 goto out_unlock;
2379
2380         ret = -ENOMEM;
2381         ub = kzalloc(sizeof(*ub), GFP_KERNEL);
2382         if (!ub)
2383                 goto out_unlock;
2384         mutex_init(&ub->mutex);
2385         spin_lock_init(&ub->lock);
2386         INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn);
2387         INIT_WORK(&ub->stop_work, ublk_stop_work_fn);
2388
2389         ret = ublk_alloc_dev_number(ub, header->dev_id);
2390         if (ret < 0)
2391                 goto out_free_ub;
2392
2393         memcpy(&ub->dev_info, &info, sizeof(info));
2394
2395         /* update device id */
2396         ub->dev_info.dev_id = ub->ub_number;
2397
2398         /*
2399          * The 64-bit flags will be copied back to userspace as the
2400          * feature negotiation result, so clear any flags the driver
2401          * doesn't support yet; userspace then gets the correct flags
2402          * (features) to handle.
2403          */
2404         ub->dev_info.flags &= UBLK_F_ALL;
2405
2406         ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
2407                 UBLK_F_URING_CMD_COMP_IN_TASK;
2408
2409         /* GET_DATA isn't needed any more with USER_COPY */
2410         if (ublk_dev_is_user_copy(ub))
2411                 ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
2412
2413         /* Zoned storage support requires user copy feature */
2414         if (ublk_dev_is_zoned(ub) &&
2415             (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !ublk_dev_is_user_copy(ub))) {
2416                 ret = -EINVAL;
2417                 goto out_free_dev_number;
2418         }
2419
2420         /* We are not ready to support zero copy */
2421         ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
2422
2423         ub->dev_info.nr_hw_queues = min_t(unsigned int,
2424                         ub->dev_info.nr_hw_queues, nr_cpu_ids);
2425         ublk_align_max_io_size(ub);
2426
2427         ret = ublk_init_queues(ub);
2428         if (ret)
2429                 goto out_free_dev_number;
2430
2431         ret = ublk_add_tag_set(ub);
2432         if (ret)
2433                 goto out_deinit_queues;
2434
2435         ret = -EFAULT;
2436         if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
2437                 goto out_free_tag_set;
2438
2439         /*
2440          * Add the char dev so that the ublksrv daemon can be set up.
2441          * ublk_add_chdev() will clean up everything if it fails.
2442          */
2443         ret = ublk_add_chdev(ub);
2444         goto out_unlock;
2445
2446 out_free_tag_set:
2447         blk_mq_free_tag_set(&ub->tag_set);
2448 out_deinit_queues:
2449         ublk_deinit_queues(ub);
2450 out_free_dev_number:
2451         ublk_free_dev_number(ub);
2452 out_free_ub:
2453         mutex_destroy(&ub->mutex);
2454         kfree(ub);
2455 out_unlock:
2456         mutex_unlock(&ublk_ctl_mutex);
2457         return ret;
2458 }
2459
2460 static inline bool ublk_idr_freed(int id)
2461 {
2462         void *ptr;
2463
2464         spin_lock(&ublk_idr_lock);
2465         ptr = idr_find(&ublk_index_idr, id);
2466         spin_unlock(&ublk_idr_lock);
2467
2468         return ptr == NULL;
2469 }
2470
2471 static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait)
2472 {
2473         struct ublk_device *ub = *p_ub;
2474         int idx = ub->ub_number;
2475         int ret;
2476
2477         ret = mutex_lock_killable(&ublk_ctl_mutex);
2478         if (ret)
2479                 return ret;
2480
2481         if (!test_bit(UB_STATE_DELETED, &ub->state)) {
2482                 ublk_remove(ub);
2483                 set_bit(UB_STATE_DELETED, &ub->state);
2484         }
2485
2486         /* Mark the reference as consumed */
2487         *p_ub = NULL;
2488         ublk_put_device(ub);
2489         mutex_unlock(&ublk_ctl_mutex);
2490
2491         /*
2492          * Wait until the idr entry is removed, so the device number can be
2493          * reused after the DEL_DEV command returns.
2494          *
2495          * If we return because of a user interrupt, a future delete command
2496          * may come:
2497          *
2498          * - the device number isn't freed: this device won't or needn't
2499          *   be deleted again, since UB_STATE_DELETED is set, and the device
2500          *   will be released after the last reference is dropped
2501          *
2502          * - the device number is freed already: we will not find this
2503          *   device via ublk_get_device_from_id()
2504          */
2505         if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
2506                 return -EINTR;
2507         return 0;
2508 }
2509
2510 static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd)
2511 {
2512         const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
2513
2514         pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
2515                         __func__, cmd->cmd_op, header->dev_id, header->queue_id,
2516                         header->data[0], header->addr, header->len);
2517 }
2518
2519 static int ublk_ctrl_stop_dev(struct ublk_device *ub)
2520 {
2521         ublk_stop_dev(ub);
2522         cancel_work_sync(&ub->stop_work);
2523         cancel_work_sync(&ub->quiesce_work);
2524
2525         return 0;
2526 }
2527
2528 static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
2529                 struct io_uring_cmd *cmd)
2530 {
2531         const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
2532         void __user *argp = (void __user *)(unsigned long)header->addr;
2533
2534         if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
2535                 return -EINVAL;
2536
2537         if (copy_to_user(argp, &ub->dev_info, sizeof(ub->dev_info)))
2538                 return -EFAULT;
2539
2540         return 0;
2541 }
2542
2543 /* TYPE_DEVT is read-only, so fill it in before returning to userspace */
2544 static void ublk_ctrl_fill_params_devt(struct ublk_device *ub)
2545 {
2546         ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt);
2547         ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt);
2548
2549         if (ub->ub_disk) {
2550                 ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk));
2551                 ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk));
2552         } else {
2553                 ub->params.devt.disk_major = 0;
2554                 ub->params.devt.disk_minor = 0;
2555         }
2556         ub->params.types |= UBLK_PARAM_TYPE_DEVT;
2557 }
2558
2559 static int ublk_ctrl_get_params(struct ublk_device *ub,
2560                 struct io_uring_cmd *cmd)
2561 {
2562         const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
2563         void __user *argp = (void __user *)(unsigned long)header->addr;
2564         struct ublk_params_header ph;
2565         int ret;
2566
2567         if (header->len <= sizeof(ph) || !header->addr)
2568                 return -EINVAL;
2569
2570         if (copy_from_user(&ph, argp, sizeof(ph)))
2571                 return -EFAULT;
2572
2573         if (ph.len > header->len || !ph.len)
2574                 return -EINVAL;
2575
2576         if (ph.len > sizeof(struct ublk_params))
2577                 ph.len = sizeof(struct ublk_params);
2578
2579         mutex_lock(&ub->mutex);
2580         ublk_ctrl_fill_params_devt(ub);
2581         if (copy_to_user(argp, &ub->params, ph.len))
2582                 ret = -EFAULT;
2583         else
2584                 ret = 0;
2585         mutex_unlock(&ub->mutex);
2586
2587         return ret;
2588 }
2589
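/*
 * SET_PARAMS/GET_PARAMS carry a struct ublk_params whose leading
 * struct ublk_params_header { len, types } negotiates how many bytes
 * both sides understand: at most min(ph.len, sizeof(struct ublk_params))
 * bytes are ever copied.
 *
 * A hedged sketch of the server side filling the basic parameters before
 * START_DEV (field values are illustrative only, "dev_size" is assumed):
 *
 *	struct ublk_params p = {
 *		.len   = sizeof(p),
 *		.types = UBLK_PARAM_TYPE_BASIC,
 *		.basic = {
 *			.logical_bs_shift  = 9,
 *			.physical_bs_shift = 12,
 *			.io_min_shift      = 9,
 *			.io_opt_shift      = 12,
 *			.max_sectors       = 1024,
 *			.dev_sectors       = dev_size >> 9,
 *		},
 *	};
 *
 * with header->addr pointing at @p and header->len >= p.len.
 */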
2590 static int ublk_ctrl_set_params(struct ublk_device *ub,
2591                 struct io_uring_cmd *cmd)
2592 {
2593         const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
2594         void __user *argp = (void __user *)(unsigned long)header->addr;
2595         struct ublk_params_header ph;
2596         int ret = -EFAULT;
2597
2598         if (header->len <= sizeof(ph) || !header->addr)
2599                 return -EINVAL;
2600
2601         if (copy_from_user(&ph, argp, sizeof(ph)))
2602                 return -EFAULT;
2603
2604         if (ph.len > header->len || !ph.len || !ph.types)
2605                 return -EINVAL;
2606
2607         if (ph.len > sizeof(struct ublk_params))
2608                 ph.len = sizeof(struct ublk_params);
2609
2610         /* parameters can only be changed when device isn't live */
2611         mutex_lock(&ub->mutex);
2612         if (ub->dev_info.state == UBLK_S_DEV_LIVE) {
2613                 ret = -EACCES;
2614         } else if (copy_from_user(&ub->params, argp, ph.len)) {
2615                 ret = -EFAULT;
2616         } else {
2617                 /* clear all we don't support yet */
2618                 ub->params.types &= UBLK_PARAM_TYPE_ALL;
2619                 ret = ublk_validate_params(ub);
2620                 if (ret)
2621                         ub->params.types = 0;
2622         }
2623         mutex_unlock(&ub->mutex);
2624
2625         return ret;
2626 }
2627
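/*
 * Reset the per-queue daemon state during user recovery so a fresh ublk
 * server can attach: drop the dying daemon task, clear every io slot and
 * let the queue accept a new round of FETCH_REQs.
 */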
2628 static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
2629 {
2630         int i;
2631
2632         WARN_ON_ONCE(!(ubq->ubq_daemon && ubq_daemon_is_dying(ubq)));
2633
2634         /* All old ioucmds have to be completed */
2635         ubq->nr_io_ready = 0;
2636         /* old daemon is PF_EXITING, put it now */
2637         put_task_struct(ubq->ubq_daemon);
2638         /* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */
2639         ubq->ubq_daemon = NULL;
2640         ubq->timeout = false;
2641         ubq->canceling = false;
2642
2643         for (i = 0; i < ubq->q_depth; i++) {
2644                 struct ublk_io *io = &ubq->ios[i];
2645
2646                 /* forget everything now and be ready for new FETCH_REQ */
2647                 io->flags = 0;
2648                 io->cmd = NULL;
2649                 io->addr = 0;
2650         }
2651 }
2652
2653 static int ublk_ctrl_start_recovery(struct ublk_device *ub,
2654                 struct io_uring_cmd *cmd)
2655 {
2656         const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
2657         int ret = -EINVAL;
2658         int i;
2659
2660         mutex_lock(&ub->mutex);
2661         if (!ublk_can_use_recovery(ub))
2662                 goto out_unlock;
2663         /*
2664          * START_RECOVERY is only allowed after:
2665          *
2666          * (1) UB_STATE_OPEN is not set, which means the dying process has
2667          *     exited and the related io_uring ctx is freed, so the file struct
2668          *     of /dev/ublkcX is released.
2669          *
2670          * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
2671          *     (a) has quiesced the request queue
2672          *     (b) has requeued every inflight rq whose io_flags is ACTIVE
2673          *     (c) has requeued/aborted every inflight rq whose io_flags is NOT ACTIVE
2674          *     (d) has completed/canceled all ioucmds owned by the dying process
2675          */
2676         if (test_bit(UB_STATE_OPEN, &ub->state) ||
2677                         ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
2678                 ret = -EBUSY;
2679                 goto out_unlock;
2680         }
2681         pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id);
2682         for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2683                 ublk_queue_reinit(ub, ublk_get_queue(ub, i));
2684         /* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */
2685         ub->mm = NULL;
2686         ub->nr_queues_ready = 0;
2687         ub->nr_privileged_daemon = 0;
2688         init_completion(&ub->completion);
2689         ret = 0;
2690  out_unlock:
2691         mutex_unlock(&ub->mutex);
2692         return ret;
2693 }
2694
2695 static int ublk_ctrl_end_recovery(struct ublk_device *ub,
2696                 struct io_uring_cmd *cmd)
2697 {
2698         const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
2699         int ublksrv_pid = (int)header->data[0];
2700         int ret = -EINVAL;
2701
2702         pr_devel("%s: Waiting for new ubq_daemons (nr: %d) to become ready, dev id %d...\n",
2703                         __func__, ub->dev_info.nr_hw_queues, header->dev_id);
2704         /* wait until the new ubq_daemon has sent all FETCH_REQs */
2705         if (wait_for_completion_interruptible(&ub->completion))
2706                 return -EINTR;
2707
2708         pr_devel("%s: All new ubq_daemons (nr: %d) are ready, dev id %d\n",
2709                         __func__, ub->dev_info.nr_hw_queues, header->dev_id);
2710
2711         mutex_lock(&ub->mutex);
2712         if (!ublk_can_use_recovery(ub))
2713                 goto out_unlock;
2714
2715         if (ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
2716                 ret = -EBUSY;
2717                 goto out_unlock;
2718         }
2719         ub->dev_info.ublksrv_pid = ublksrv_pid;
2720         pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
2721                         __func__, ublksrv_pid, header->dev_id);
2722         blk_mq_unquiesce_queue(ub->ub_disk->queue);
2723         pr_devel("%s: queue unquiesced, dev id %d.\n",
2724                         __func__, header->dev_id);
2725         blk_mq_kick_requeue_list(ub->ub_disk->queue);
2726         ub->dev_info.state = UBLK_S_DEV_LIVE;
2727         ret = 0;
2728  out_unlock:
2729         mutex_unlock(&ub->mutex);
2730         return ret;
2731 }
2732
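/*
 * Report the feature flags this driver supports (UBLK_F_ALL minus zero
 * copy).  A hedged sketch of the control side, using raw io_uring SQE
 * fields; SQE128 is required and "ctrl_fd" (/dev/ublk-control) is an
 * assumed name:
 *
 *	struct ublksrv_ctrl_cmd *hdr = (struct ublksrv_ctrl_cmd *)sqe->cmd;
 *	__u64 features;
 *
 *	sqe->opcode = IORING_OP_URING_CMD;
 *	sqe->fd     = ctrl_fd;
 *	sqe->cmd_op = UBLK_U_CMD_GET_FEATURES;
 *	hdr->addr   = (__u64)(uintptr_t)&features;
 *	hdr->len    = UBLK_FEATURES_LEN;
 *
 * On success the mask is copied into @features, letting the server know
 * which UBLK_F_* flags it may request in ADD_DEV.
 */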
2733 static int ublk_ctrl_get_features(struct io_uring_cmd *cmd)
2734 {
2735         const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
2736         void __user *argp = (void __user *)(unsigned long)header->addr;
2737         u64 features = UBLK_F_ALL & ~UBLK_F_SUPPORT_ZERO_COPY;
2738
2739         if (header->len != UBLK_FEATURES_LEN || !header->addr)
2740                 return -EINVAL;
2741
2742         if (copy_to_user(argp, &features, UBLK_FEATURES_LEN))
2743                 return -EFAULT;
2744
2745         return 0;
2746 }
2747
2748 /*
2749  * All control commands are sent via /dev/ublk-control, so we have to check
2750  * the destination device's permission
2751  */
2752 static int ublk_char_dev_permission(struct ublk_device *ub,
2753                 const char *dev_path, int mask)
2754 {
2755         int err;
2756         struct path path;
2757         struct kstat stat;
2758
2759         err = kern_path(dev_path, LOOKUP_FOLLOW, &path);
2760         if (err)
2761                 return err;
2762
2763         err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
2764         if (err)
2765                 goto exit;
2766
2767         err = -EPERM;
2768         if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode))
2769                 goto exit;
2770
2771         err = inode_permission(&nop_mnt_idmap,
2772                         d_backing_inode(path.dentry), mask);
2773 exit:
2774         path_put(&path);
2775         return err;
2776 }
2777
2778 static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
2779                 struct io_uring_cmd *cmd)
2780 {
2781         struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)io_uring_sqe_cmd(cmd->sqe);
2782         bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
2783         void __user *argp = (void __user *)(unsigned long)header->addr;
2784         char *dev_path = NULL;
2785         int ret = 0;
2786         int mask;
2787
2788         if (!unprivileged) {
2789                 if (!capable(CAP_SYS_ADMIN))
2790                         return -EPERM;
2791                 /*
2792                  * The newly added UBLK_CMD_GET_DEV_INFO2 command includes
2793                  * char_dev_path in its payload too, since userspace may not
2794                  * know if the specified device was created in unprivileged
2795                  * mode.
2796                  */
2797                 if (_IOC_NR(cmd->cmd_op) != UBLK_CMD_GET_DEV_INFO2)
2798                         return 0;
2799         }
2800
2801         /*
2802          * The user has to provide the char device path for an unprivileged ublk
2803          *
2804          * header->addr always points to the dev path buffer, and
2805          * header->dev_path_len records the length of the dev path buffer.
2806          */
2807         if (!header->dev_path_len || header->dev_path_len > PATH_MAX)
2808                 return -EINVAL;
2809
2810         if (header->len < header->dev_path_len)
2811                 return -EINVAL;
2812
2813         dev_path = memdup_user_nul(argp, header->dev_path_len);
2814         if (IS_ERR(dev_path))
2815                 return PTR_ERR(dev_path);
2816
2817         ret = -EINVAL;
2818         switch (_IOC_NR(cmd->cmd_op)) {
2819         case UBLK_CMD_GET_DEV_INFO:
2820         case UBLK_CMD_GET_DEV_INFO2:
2821         case UBLK_CMD_GET_QUEUE_AFFINITY:
2822         case UBLK_CMD_GET_PARAMS:
2823         case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)):
2824                 mask = MAY_READ;
2825                 break;
2826         case UBLK_CMD_START_DEV:
2827         case UBLK_CMD_STOP_DEV:
2828         case UBLK_CMD_ADD_DEV:
2829         case UBLK_CMD_DEL_DEV:
2830         case UBLK_CMD_SET_PARAMS:
2831         case UBLK_CMD_START_USER_RECOVERY:
2832         case UBLK_CMD_END_USER_RECOVERY:
2833                 mask = MAY_READ | MAY_WRITE;
2834                 break;
2835         default:
2836                 goto exit;
2837         }
2838
2839         ret = ublk_char_dev_permission(ub, dev_path, mask);
2840         if (!ret) {
2841                 header->len -= header->dev_path_len;
2842                 header->addr += header->dev_path_len;
2843         }
2844         pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n",
2845                         __func__, ub->ub_number, cmd->cmd_op,
2846                         ub->dev_info.owner_uid, ub->dev_info.owner_gid,
2847                         dev_path, ret);
2848 exit:
2849         kfree(dev_path);
2850         return ret;
2851 }
2852
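/*
 * Single entry point for all uring_cmds issued on /dev/ublk-control:
 * SQE128 is required, the opcode encoding is validated, and, except for
 * ADD_DEV and GET_FEATURES, the target device is looked up from
 * header->dev_id and its permission is checked before dispatching.
 */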
2853 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
2854                 unsigned int issue_flags)
2855 {
2856         const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
2857         struct ublk_device *ub = NULL;
2858         u32 cmd_op = cmd->cmd_op;
2859         int ret = -EINVAL;
2860
2861         if (issue_flags & IO_URING_F_NONBLOCK)
2862                 return -EAGAIN;
2863
2864         ublk_ctrl_cmd_dump(cmd);
2865
2866         if (!(issue_flags & IO_URING_F_SQE128))
2867                 goto out;
2868
2869         ret = ublk_check_cmd_op(cmd_op);
2870         if (ret)
2871                 goto out;
2872
2873         if (cmd_op == UBLK_U_CMD_GET_FEATURES) {
2874                 ret = ublk_ctrl_get_features(cmd);
2875                 goto out;
2876         }
2877
2878         if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) {
2879                 ret = -ENODEV;
2880                 ub = ublk_get_device_from_id(header->dev_id);
2881                 if (!ub)
2882                         goto out;
2883
2884                 ret = ublk_ctrl_uring_cmd_permission(ub, cmd);
2885                 if (ret)
2886                         goto put_dev;
2887         }
2888
2889         switch (_IOC_NR(cmd_op)) {
2890         case UBLK_CMD_START_DEV:
2891                 ret = ublk_ctrl_start_dev(ub, cmd);
2892                 break;
2893         case UBLK_CMD_STOP_DEV:
2894                 ret = ublk_ctrl_stop_dev(ub);
2895                 break;
2896         case UBLK_CMD_GET_DEV_INFO:
2897         case UBLK_CMD_GET_DEV_INFO2:
2898                 ret = ublk_ctrl_get_dev_info(ub, cmd);
2899                 break;
2900         case UBLK_CMD_ADD_DEV:
2901                 ret = ublk_ctrl_add_dev(cmd);
2902                 break;
2903         case UBLK_CMD_DEL_DEV:
2904                 ret = ublk_ctrl_del_dev(&ub, true);
2905                 break;
2906         case UBLK_U_CMD_DEL_DEV_ASYNC:
2907                 ret = ublk_ctrl_del_dev(&ub, false);
2908                 break;
2909         case UBLK_CMD_GET_QUEUE_AFFINITY:
2910                 ret = ublk_ctrl_get_queue_affinity(ub, cmd);
2911                 break;
2912         case UBLK_CMD_GET_PARAMS:
2913                 ret = ublk_ctrl_get_params(ub, cmd);
2914                 break;
2915         case UBLK_CMD_SET_PARAMS:
2916                 ret = ublk_ctrl_set_params(ub, cmd);
2917                 break;
2918         case UBLK_CMD_START_USER_RECOVERY:
2919                 ret = ublk_ctrl_start_recovery(ub, cmd);
2920                 break;
2921         case UBLK_CMD_END_USER_RECOVERY:
2922                 ret = ublk_ctrl_end_recovery(ub, cmd);
2923                 break;
2924         default:
2925                 ret = -ENOTSUPP;
2926                 break;
2927         }
2928
2929  put_dev:
2930         if (ub)
2931                 ublk_put_device(ub);
2932  out:
2933         io_uring_cmd_done(cmd, ret, 0, issue_flags);
2934         pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
2935                         __func__, ret, cmd->cmd_op, header->dev_id, header->queue_id);
2936         return -EIOCBQUEUED;
2937 }
2938
2939 static const struct file_operations ublk_ctl_fops = {
2940         .open           = nonseekable_open,
2941         .uring_cmd      = ublk_ctrl_uring_cmd,
2942         .owner          = THIS_MODULE,
2943         .llseek         = noop_llseek,
2944 };
2945
2946 static struct miscdevice ublk_misc = {
2947         .minor          = MISC_DYNAMIC_MINOR,
2948         .name           = "ublk-control",
2949         .fops           = &ublk_ctl_fops,
2950 };
2951
2952 static int __init ublk_init(void)
2953 {
2954         int ret;
2955
2956         BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
2957                         UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
2958
2959         init_waitqueue_head(&ublk_idr_wq);
2960
2961         ret = misc_register(&ublk_misc);
2962         if (ret)
2963                 return ret;
2964
2965         ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
2966         if (ret)
2967                 goto unregister_mis;
2968
2969         ret = class_register(&ublk_chr_class);
2970         if (ret)
2971                 goto free_chrdev_region;
2972
2973         return 0;
2974
2975 free_chrdev_region:
2976         unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
2977 unregister_mis:
2978         misc_deregister(&ublk_misc);
2979         return ret;
2980 }
2981
2982 static void __exit ublk_exit(void)
2983 {
2984         struct ublk_device *ub;
2985         int id;
2986
2987         idr_for_each_entry(&ublk_index_idr, ub, id)
2988                 ublk_remove(ub);
2989
2990         class_unregister(&ublk_chr_class);
2991         misc_deregister(&ublk_misc);
2992
2993         idr_destroy(&ublk_index_idr);
2994         unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
2995 }
2996
2997 module_init(ublk_init);
2998 module_exit(ublk_exit);
2999
3000 static int ublk_set_max_ublks(const char *buf, const struct kernel_param *kp)
3001 {
3002         return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS);
3003 }
3004
3005 static int ublk_get_max_ublks(char *buf, const struct kernel_param *kp)
3006 {
3007         return sysfs_emit(buf, "%u\n", ublks_max);
3008 }
3009
3010 static const struct kernel_param_ops ublk_max_ublks_ops = {
3011         .set = ublk_set_max_ublks,
3012         .get = ublk_get_max_ublks,
3013 };
3014
3015 module_param_cb(ublks_max, &ublk_max_ublks_ops, &ublks_max, 0644);
3016 MODULE_PARM_DESC(ublks_max, "max number of ublk devices allowed to be added (default: 64)");
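/*
 * Hedged usage note: assuming the module is built as ublk_drv, the limit
 * can be set at load time ("modprobe ublk_drv ublks_max=128") or changed
 * later via /sys/module/ublk_drv/parameters/ublks_max, within the
 * 0..UBLK_MAX_UBLKS range enforced above.
 */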
3017
3018 MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
3019 MODULE_LICENSE("GPL");