ublk: add read()/write() support for ublk char device
author Ming Lei <ming.lei@redhat.com>
Fri, 19 May 2023 06:50:29 +0000 (14:50 +0800)
committer Jens Axboe <axboe@kernel.dk>
Sat, 20 May 2023 01:59:17 +0000 (19:59 -0600)
Support pread()/pwrite() on the ublk char device for reading/writing the
request io buffer, so that the data copy between the io request buffer and
the userspace buffer can be moved from the ublk driver to the ublk server.
UBLK_F_NEED_GET_DATA then becomes unnecessary: the ublk server can allocate
its buffer without one extra round of uring command communication just for
userspace to provide the buffer.

The IO buffer is located through iocb->ki_pos, which encodes the buffer
offset, the io tag and the queue id. iocb->ki_pos is 64 bits wide, so it is
big enough to hold a reasonable queue depth, nr_queues and max io buffer
size.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20230519065030.351216-7-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
drivers/block/ublk_drv.c
include/uapi/linux/ublk_cmd.h

index 13523c37a165d64eea629b81599d516a6063dbda..ec40ac4f9af3c8ac9a487fa6668a653296cb40ff 100644 (file)
@@ -207,6 +207,23 @@ static unsigned int ublks_added;   /* protected by ublk_ctl_mutex */
 
 static struct miscdevice ublk_misc;
 
+/*
+ * Decode the hardware queue id from a ublk char device file position.
+ * @pos is first rebased against UBLKSRV_IO_BUF_OFFSET; the queue id is the
+ * field selected by UBLK_QID_BITS_MASK at bit UBLK_QID_OFF of the result.
+ */
+static inline unsigned ublk_pos_to_hwq(loff_t pos)
+{
+       loff_t rel = pos - UBLKSRV_IO_BUF_OFFSET;
+
+       return (rel >> UBLK_QID_OFF) & UBLK_QID_BITS_MASK;
+}
+
+/*
+ * Decode the offset inside a request's IO buffer from a ublk char device
+ * file position: the bits selected by UBLK_IO_BUF_BITS_MASK once @pos has
+ * been rebased against UBLKSRV_IO_BUF_OFFSET.
+ */
+static inline unsigned ublk_pos_to_buf_off(loff_t pos)
+{
+       loff_t rel = pos - UBLKSRV_IO_BUF_OFFSET;
+
+       return rel & UBLK_IO_BUF_BITS_MASK;
+}
+
+/*
+ * Decode the io tag from a ublk char device file position.
+ * @pos is first rebased against UBLKSRV_IO_BUF_OFFSET; the tag is the
+ * field selected by UBLK_TAG_BITS_MASK at bit UBLK_TAG_OFF of the result.
+ */
+static inline unsigned ublk_pos_to_tag(loff_t pos)
+{
+       loff_t rel = pos - UBLKSRV_IO_BUF_OFFSET;
+
+       return (rel >> UBLK_TAG_OFF) & UBLK_TAG_BITS_MASK;
+}
+
 static void ublk_dev_param_basic_apply(struct ublk_device *ub)
 {
        struct request_queue *q = ub->ub_disk->queue;
@@ -1429,6 +1446,36 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
        return -EIOCBQUEUED;
 }
 
+/*
+ * Look up the request behind (@ubq, @tag) and take a reference on it.
+ *
+ * Returns NULL when the queue does not use request references, when no
+ * request is mapped to @tag, when the reference cannot be taken, or when
+ * the referenced request is not started, no longer carries @tag, has no
+ * data, or @offset lies beyond its byte count.  On success the returned
+ * request holds one reference that has to be released with
+ * ublk_put_req_ref().
+ */
+static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
+               struct ublk_queue *ubq, int tag, size_t offset)
+{
+       struct request *rq;
+
+       if (!ublk_need_req_ref(ubq))
+               return NULL;
+
+       rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
+       if (!rq || !ublk_get_req_ref(ubq, rq))
+               return NULL;
+
+       /* from here on the reference is held and must be dropped on failure */
+       if (unlikely(!blk_mq_request_started(rq) || rq->tag != tag) ||
+           !ublk_rq_has_data(rq) || offset > blk_rq_bytes(rq)) {
+               ublk_put_req_ref(ubq, rq);
+               return NULL;
+       }
+
+       return rq;
+}
+
 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
 {
        /*
@@ -1446,11 +1493,112 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
        return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd);
 }
 
+/*
+ * Tell whether a user buffer copy in direction @ubuf_dir is allowed for
+ * @req: READ requests only accept ITER_SOURCE, WRITE requests only accept
+ * ITER_DEST, and every other request operation is rejected.
+ */
+static inline bool ublk_check_ubuf_dir(const struct request *req,
+               int ubuf_dir)
+{
+       switch (req_op(req)) {
+       case REQ_OP_READ:
+               /* copy ubuf to request pages */
+               return ubuf_dir == ITER_SOURCE;
+       case REQ_OP_WRITE:
+               /* copy request pages to ubuf */
+               return ubuf_dir == ITER_DEST;
+       default:
+               return false;
+       }
+}
+
+/*
+ * Resolve the request addressed by a read/write on the ublk char device.
+ *
+ * iocb->ki_pos is decoded into queue id, tag and buffer offset.  Returns
+ * the referenced request and stores the buffer offset in @off on success.
+ * Returns ERR_PTR(-EACCES) when the file has no device attached, @iter is
+ * not user backed, the device is dead, the request lacks a hardware
+ * context with driver data, or @dir does not match the request operation.
+ * Returns ERR_PTR(-EINVAL) when the queue id or tag is out of range or no
+ * request can be referenced.
+ */
+static struct request *ublk_check_and_get_req(struct kiocb *iocb,
+               struct iov_iter *iter, size_t *off, int dir)
+{
+       struct ublk_device *ub = iocb->ki_filp->private_data;
+       const loff_t pos = iocb->ki_pos;
+       struct ublk_queue *ubq;
+       struct request *rq;
+       size_t offset;
+       u16 tag, q_id;
+
+       if (!ub || !user_backed_iter(iter) ||
+           ub->dev_info.state == UBLK_S_DEV_DEAD)
+               return ERR_PTR(-EACCES);
+
+       q_id = ublk_pos_to_hwq(pos);
+       if (q_id >= ub->dev_info.nr_hw_queues)
+               return ERR_PTR(-EINVAL);
+
+       ubq = ublk_get_queue(ub, q_id);
+       if (!ubq)
+               return ERR_PTR(-EINVAL);
+
+       tag = ublk_pos_to_tag(pos);
+       if (tag >= ubq->q_depth)
+               return ERR_PTR(-EINVAL);
+
+       offset = ublk_pos_to_buf_off(pos);
+       rq = __ublk_check_and_get_req(ub, ubq, tag, offset);
+       if (!rq)
+               return ERR_PTR(-EINVAL);
+
+       /* the request reference is held now; release it if a check fails */
+       if (!rq->mq_hctx || !rq->mq_hctx->driver_data ||
+           !ublk_check_ubuf_dir(rq, dir)) {
+               ublk_put_req_ref(ubq, rq);
+               return ERR_PTR(-EACCES);
+       }
+
+       *off = offset;
+       return rq;
+}
+
+/*
+ * ->read_iter handler of the ublk char device: copy the pages of the
+ * request addressed by iocb->ki_pos into the user buffer behind @to.
+ * Returns the result of ublk_copy_user_pages(), or the negative errno
+ * reported by ublk_check_and_get_req() when no request can be used.
+ */
+static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+       size_t offset, copied;
+       struct request *rq;
+
+       rq = ublk_check_and_get_req(iocb, to, &offset, ITER_DEST);
+       if (IS_ERR(rq))
+               return PTR_ERR(rq);
+
+       copied = ublk_copy_user_pages(rq, offset, to, ITER_DEST);
+
+       /* release the reference obtained by ublk_check_and_get_req() */
+       ublk_put_req_ref(rq->mq_hctx->driver_data, rq);
+
+       return copied;
+}
+
+/*
+ * ->write_iter handler of the ublk char device: copy the user buffer
+ * behind @from into the pages of the request addressed by iocb->ki_pos.
+ * Returns the result of ublk_copy_user_pages(), or the negative errno
+ * reported by ublk_check_and_get_req() when no request can be used.
+ */
+static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+       size_t offset, copied;
+       struct request *rq;
+
+       rq = ublk_check_and_get_req(iocb, from, &offset, ITER_SOURCE);
+       if (IS_ERR(rq))
+               return PTR_ERR(rq);
+
+       copied = ublk_copy_user_pages(rq, offset, from, ITER_SOURCE);
+
+       /* release the reference obtained by ublk_check_and_get_req() */
+       ublk_put_req_ref(rq->mq_hctx->driver_data, rq);
+
+       return copied;
+}
+
 static const struct file_operations ublk_ch_fops = {
        .owner = THIS_MODULE,
        .open = ublk_ch_open,
        .release = ublk_ch_release,
        .llseek = no_llseek,
+       .read_iter = ublk_ch_read_iter,         /* request pages -> user buffer */
+       .write_iter = ublk_ch_write_iter,       /* user buffer -> request pages */
        .uring_cmd = ublk_ch_uring_cmd,
        .mmap = ublk_ch_mmap,
 };
@@ -2362,6 +2510,9 @@ static int __init ublk_init(void)
 {
        int ret;
 
+       BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
+                       UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
+
        init_waitqueue_head(&ublk_idr_wq);
 
        ret = misc_register(&ublk_misc);
index 640bf687b94a9e235751956cf36d9c1210258d8a..c0c1632c671e51de32674b94fe8a28ce3d6b7bb2 100644 (file)
 #define UBLKSRV_CMD_BUF_OFFSET 0
 #define UBLKSRV_IO_BUF_OFFSET  0x80000000
 
-/* tag bit is 12bit, so at most 4096 IOs for each queue */
+/* the tag field is 16 bits wide, but so far at most 4096 IOs are allowed for each queue */
 #define UBLK_MAX_QUEUE_DEPTH   4096
 
+/*
+ * Fields of the file offset used to address a request's IO buffer, counted
+ * from UBLKSRV_IO_BUF_OFFSET.  The lowest field is the offset inside the
+ * buffer: 25 bits, so a single IO buffer is at most 32MB.
+ */
+#define UBLK_IO_BUF_OFF                0
+#define UBLK_IO_BUF_BITS       25
+#define UBLK_IO_BUF_BITS_MASK  ((1ULL << UBLK_IO_BUF_BITS) - 1)
+
+/* the tag field sits right above the buffer offset: 16 bits, so at most 64K IOs for each queue */
+#define UBLK_TAG_OFF           UBLK_IO_BUF_BITS
+#define UBLK_TAG_BITS          16
+#define UBLK_TAG_BITS_MASK     ((1ULL << UBLK_TAG_BITS) - 1)
+
+/* the queue id field sits right above the tag: 12 bits, so at most 4096 queues */
+#define UBLK_QID_OFF           (UBLK_TAG_OFF + UBLK_TAG_BITS)
+#define UBLK_QID_BITS          12
+#define UBLK_QID_BITS_MASK     ((1ULL << UBLK_QID_BITS) - 1)
+
+#define UBLK_MAX_NR_QUEUES     (1U << UBLK_QID_BITS)
+
+#define UBLKSRV_IO_BUF_TOTAL_BITS      (UBLK_QID_OFF + UBLK_QID_BITS)  /* 25 + 16 + 12 = 53 */
+#define UBLKSRV_IO_BUF_TOTAL_SIZE      (1ULL << UBLKSRV_IO_BUF_TOTAL_BITS)
+
 /*
  * zero copy requires 4k block size, and can remap ublk driver's io
  * request into ublksrv's vm space