#include <asm/page.h>
#include <linux/task_work.h>
#include <linux/namei.h>
+#include <linux/kref.h>
#include <uapi/linux/ublk_cmd.h>
#define UBLK_MINORS (1U << MINORBITS)
struct ublk_rq_data {
struct llist_node node;
- struct callback_head work;
+ /* per-request reference; the final kref_put() runs ublk_complete_rq() */
+ struct kref ref;
};
struct ublk_uring_cmd_pdu {
unsigned long io_addr; /* mapped vm address */
unsigned int max_io_sz;
bool force_abort;
+ bool timeout;
unsigned short nr_io_ready; /* how many ios setup */
struct ublk_device *dev;
struct ublk_io ios[];
__u32 types;
};
+static inline void __ublk_complete_rq(struct request *req);
+static void ublk_complete_rq(struct kref *ref);
+
static dev_t ublk_chr_devt;
static struct class *ublk_chr_class;
return 0;
}
-static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq)
+static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
{
- if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) &&
- !(ubq->flags & UBLK_F_URING_CMD_COMP_IN_TASK))
- return true;
return false; /* always off: init is a no-op, get succeeds, put completes the request directly */
}
+/*
+ * Initialise the kref kept in the request's PDU. Queues for which
+ * ublk_need_req_ref() is false are left untouched.
+ */
+static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
+		struct request *req)
+{
+	struct ublk_rq_data *pdu;
+
+	if (!ublk_need_req_ref(ubq))
+		return;
+
+	pdu = blk_mq_rq_to_pdu(req);
+	kref_init(&pdu->ref);
+}
+
+/*
+ * Try to take an extra reference on the request.
+ *
+ * Returns true when no reference counting is needed for this queue, and
+ * otherwise the result of kref_get_unless_zero() on the request's kref.
+ */
+static inline bool ublk_get_req_ref(const struct ublk_queue *ubq,
+		struct request *req)
+{
+	struct ublk_rq_data *pdu;
+
+	if (!ublk_need_req_ref(ubq))
+		return true;
+
+	pdu = blk_mq_rq_to_pdu(req);
+	return kref_get_unless_zero(&pdu->ref);
+}
+
+/*
+ * Drop one reference on the request. Without reference counting the
+ * request is completed right away; with it, completion happens through
+ * ublk_complete_rq() once the kref drops to zero.
+ */
+static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
+		struct request *req)
+{
+	struct ublk_rq_data *pdu;
+
+	if (!ublk_need_req_ref(ubq)) {
+		__ublk_complete_rq(req);
+		return;
+	}
+
+	pdu = blk_mq_rq_to_pdu(req);
+	kref_put(&pdu->ref, ublk_complete_rq);
+}
+
static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
{
return ubq->flags & UBLK_F_NEED_GET_DATA;
#define UBLK_MAX_PIN_PAGES 32
-struct ublk_map_data {
- const struct request *rq;
- unsigned long ubuf;
- unsigned int len;
-};
-
struct ublk_io_iter {
struct page *pages[UBLK_MAX_PIN_PAGES];
- unsigned pg_off; /* offset in the 1st page in pages */
- int nr_pages; /* how many page pointers in pages */
struct bio *bio;
struct bvec_iter iter;
};
-static inline unsigned ublk_copy_io_pages(struct ublk_io_iter *data,
- unsigned max_bytes, bool to_vm)
+/*
+ * Copy up to 'total' bytes between the pages in data->pages, starting at
+ * byte offset 'pg_off' of the first page, and the bio segments tracked by
+ * data->bio / data->iter. With dir == ITER_DEST the bio segment is copied
+ * into the page; any other value copies the page into the bio segment.
+ * Nothing is returned.
+ */
+static void ublk_copy_io_pages(struct ublk_io_iter *data,
+ size_t total, size_t pg_off, int dir)
{
- const unsigned total = min_t(unsigned, max_bytes,
- PAGE_SIZE - data->pg_off +
- ((data->nr_pages - 1) << PAGE_SHIFT));
unsigned done = 0;
unsigned pg_idx = 0;
while (done < total) {
struct bio_vec bv = bio_iter_iovec(data->bio, data->iter);
- const unsigned int bytes = min3(bv.bv_len, total - done,
- (unsigned)(PAGE_SIZE - data->pg_off));
+ unsigned int bytes = min3(bv.bv_len, (unsigned)total - done,
+ (unsigned)(PAGE_SIZE - pg_off));
void *bv_buf = bvec_kmap_local(&bv);
void *pg_buf = kmap_local_page(data->pages[pg_idx]);
- if (to_vm)
- memcpy(pg_buf + data->pg_off, bv_buf, bytes);
+ if (dir == ITER_DEST)
+ memcpy(pg_buf + pg_off, bv_buf, bytes);
else
- memcpy(bv_buf, pg_buf + data->pg_off, bytes);
+ memcpy(bv_buf, pg_buf + pg_off, bytes);
kunmap_local(pg_buf);
kunmap_local(bv_buf);
/* advance page array */
- data->pg_off += bytes;
- if (data->pg_off == PAGE_SIZE) {
+ pg_off += bytes;
+ if (pg_off == PAGE_SIZE) {
pg_idx += 1;
- data->pg_off = 0;
+ pg_off = 0;
}
done += bytes;
data->iter = data->bio->bi_iter;
}
}
-
- return done;
}
-static int ublk_copy_user_pages(struct ublk_map_data *data, bool to_vm)
+/*
+ * Copy data between the request's bio chain and the user buffer described
+ * by 'uiter'. 'dir' is passed through to ublk_copy_io_pages() and also
+ * decides whether the user pages are marked dirty (dir == ITER_DEST).
+ *
+ * User pages are taken in batches of at most UBLK_MAX_PIN_PAGES and are
+ * released again after each batch has been copied.
+ *
+ * Returns the summed length of the batches handed to ublk_copy_io_pages().
+ * The loop stops when 'uiter' is exhausted, when the bio chain ends, or
+ * when iov_iter_get_pages2() returns zero or an error.
+ */
+static size_t ublk_copy_user_pages(const struct request *req,
+ struct iov_iter *uiter, int dir)
{
- const unsigned int gup_flags = to_vm ? FOLL_WRITE : 0;
- const unsigned long start_vm = data->ubuf;
- unsigned int done = 0;
struct ublk_io_iter iter = {
- .pg_off = start_vm & (PAGE_SIZE - 1),
- .bio = data->rq->bio,
- .iter = data->rq->bio->bi_iter,
+ .bio = req->bio,
+ .iter = req->bio->bi_iter,
};
- const unsigned int nr_pages = round_up(data->len +
- (start_vm & (PAGE_SIZE - 1)), PAGE_SIZE) >> PAGE_SHIFT;
-
- while (done < nr_pages) {
- const unsigned to_pin = min_t(unsigned, UBLK_MAX_PIN_PAGES,
- nr_pages - done);
- unsigned i, len;
-
- iter.nr_pages = get_user_pages_fast(start_vm +
- (done << PAGE_SHIFT), to_pin, gup_flags,
- iter.pages);
- if (iter.nr_pages <= 0)
- return done == 0 ? iter.nr_pages : done;
- len = ublk_copy_io_pages(&iter, data->len, to_vm);
- for (i = 0; i < iter.nr_pages; i++) {
- if (to_vm)
+ size_t done = 0;
+
+ while (iov_iter_count(uiter) && iter.bio) {
+ unsigned nr_pages;
+ /*
+ * Must be signed: iov_iter_get_pages2() returns ssize_t and
+ * reports failure as a negative value. With an unsigned type
+ * the 'len <= 0' check below would only catch zero and an
+ * error code would be used as a huge copy length.
+ */
+ ssize_t len;
+ size_t off;
+ int i;
+
+ len = iov_iter_get_pages2(uiter, iter.pages,
+ iov_iter_count(uiter),
+ UBLK_MAX_PIN_PAGES, &off);
+ if (len <= 0)
+ return done;
+
+ ublk_copy_io_pages(&iter, len, off, dir);
+ nr_pages = DIV_ROUND_UP(len + off, PAGE_SIZE);
+ for (i = 0; i < nr_pages; i++) {
+ if (dir == ITER_DEST)
set_page_dirty(iter.pages[i]);
put_page(iter.pages[i]);
}
- data->len -= len;
- done += iter.nr_pages;
+ done += len;
}
return done;
* context is pretty fast, see ublk_pin_user_pages
*/
if (ublk_need_map_req(req)) {
- struct ublk_map_data data = {
- .rq = req,
- .ubuf = io->addr,
- .len = rq_bytes,
- };
+ struct iov_iter iter;
+ struct iovec iov;
+ const int dir = ITER_DEST;
- ublk_copy_user_pages(&data, true);
+ import_single_range(dir, u64_to_user_ptr(io->addr), rq_bytes,
+ &iov, &iter);
- return rq_bytes - data.len;
+ return ublk_copy_user_pages(req, &iter, dir);
}
return rq_bytes;
}
const unsigned int rq_bytes = blk_rq_bytes(req);
if (ublk_need_unmap_req(req)) {
- struct ublk_map_data data = {
- .rq = req,
- .ubuf = io->addr,
- .len = io->res,
- };
+ struct iov_iter iter;
+ struct iovec iov;
+ const int dir = ITER_SOURCE;
WARN_ON_ONCE(io->res > rq_bytes);
- ublk_copy_user_pages(&data, false);
-
- return io->res - data.len;
+ import_single_range(dir, u64_to_user_ptr(io->addr), io->res,
+ &iov, &iter);
+ return ublk_copy_user_pages(req, &iter, dir);
}
return rq_bytes;
}
}
/* todo: handle partial completion */
-static void ublk_complete_rq(struct request *req)
+static inline void __ublk_complete_rq(struct request *req)
{
struct ublk_queue *ubq = req->mq_hctx->driver_data;
struct ublk_io *io = &ubq->ios[req->tag];
unsigned int unmapped_bytes;
blk_status_t res = BLK_STS_OK;
+ /* called from ublk_abort_queue() code path */
+ if (io->flags & UBLK_IO_FLAG_ABORTED) {
+ res = BLK_STS_IOERR;
+ goto exit;
+ }
+
/* failed read IO if nothing is read */
if (!io->res && req_op(req) == REQ_OP_READ)
io->res = -EIO;
blk_mq_end_request(req, res);
}
+/*
+ * kref release callback: map the embedded kref back to the request that
+ * owns it and complete that request.
+ */
+static void ublk_complete_rq(struct kref *ref)
+{
+	struct ublk_rq_data *pdu;
+
+	pdu = container_of(ref, struct ublk_rq_data, ref);
+	__ublk_complete_rq(blk_mq_rq_from_pdu(pdu));
+}
+
/*
* Since __ublk_rq_task_work always fails requests immediately during
* exiting, __ublk_fail_req() is only called from abort context during
if (ublk_queue_can_use_recovery_reissue(ubq))
blk_mq_requeue_request(req, false);
else
- blk_mq_end_request(req, BLK_STS_IOERR);
+ ublk_put_req_ref(ubq, req);
}
}
mapped_bytes >> 9;
}
+ ublk_init_req_ref(ubq, req);
ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags);
}
ublk_forward_io_cmds(ubq, issue_flags);
}
-static void ublk_rq_task_work_fn(struct callback_head *work)
-{
- struct ublk_rq_data *data = container_of(work,
- struct ublk_rq_data, work);
- struct request *req = blk_mq_rq_from_pdu(data);
- struct ublk_queue *ubq = req->mq_hctx->driver_data;
- unsigned issue_flags = IO_URING_F_UNLOCKED;
-
- ublk_forward_io_cmds(ubq, issue_flags);
-}
-
static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
{
struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
*/
if (unlikely(io->flags & UBLK_IO_FLAG_ABORTED)) {
ublk_abort_io_cmds(ubq);
- } else if (ublk_can_use_task_work(ubq)) {
- if (task_work_add(ubq->ubq_daemon, &data->work,
- TWA_SIGNAL_NO_IPI))
- ublk_abort_io_cmds(ubq);
} else {
struct io_uring_cmd *cmd = io->cmd;
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
}
}
+/*
+ * Request timeout handler installed in ublk_mq_ops.
+ *
+ * Queues without UBLK_F_UNPRIVILEGED_DEV just get the timer re-armed
+ * (BLK_EH_RESET_TIMER). For an unprivileged queue the daemon task is sent
+ * SIGKILL on the first timeout only, and BLK_EH_DONE is returned.
+ */
+static enum blk_eh_timer_return ublk_timeout(struct request *rq)
+{
+	struct ublk_queue *q = rq->mq_hctx->driver_data;
+
+	if (!(q->flags & UBLK_F_UNPRIVILEGED_DEV))
+		return BLK_EH_RESET_TIMER;
+
+	/* ->timeout records that the signal was already sent for this queue */
+	if (!q->timeout) {
+		send_sig(SIGKILL, q->ubq_daemon, 0);
+		q->timeout = true;
+	}
+
+	return BLK_EH_DONE;
+}
+
static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
return 0;
}
-static int ublk_init_rq(struct blk_mq_tag_set *set, struct request *req,
- unsigned int hctx_idx, unsigned int numa_node)
-{
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
-
- init_task_work(&data->work, ublk_rq_task_work_fn);
- return 0;
-}
-
static const struct blk_mq_ops ublk_mq_ops = {
.queue_rq = ublk_queue_rq,
.init_hctx = ublk_init_hctx,
- .init_request = ublk_init_rq,
+ .timeout = ublk_timeout,
};
static int ublk_ch_open(struct inode *inode, struct file *filp)
}
static void ublk_commit_completion(struct ublk_device *ub,
- struct ublksrv_io_cmd *ub_cmd)
+ const struct ublksrv_io_cmd *ub_cmd)
{
u32 qid = ub_cmd->q_id, tag = ub_cmd->tag;
struct ublk_queue *ubq = ublk_get_queue(ub, qid);
req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
if (req && likely(!blk_should_fake_timeout(req->q)))
- ublk_complete_rq(req);
+ ublk_put_req_ref(ubq, req);
}
/*
{
u32 ioc_type = _IOC_TYPE(cmd_op);
- if (IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u')
+ if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u')
return -EOPNOTSUPP;
if (ioc_type != 'u' && ioc_type != 0)
return 0;
}
+/*
+ * Record a newly issued uring command in its io slot: remember the
+ * command and the buffer address, and flag the slot as active.
+ */
+static inline void ublk_fill_io_cmd(struct ublk_io *io,
+		struct io_uring_cmd *cmd, unsigned long buf_addr)
+{
+	io->addr = buf_addr;
+	io->cmd = cmd;
+	io->flags |= UBLK_IO_FLAG_ACTIVE;
+}
+
static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
unsigned int issue_flags,
- struct ublksrv_io_cmd *ub_cmd)
+ const struct ublksrv_io_cmd *ub_cmd)
{
struct ublk_device *ub = cmd->file->private_data;
struct ublk_queue *ubq;
/* FETCH_RQ has to provide IO buffer if NEED GET DATA is not enabled */
if (!ub_cmd->addr && !ublk_need_get_data(ubq))
goto out;
- io->cmd = cmd;
- io->flags |= UBLK_IO_FLAG_ACTIVE;
- io->addr = ub_cmd->addr;
+ ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
ublk_mark_io_ready(ub, ubq);
break;
case UBLK_IO_COMMIT_AND_FETCH_REQ:
goto out;
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
goto out;
- io->addr = ub_cmd->addr;
- io->flags |= UBLK_IO_FLAG_ACTIVE;
- io->cmd = cmd;
+ ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
ublk_commit_completion(ub, ub_cmd);
break;
case UBLK_IO_NEED_GET_DATA:
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
goto out;
- io->addr = ub_cmd->addr;
- io->cmd = cmd;
- io->flags |= UBLK_IO_FLAG_ACTIVE;
+ ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag);
break;
default:
static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
- struct ublksrv_io_cmd *ub_src = (struct ublksrv_io_cmd *) cmd->cmd;
- struct ublksrv_io_cmd ub_cmd;
-
/*
* Not necessary for async retry, but let's keep it simple and always
* copy the values to avoid any potential reuse.
*
* Each field of the command payload is read exactly once with
* READ_ONCE() into the local ub_cmd, and only that local copy is
* passed on to __ublk_ch_uring_cmd().
*/
- ub_cmd.q_id = READ_ONCE(ub_src->q_id);
- ub_cmd.tag = READ_ONCE(ub_src->tag);
- ub_cmd.result = READ_ONCE(ub_src->result);
- ub_cmd.addr = READ_ONCE(ub_src->addr);
+ const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe);
+ const struct ublksrv_io_cmd ub_cmd = {
+ .q_id = READ_ONCE(ub_src->q_id),
+ .tag = READ_ONCE(ub_src->tag),
+ .result = READ_ONCE(ub_src->result),
+ .addr = READ_ONCE(ub_src->addr)
+ };
return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd);
}
static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
{
- struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+ const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
int ublksrv_pid = (int)header->data[0];
struct gendisk *disk;
int ret = -EINVAL;
static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
struct io_uring_cmd *cmd)
{
- struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+ const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
void __user *argp = (void __user *)(unsigned long)header->addr;
cpumask_var_t cpumask;
unsigned long queue;
static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
{
- struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+ const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
void __user *argp = (void __user *)(unsigned long)header->addr;
struct ublksrv_ctrl_dev_info info;
struct ublk_device *ub;
else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
return -EPERM;
+ /*
+ * unprivileged device can't be trusted, but RECOVERY and
+ * RECOVERY_REISSUE still may hang error handling, so can't
+ * support recovery features for unprivileged ublk now
+ *
+ * TODO: provide forward progress for RECOVERY handler, so that
+ * unprivileged device can benefit from it
+ */
+ if (info.flags & UBLK_F_UNPRIVILEGED_DEV)
+ info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE |
+ UBLK_F_USER_RECOVERY);
+
/* the created device is always owned by current user */
ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid);
*/
ub->dev_info.flags &= UBLK_F_ALL;
- if (!IS_BUILTIN(CONFIG_BLK_DEV_UBLK))
- ub->dev_info.flags |= UBLK_F_URING_CMD_COMP_IN_TASK;
-
- ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE;
+ ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
+ UBLK_F_URING_CMD_COMP_IN_TASK;
/* We are not ready to support zero copy */
ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd)
{
- struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+ const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
__func__, cmd->cmd_op, header->dev_id, header->queue_id,
static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
struct io_uring_cmd *cmd)
{
- struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+ const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
void __user *argp = (void __user *)(unsigned long)header->addr;
if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
static int ublk_ctrl_get_params(struct ublk_device *ub,
struct io_uring_cmd *cmd)
{
- struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+ const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
void __user *argp = (void __user *)(unsigned long)header->addr;
struct ublk_params_header ph;
int ret;
static int ublk_ctrl_set_params(struct ublk_device *ub,
struct io_uring_cmd *cmd)
{
- struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+ const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
void __user *argp = (void __user *)(unsigned long)header->addr;
struct ublk_params_header ph;
int ret = -EFAULT;
put_task_struct(ubq->ubq_daemon);
/* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */
ubq->ubq_daemon = NULL;
+ ubq->timeout = false;
for (i = 0; i < ubq->q_depth; i++) {
struct ublk_io *io = &ubq->ios[i];
static int ublk_ctrl_start_recovery(struct ublk_device *ub,
struct io_uring_cmd *cmd)
{
- struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+ const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
int ret = -EINVAL;
int i;
static int ublk_ctrl_end_recovery(struct ublk_device *ub,
struct io_uring_cmd *cmd)
{
- struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+ const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
int ublksrv_pid = (int)header->data[0];
int ret = -EINVAL;
static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
struct io_uring_cmd *cmd)
{
- struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+ struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)io_uring_sqe_cmd(cmd->sqe);
bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
void __user *argp = (void __user *)(unsigned long)header->addr;
char *dev_path = NULL;
static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
unsigned int issue_flags)
{
- struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+ const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
struct ublk_device *ub = NULL;
u32 cmd_op = cmd->cmd_op;
int ret = -EINVAL;