[linux-block.git] / io_uring / timeout.c

// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/io_uring.h>

#include <trace/events/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring_types.h"
#include "io_uring.h"
#include "refs.h"
#include "timeout.h"

/*
 * Command data of a timeout request handled in this file; obtained from the
 * request with io_kiocb_to_cmd().
 */
struct io_timeout {
	struct file			*file;
	/* event count taken from sqe->off; zero means the sequence is not used */
	u32				off;
	/* sequence (tail + off) at which io_flush_timeouts() completes the request */
	u32				target_seq;
	/* entry in ctx->timeout_list or ctx->ltimeout_list */
	struct list_head		list;
	/* head of the link, used by linked timeouts only */
	struct io_kiocb			*head;
	/* for linked completions */
	struct io_kiocb			*prev;
};

/* Command data of a timeout remove/update request, see io_timeout_remove(). */
struct io_timeout_rem {
	struct file			*file;
	/* user_data of the timeout to remove or update, read from sqe->addr */
	u64				addr;

	/* timeout update */
	struct timespec64		ts;
	/* sqe->timeout_flags, checked in io_timeout_remove_prep() */
	u32				flags;
	/* set when IORING_LINK_TIMEOUT_UPDATE was passed in the flags */
	bool				ltimeout;
};

/* True when @req carries no event count, i.e. it is a purely timed timeout. */
static inline bool io_is_timeout_noseq(struct io_kiocb *req)
{
	struct io_timeout *cmd = io_kiocb_to_cmd(req);

	return cmd->off == 0;
}

/*
 * Drop one reference on @req. When that was the last one, call
 * io_queue_next() and then io_free_req() on it.
 */
static inline void io_put_req(struct io_kiocb *req)
{
	if (!req_ref_put_and_test(req))
		return;

	io_queue_next(req);
	io_free_req(req);
}

/*
 * Try to cancel the timer behind @req. Unless the timer callback is running
 * right now, account the timeout in cq_timeouts, take it off its list and
 * queue it for posting with @status. A non-zero @status also marks the
 * request as failed.
 */
static void io_kill_timeout(struct io_kiocb *req, int status)
	__must_hold(&req->ctx->completion_lock)
	__must_hold(&req->ctx->timeout_lock)
{
	struct io_timeout_data *data = req->async_data;
	struct io_timeout *cmd;

	/* -1: the callback is executing, nothing to do from here */
	if (hrtimer_try_to_cancel(&data->timer) == -1)
		return;

	cmd = io_kiocb_to_cmd(req);
	if (status)
		req_set_fail(req);
	atomic_set(&req->ctx->cq_timeouts,
		atomic_read(&req->ctx->cq_timeouts) + 1);
	list_del_init(&cmd->list);
	io_req_tw_post_queue(req, status, 0);
}

/*
 * Complete the sequence-based timeouts at the front of ctx->timeout_list
 * whose target sequence has been reached, then remember the sequence at
 * which this flush ran in ctx->cq_last_tm_flush.
 */
__cold void io_flush_timeouts(struct io_ring_ctx *ctx)
	__must_hold(&ctx->completion_lock)
{
	/* current sequence: cached CQ tail minus the count kept in cq_timeouts */
	u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
	struct io_timeout *timeout, *tmp;

	spin_lock_irq(&ctx->timeout_lock);
	/* _safe walk: io_kill_timeout() may unlink the current entry */
	list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) {
		struct io_kiocb *req = cmd_to_io_kiocb(timeout);
		u32 events_needed, events_got;

		/* the walk stops at the first timeout without an event count */
		if (io_is_timeout_noseq(req))
			break;

		/*
		 * Since seq can easily wrap around over time, subtract
		 * the last seq at which timeouts were flushed before comparing.
		 * Assuming not more than 2^31-1 events have happened since,
		 * these subtractions won't have wrapped, so we can check if
		 * target is in [last_seq, current_seq] by comparing the two.
		 */
		events_needed = timeout->target_seq - ctx->cq_last_tm_flush;
		events_got = seq - ctx->cq_last_tm_flush;
		if (events_got < events_needed)
			break;

		/* target reached: complete with a result of 0 */
		io_kill_timeout(req, 0);
	}
	ctx->cq_last_tm_flush = seq;
	spin_unlock_irq(&ctx->timeout_lock);
}

/*
 * Detach every request linked behind @req and complete each of them. A link
 * already marked REQ_F_FAIL keeps its own result, all others complete with
 * -ECANCELED. REQ_F_SKIP_LINK_CQES on @req decides whether the links get
 * REQ_F_CQE_SKIP set or cleared.
 */
static void io_fail_links(struct io_kiocb *req)
	__must_hold(&req->ctx->completion_lock)
{
	const bool skip_cqes = req->flags & REQ_F_SKIP_LINK_CQES;
	struct io_kiocb *cur, *next;

	cur = req->link;
	req->link = NULL;

	for (; cur; cur = next) {
		long res;

		if (cur->flags & REQ_F_FAIL)
			res = cur->cqe.res;
		else
			res = -ECANCELED;

		/* remember the successor before cutting the chain */
		next = cur->link;
		cur->link = NULL;

		trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data,
					req->opcode, cur);

		if (skip_cqes)
			cur->flags |= REQ_F_CQE_SKIP;
		else
			cur->flags &= ~REQ_F_CQE_SKIP;
		io_req_set_res(cur, res, 0);
		__io_req_complete_post(cur);
	}
}

/*
 * Unlink the request that directly follows @req in its link chain.
 * req->link is dereferenced unconditionally, so it must not be NULL.
 */
static inline void io_remove_next_linked(struct io_kiocb *req)
{
	struct io_kiocb *removed = req->link;

	req->link = removed->link;
	removed->link = NULL;
}

/*
 * Deal with what is linked behind @req: cancel its linked timeout, if any,
 * and fail the remaining links when @req failed and is not a hard link.
 *
 * Returns true if a linked timeout was queued for posting or if there were
 * links to fail.
 */
bool io_disarm_next(struct io_kiocb *req)
	__must_hold(&req->ctx->completion_lock)
{
	struct io_kiocb *link = NULL;
	bool posted = false;

	if (req->flags & REQ_F_ARM_LTIMEOUT) {
		/*
		 * REQ_F_ARM_LTIMEOUT is still set: the linked timeout is
		 * unlinked and cancelled without taking timeout_lock or
		 * touching its timer.
		 */
		link = req->link;
		req->flags &= ~REQ_F_ARM_LTIMEOUT;
		if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
			io_remove_next_linked(req);
			io_req_tw_post_queue(link, -ECANCELED, 0);
			posted = true;
		}
	} else if (req->flags & REQ_F_LINK_TIMEOUT) {
		struct io_ring_ctx *ctx = req->ctx;

		/* disarm under timeout_lock; NULL means nothing to cancel here */
		spin_lock_irq(&ctx->timeout_lock);
		link = io_disarm_linked_timeout(req);
		spin_unlock_irq(&ctx->timeout_lock);
		if (link) {
			posted = true;
			io_req_tw_post_queue(link, -ECANCELED, 0);
		}
	}
	/* a failed request takes its links down unless it is a hard link */
	if (unlikely((req->flags & REQ_F_FAIL) &&
		     !(req->flags & REQ_F_HARDLINK))) {
		posted |= (req->link != NULL);
		io_fail_links(req);
	}
	return posted;
}

/*
 * Unlink the linked timeout @link from behind @req and clear its back
 * reference to the link head.
 *
 * Returns @link, removed from its timeout list, when its timer could be
 * cancelled; returns NULL when the timer callback is currently running.
 */
struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
					    struct io_kiocb *link)
	__must_hold(&req->ctx->completion_lock)
	__must_hold(&req->ctx->timeout_lock)
{
	struct io_timeout_data *data = link->async_data;
	struct io_timeout *cmd = io_kiocb_to_cmd(link);

	io_remove_next_linked(req);
	cmd->head = NULL;

	/* -1: the callback is executing, leave the request to it */
	if (hrtimer_try_to_cancel(&data->timer) == -1)
		return NULL;

	list_del(&cmd->list);
	return link;
}

/*
 * hrtimer callback of a regular timeout: take the request off its list,
 * account it in cq_timeouts and complete it with -ETIME through task work.
 * The request is marked failed unless IORING_TIMEOUT_ETIME_SUCCESS was given.
 */
static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
{
	struct io_timeout_data *data = container_of(timer,
						struct io_timeout_data, timer);
	struct io_kiocb *req = data->req;
	struct io_timeout *timeout = io_kiocb_to_cmd(req);
	struct io_ring_ctx *ctx = req->ctx;
	unsigned long flags;

	/* irqsave variant: interrupt state is saved and restored around the lock */
	spin_lock_irqsave(&ctx->timeout_lock, flags);
	list_del_init(&timeout->list);
	atomic_set(&req->ctx->cq_timeouts,
		atomic_read(&req->ctx->cq_timeouts) + 1);
	spin_unlock_irqrestore(&ctx->timeout_lock, flags);

	if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
		req_set_fail(req);

	io_req_set_res(req, -ETIME, 0);
	/* the completion itself is done by io_req_task_complete() in task work */
	req->io_task_work.func = io_req_task_complete;
	io_req_task_work_add(req);
	return HRTIMER_NORESTART;
}

/*
 * Find the first timeout on ctx->timeout_list that matches @cd, cancel its
 * timer and remove it from the list.
 *
 * Returns the request on success, ERR_PTR(-ENOENT) if nothing matched and
 * ERR_PTR(-EALREADY) if the timer callback of the match is currently running.
 */
static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
					   struct io_cancel_data *cd)
	__must_hold(&ctx->timeout_lock)
{
	struct io_timeout *timeout;
	struct io_timeout_data *io;
	struct io_kiocb *req = NULL;

	list_for_each_entry(timeout, &ctx->timeout_list, list) {
		struct io_kiocb *tmp = cmd_to_io_kiocb(timeout);

		/* without IORING_ASYNC_CANCEL_ANY the user_data has to match */
		if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) &&
		    cd->data != tmp->cqe.user_data)
			continue;
		/*
		 * For ALL/ANY cancels, skip a request already stamped with this
		 * cancel sequence and stamp the one that gets picked.
		 */
		if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) {
			if (cd->seq == tmp->work.cancel_seq)
				continue;
			tmp->work.cancel_seq = cd->seq;
		}
		req = tmp;
		break;
	}
	if (!req)
		return ERR_PTR(-ENOENT);

	io = req->async_data;
	/* -1: the timer callback is running, the request stays on the list */
	if (hrtimer_try_to_cancel(&io->timer) == -1)
		return ERR_PTR(-EALREADY);
	timeout = io_kiocb_to_cmd(req);
	list_del_init(&timeout->list);
	return req;
}

/*
 * Cancel the timeout matching @cd. The match is extracted under
 * ctx->timeout_lock and then queued to fail with -ECANCELED.
 *
 * Returns 0 on success or the negative error from io_timeout_extract().
 */
int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)
	__must_hold(&ctx->completion_lock)
{
	struct io_kiocb *victim;

	spin_lock_irq(&ctx->timeout_lock);
	victim = io_timeout_extract(ctx, cd);
	spin_unlock_irq(&ctx->timeout_lock);

	if (!IS_ERR(victim)) {
		io_req_task_queue_fail(victim, -ECANCELED);
		return 0;
	}
	return PTR_ERR(victim);
}

/*
 * Task work run for a linked timeout that fired. If the timer callback
 * grabbed the request the timeout was linked to (timeout->prev), try to
 * cancel that request unless the task is exiting, and drop the reference
 * taken on it. The timeout completes with the cancel result, or with -ETIME
 * when the cancel returned 0 or there was no previous request.
 */
static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
{
	struct io_timeout *cmd = io_kiocb_to_cmd(req);
	struct io_kiocb *prev = cmd->prev;
	int res = -ETIME;

	if (prev) {
		int ret = -ENOENT;

		if (!(req->task->flags & PF_EXITING)) {
			struct io_cancel_data cd = {
				.ctx		= req->ctx,
				.data		= prev->cqe.user_data,
			};

			ret = io_try_cancel(req, &cd);
		}
		if (ret)
			res = ret;
	}

	io_req_set_res(req, res, 0);
	io_req_complete_post(req);
	if (prev)
		io_put_req(prev);
}

/*
 * hrtimer callback of a linked timeout: detach the timeout from the request
 * it guards, try to take a reference on that request and pass it on to
 * io_req_task_link_timeout() via timeout->prev.
 */
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
{
	struct io_timeout_data *data = container_of(timer,
						struct io_timeout_data, timer);
	struct io_kiocb *prev, *req = data->req;
	struct io_timeout *timeout = io_kiocb_to_cmd(req);
	struct io_ring_ctx *ctx = req->ctx;
	unsigned long flags;

	spin_lock_irqsave(&ctx->timeout_lock, flags);
	prev = timeout->head;
	timeout->head = NULL;

	/*
	 * We don't expect the list to be empty, that will only happen if we
	 * race with the completion of the linked work.
	 */
	if (prev) {
		io_remove_next_linked(prev);
		/* no reference could be taken: treat it as no previous request */
		if (!req_ref_inc_not_zero(prev))
			prev = NULL;
	}
	list_del(&timeout->list);
	timeout->prev = prev;
	spin_unlock_irqrestore(&ctx->timeout_lock, flags);

	/* the rest of the work is done from task work */
	req->io_task_work.func = io_req_task_link_timeout;
	io_req_task_work_add(req);
	return HRTIMER_NORESTART;
}

/*
 * Map the clock bits of the timeout flags to a clock id. CLOCK_MONOTONIC is
 * used when no clock flag is set, and also, with a one-time warning, for
 * any unexpected combination.
 */
static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
{
	switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
	case 0:
		break;
	case IORING_TIMEOUT_BOOTTIME:
		return CLOCK_BOOTTIME;
	case IORING_TIMEOUT_REALTIME:
		return CLOCK_REALTIME;
	default:
		/* can't happen, vetted at prep time */
		WARN_ON_ONCE(1);
		break;
	}

	return CLOCK_MONOTONIC;
}

/*
 * Re-arm the linked timeout on ctx->ltimeout_list whose user_data equals
 * @user_data, using the new expiry @ts and timer mode @mode.
 *
 * Returns 0 on success, -ENOENT if there is no such timeout and -EALREADY
 * if its timer callback is currently running.
 */
static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
				    struct timespec64 *ts, enum hrtimer_mode mode)
	__must_hold(&ctx->timeout_lock)
{
	struct io_kiocb *req = NULL;
	struct io_timeout_data *data;
	struct io_timeout *pos;

	list_for_each_entry(pos, &ctx->ltimeout_list, list) {
		struct io_kiocb *candidate = cmd_to_io_kiocb(pos);

		if (candidate->cqe.user_data != user_data)
			continue;
		req = candidate;
		break;
	}
	if (!req)
		return -ENOENT;

	data = req->async_data;
	if (hrtimer_try_to_cancel(&data->timer) == -1)
		return -EALREADY;

	/* set the timer up again from scratch and start it with the new expiry */
	hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
	data->timer.function = io_link_timeout_fn;
	hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
	return 0;
}

/*
 * Re-arm the regular timeout identified by @user_data with a new expiry.
 * The timeout is taken off ctx->timeout_list, turned into a pure timer
 * (off = 0), appended to the tail of the list and its hrtimer is restarted
 * with @ts and @mode.
 *
 * Returns 0 on success or the error from io_timeout_extract(): -ENOENT if
 * no timeout matched, -EALREADY if its timer callback is currently running.
 */
static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
			     struct timespec64 *ts, enum hrtimer_mode mode)
	__must_hold(&ctx->timeout_lock)
{
	struct io_cancel_data cd = { .data = user_data, };
	struct io_kiocb *req = io_timeout_extract(ctx, &cd);
	struct io_timeout_data *data;
	struct io_timeout *timeout;

	if (IS_ERR(req))
		return PTR_ERR(req);

	/* derive the command data only once req is known not to be an ERR_PTR */
	timeout = io_kiocb_to_cmd(req);
	timeout->off = 0; /* noseq */
	data = req->async_data;
	list_add_tail(&timeout->list, &ctx->timeout_list);
	hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
	data->timer.function = io_timeout_fn;
	hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
	return 0;
}

/*
 * Validate the SQE of a timeout remove/update request and copy what
 * io_timeout_remove() needs into the command data.
 *
 * Returns 0 on success, -EINVAL for unsupported fields or flags, and -EFAULT
 * if the new timespec of an update cannot be read from sqe->addr2.
 */
int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_timeout_rem *tr = io_kiocb_to_cmd(req);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->buf_index || sqe->len || sqe->splice_fd_in)
		return -EINVAL;

	tr->ltimeout = false;
	tr->addr = READ_ONCE(sqe->addr);
	tr->flags = READ_ONCE(sqe->timeout_flags);

	if (!(tr->flags & IORING_TIMEOUT_UPDATE_MASK)) {
		/* timeout removal doesn't support flags */
		return tr->flags ? -EINVAL : 0;
	}

	/* update request: at most one clock flag may be given */
	if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
		return -EINVAL;
	if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
		tr->ltimeout = true;
	if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
		return -EINVAL;
	if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
		return -EFAULT;
	if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0)
		return -EINVAL;

	return 0;
}

/* IORING_TIMEOUT_ABS selects an absolute hrtimer, otherwise it is relative. */
static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
{
	if (flags & IORING_TIMEOUT_ABS)
		return HRTIMER_MODE_ABS;

	return HRTIMER_MODE_REL;
}

/*
 * Remove or update an existing timeout command.
 *
 * With IORING_TIMEOUT_UPDATE set, the regular or linked timeout named by
 * tr->addr is re-armed under ctx->timeout_lock. Without it, the timeout is
 * cancelled under ctx->completion_lock. The outcome is stored as the result
 * of @req, which is marked failed when it is negative.
 */
int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_timeout_rem *tr = io_kiocb_to_cmd(req);
	struct io_ring_ctx *ctx = req->ctx;
	int ret;

	if (tr->flags & IORING_TIMEOUT_UPDATE) {
		enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);

		spin_lock_irq(&ctx->timeout_lock);
		ret = tr->ltimeout ?
			io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode) :
			io_timeout_update(ctx, tr->addr, &tr->ts, mode);
		spin_unlock_irq(&ctx->timeout_lock);
	} else {
		struct io_cancel_data cd = { .data = tr->addr, };

		spin_lock(&ctx->completion_lock);
		ret = io_timeout_cancel(ctx, &cd);
		spin_unlock(&ctx->completion_lock);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

/*
 * Common prep for regular and linked timeouts (@is_timeout_link selects the
 * latter). Validates the SQE, allocates the async timeout data, copies the
 * timespec from sqe->addr and initialises the hrtimer. For a linked timeout
 * the request is additionally tied to the last request of the link that is
 * currently being submitted, which gets REQ_F_ARM_LTIMEOUT set.
 *
 * Returns 0 on success, -EINVAL for an invalid SQE or link state, -ENOMEM
 * if the async data cannot be allocated, and -EFAULT if the timespec cannot
 * be read or the request unexpectedly already has async data.
 */
static int __io_timeout_prep(struct io_kiocb *req,
			     const struct io_uring_sqe *sqe,
			     bool is_timeout_link)
{
	struct io_timeout *timeout = io_kiocb_to_cmd(req);
	struct io_timeout_data *data;
	unsigned flags;
	u32 off = READ_ONCE(sqe->off);

	if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in)
		return -EINVAL;
	/* an event count is not accepted on a linked timeout */
	if (off && is_timeout_link)
		return -EINVAL;
	flags = READ_ONCE(sqe->timeout_flags);
	if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
		      IORING_TIMEOUT_ETIME_SUCCESS))
		return -EINVAL;
	/* more than one clock specified is invalid, obviously */
	if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
		return -EINVAL;

	/*
	 * Initialised once, before the first failure path that follows;
	 * nothing below touches the list until the request is issued.
	 */
	INIT_LIST_HEAD(&timeout->list);
	timeout->off = off;
	if (unlikely(off && !req->ctx->off_timeout_used))
		req->ctx->off_timeout_used = true;

	if (WARN_ON_ONCE(req_has_async_data(req)))
		return -EFAULT;
	if (io_alloc_async_data(req))
		return -ENOMEM;

	data = req->async_data;
	data->req = req;
	data->flags = flags;

	if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
		return -EFAULT;

	if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0)
		return -EINVAL;

	data->mode = io_translate_timeout_mode(flags);
	hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);

	if (is_timeout_link) {
		struct io_submit_link *link = &req->ctx->submit_state.link;

		/* a linked timeout needs a link to attach to ... */
		if (!link->head)
			return -EINVAL;
		/* ... and cannot follow another linked timeout */
		if (link->last->opcode == IORING_OP_LINK_TIMEOUT)
			return -EINVAL;
		timeout->head = link->last;
		link->last->flags |= REQ_F_ARM_LTIMEOUT;
	}
	return 0;
}

int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	return __io_timeout_prep(req, sqe, false);
}

int io_link_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	return __io_timeout_prep(req, sqe, true);
}

/*
 * Issue handler for a regular timeout: put the request on ctx->timeout_list
 * and start its hrtimer. A timeout without an event count goes to the tail
 * of the list; one with an event count gets a target sequence and is
 * inserted at a position chosen by that count.
 *
 * Always returns IOU_ISSUE_SKIP_COMPLETE.
 */
int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_timeout *timeout = io_kiocb_to_cmd(req);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_timeout_data *data = req->async_data;
	struct list_head *entry;
	u32 tail, off = timeout->off;

	spin_lock_irq(&ctx->timeout_lock);

	/*
	 * sqe->off holds how many events that need to occur for this
	 * timeout event to be satisfied. If it isn't set, then this is
	 * a pure timeout request, sequence isn't used.
	 */
	if (io_is_timeout_noseq(req)) {
		/* adding after the current last entry appends to the list */
		entry = ctx->timeout_list.prev;
		goto add;
	}

	tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
	timeout->target_seq = tail + off;

	/* Update the last seq here in case io_flush_timeouts() hasn't.
	 * This is safe because ->completion_lock is held, and submissions
	 * and completions are never mixed in the same ->completion_lock section.
	 */
	ctx->cq_last_tm_flush = tail;

	/*
	 * Insertion sort, ensuring the first entry in the list is always
	 * the one we need first.
	 */
	list_for_each_prev(entry, &ctx->timeout_list) {
		struct io_timeout *nextt = list_entry(entry, struct io_timeout, list);
		struct io_kiocb *nxt = cmd_to_io_kiocb(nextt);

		/* entries without an event count are skipped over */
		if (io_is_timeout_noseq(nxt))
			continue;
		/* nxt.seq is behind @tail, otherwise would've been completed */
		if (off >= nextt->target_seq - tail)
			break;
	}
add:
	/* insert right after @entry (the list head if the walk ran to the end) */
	list_add(&timeout->list, entry);
	data->timer.function = io_timeout_fn;
	hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
	spin_unlock_irq(&ctx->timeout_lock);
	return IOU_ISSUE_SKIP_COMPLETE;
}

/*
 * Arm the linked timeout @req: if it still has a back reference to its link
 * head, start the timer and add the request to ctx->ltimeout_list. A
 * reference on @req is dropped in either case.
 */
void io_queue_linked_timeout(struct io_kiocb *req)
{
	struct io_timeout *timeout = io_kiocb_to_cmd(req);
	struct io_ring_ctx *ctx = req->ctx;

	spin_lock_irq(&ctx->timeout_lock);
	/*
	 * If the back reference is NULL, then our linked request finished
	 * before we got a chance to setup the timer
	 */
	if (timeout->head) {
		struct io_timeout_data *data = req->async_data;

		data->timer.function = io_link_timeout_fn;
		hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
				data->mode);
		list_add_tail(&timeout->list, &ctx->ltimeout_list);
	}
	spin_unlock_irq(&ctx->timeout_lock);
	/* drop submission reference */
	io_put_req(req);
}

/*
 * Decide whether the request chain starting at @head should be cancelled.
 *
 * @head:       first request of the chain
 * @task:       only match requests of this task; NULL matches any task
 * @cancel_all: match without looking at the individual requests
 *
 * Returns false when @task is given and @head belongs to a different task.
 * Otherwise returns true if @cancel_all is set or if any request in the
 * chain has REQ_F_INFLIGHT set.
 */
static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
			  bool cancel_all)
	__must_hold(&head->ctx->timeout_lock)
{
	struct io_kiocb *req;

	if (task && head->task != task)
		return false;
	if (cancel_all)
		return true;

	io_for_each_link(req, head) {
		if (req->flags & REQ_F_INFLIGHT)
			return true;
	}
	return false;
}

/*
 * Walk ctx->timeout_list and hand every timeout accepted by io_match_task()
 * to io_kill_timeout() with -ECANCELED.
 *
 * Returns true if at least one timeout matched.
 */
__cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
			     bool cancel_all)
{
	struct io_timeout *pos, *next;
	int killed = 0;

	spin_lock(&ctx->completion_lock);
	spin_lock_irq(&ctx->timeout_lock);
	list_for_each_entry_safe(pos, next, &ctx->timeout_list, list) {
		struct io_kiocb *req = cmd_to_io_kiocb(pos);

		if (!io_match_task(req, tsk, cancel_all))
			continue;

		io_kill_timeout(req, -ECANCELED);
		killed++;
	}
	spin_unlock_irq(&ctx->timeout_lock);
	io_commit_cqring(ctx);
	spin_unlock(&ctx->completion_lock);

	if (killed != 0)
		io_cqring_ev_posted(ctx);
	return killed != 0;
}
Commit	Line	Data
59915143 JA	1	// SPDX-License-Identifier: GPL-2.0
	2	#include <linux/kernel.h>
	3	#include <linux/errno.h>
	4	#include <linux/file.h>
	5	#include <linux/io_uring.h>
	6
	7	#include <trace/events/io_uring.h>
	8
	9	#include <uapi/linux/io_uring.h>
	10
	11	#include "io_uring_types.h"
	12	#include "io_uring.h"
	13	#include "refs.h"
	14	#include "timeout.h"
	15
	16	struct io_timeout {
	17	struct file *file;
	18	u32 off;
	19	u32 target_seq;
	20	struct list_head list;
	21	/* head of the link, used by linked timeouts only */
	22	struct io_kiocb *head;
	23	/* for linked completions */
	24	struct io_kiocb *prev;
	25	};
	26
	27	struct io_timeout_rem {
	28	struct file *file;
	29	u64 addr;
	30
	31	/* timeout update */
	32	struct timespec64 ts;
	33	u32 flags;
	34	bool ltimeout;
	35	};
	36
	37	static inline bool io_is_timeout_noseq(struct io_kiocb *req)
	38	{
	39	struct io_timeout *timeout = io_kiocb_to_cmd(req);
	40
	41	return !timeout->off;
	42	}
	43
	44	static inline void io_put_req(struct io_kiocb *req)
	45	{
	46	if (req_ref_put_and_test(req)) {
	47	io_queue_next(req);
	48	io_free_req(req);
	49	}
	50	}
	51
	52	static void io_kill_timeout(struct io_kiocb *req, int status)
	53	__must_hold(&req->ctx->completion_lock)
	54	__must_hold(&req->ctx->timeout_lock)
	55	{
	56	struct io_timeout_data *io = req->async_data;
	57
	58	if (hrtimer_try_to_cancel(&io->timer) != -1) {
	59	struct io_timeout *timeout = io_kiocb_to_cmd(req);
	60
	61	if (status)
	62	req_set_fail(req);
	63	atomic_set(&req->ctx->cq_timeouts,
	64	atomic_read(&req->ctx->cq_timeouts) + 1);
65	list_del_init(&timeout->list);
66	io_req_tw_post_queue(req, status, 0);
67	}
68	}
69
70	__cold void io_flush_timeouts(struct io_ring_ctx *ctx)
71	__must_hold(&ctx->completion_lock)
72	{
73	u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
74	struct io_timeout timeout, tmp;
75
76	spin_lock_irq(&ctx->timeout_lock);
77	list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) {
78	struct io_kiocb *req = cmd_to_io_kiocb(timeout);
79	u32 events_needed, events_got;
80
81	if (io_is_timeout_noseq(req))
82	break;
83
84	/*
85	* Since seq can easily wrap around over time, subtract
86	* the last seq at which timeouts were flushed before comparing.
87	* Assuming not more than 2^31-1 events have happened since,
88	* these subtractions won't have wrapped, so we can check if
89	* target is in [last_seq, current_seq] by comparing the two.
90	*/
91	events_needed = timeout->target_seq - ctx->cq_last_tm_flush;
92	events_got = seq - ctx->cq_last_tm_flush;
93	if (events_got < events_needed)
94	break;
95
96	io_kill_timeout(req, 0);
97	}
98	ctx->cq_last_tm_flush = seq;
99	spin_unlock_irq(&ctx->timeout_lock);
100	}
101
102	static void io_fail_links(struct io_kiocb *req)
103	__must_hold(&req->ctx->completion_lock)
104	{
105	struct io_kiocb nxt, link = req->link;
106	bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES;
107
108	req->link = NULL;
109	while (link) {
110	long res = -ECANCELED;
111
112	if (link->flags & REQ_F_FAIL)
113	res = link->cqe.res;
114
115	nxt = link->link;
116	link->link = NULL;
117
118	trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data,
119	req->opcode, link);
120
121	if (ignore_cqes)
122	link->flags \|= REQ_F_CQE_SKIP;
123	else
124	link->flags &= ~REQ_F_CQE_SKIP;
125	io_req_set_res(link, res, 0);
126	__io_req_complete_post(link);
127	link = nxt;
128	}
129	}
130
131	static inline void io_remove_next_linked(struct io_kiocb *req)
132	{
133	struct io_kiocb *nxt = req->link;
134
135	req->link = nxt->link;
136	nxt->link = NULL;
137	}
138
139	bool io_disarm_next(struct io_kiocb *req)
140	__must_hold(&req->ctx->completion_lock)
141	{
142	struct io_kiocb *link = NULL;
143	bool posted = false;
144
145	if (req->flags & REQ_F_ARM_LTIMEOUT) {
146	link = req->link;
147	req->flags &= ~REQ_F_ARM_LTIMEOUT;
148	if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
149	io_remove_next_linked(req);
150	io_req_tw_post_queue(link, -ECANCELED, 0);
151	posted = true;
152	}
153	} else if (req->flags & REQ_F_LINK_TIMEOUT) {
154	struct io_ring_ctx *ctx = req->ctx;
155
156	spin_lock_irq(&ctx->timeout_lock);
157	link = io_disarm_linked_timeout(req);
158	spin_unlock_irq(&ctx->timeout_lock);
159	if (link) {
160	posted = true;
161	io_req_tw_post_queue(link, -ECANCELED, 0);
162	}
163	}
164	if (unlikely((req->flags & REQ_F_FAIL) &&
165	!(req->flags & REQ_F_HARDLINK))) {
166	posted \|= (req->link != NULL);
167	io_fail_links(req);
168	}
169	return posted;
170	}
171
172	struct io_kiocb __io_disarm_linked_timeout(struct io_kiocb req,
173	struct io_kiocb *link)
174	__must_hold(&req->ctx->completion_lock)
175	__must_hold(&req->ctx->timeout_lock)
176	{
177	struct io_timeout_data *io = link->async_data;
178	struct io_timeout *timeout = io_kiocb_to_cmd(link);
179
180	io_remove_next_linked(req);
181	timeout->head = NULL;
182	if (hrtimer_try_to_cancel(&io->timer) != -1) {
183	list_del(&timeout->list);
184	return link;
185	}
186
187	return NULL;
188	}
189
190	static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
191	{
192	struct io_timeout_data *data = container_of(timer,
193	struct io_timeout_data, timer);
194	struct io_kiocb *req = data->req;
195	struct io_timeout *timeout = io_kiocb_to_cmd(req);
196	struct io_ring_ctx *ctx = req->ctx;
197	unsigned long flags;
198
199	spin_lock_irqsave(&ctx->timeout_lock, flags);
200	list_del_init(&timeout->list);
201	atomic_set(&req->ctx->cq_timeouts,
202	atomic_read(&req->ctx->cq_timeouts) + 1);
203	spin_unlock_irqrestore(&ctx->timeout_lock, flags);
204
205	if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
206	req_set_fail(req);
207
208	io_req_set_res(req, -ETIME, 0);
209	req->io_task_work.func = io_req_task_complete;
210	io_req_task_work_add(req);
211	return HRTIMER_NORESTART;
212	}
213
214	static struct io_kiocb io_timeout_extract(struct io_ring_ctx ctx,
215	struct io_cancel_data *cd)
216	__must_hold(&ctx->timeout_lock)
217	{
218	struct io_timeout *timeout;
219	struct io_timeout_data *io;
220	struct io_kiocb *req = NULL;
221
222	list_for_each_entry(timeout, &ctx->timeout_list, list) {
223	struct io_kiocb *tmp = cmd_to_io_kiocb(timeout);
224
225	if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) &&
226	cd->data != tmp->cqe.user_data)
227	continue;
228	if (cd->flags & (IORING_ASYNC_CANCEL_ALL\|IORING_ASYNC_CANCEL_ANY)) {
229	if (cd->seq == tmp->work.cancel_seq)
230	continue;
231	tmp->work.cancel_seq = cd->seq;
232	}
233	req = tmp;
234	break;
235	}
236	if (!req)
237	return ERR_PTR(-ENOENT);
238
239	io = req->async_data;
240	if (hrtimer_try_to_cancel(&io->timer) == -1)
241	return ERR_PTR(-EALREADY);
242	timeout = io_kiocb_to_cmd(req);
243	list_del_init(&timeout->list);
244	return req;
245	}
246
247	int io_timeout_cancel(struct io_ring_ctx ctx, struct io_cancel_data cd)
248	__must_hold(&ctx->completion_lock)
249	{
250	struct io_kiocb *req;
251
252	spin_lock_irq(&ctx->timeout_lock);
253	req = io_timeout_extract(ctx, cd);
254	spin_unlock_irq(&ctx->timeout_lock);
255
256	if (IS_ERR(req))
257	return PTR_ERR(req);
258	io_req_task_queue_fail(req, -ECANCELED);
259	return 0;
260	}
261
262	static void io_req_task_link_timeout(struct io_kiocb req, bool locked)
263	{
264	struct io_timeout *timeout = io_kiocb_to_cmd(req);
265	struct io_kiocb *prev = timeout->prev;
266	int ret = -ENOENT;
267
268	if (prev) {
269	if (!(req->task->flags & PF_EXITING)) {
270	struct io_cancel_data cd = {
271	.ctx = req->ctx,
272	.data = prev->cqe.user_data,
273	};
274
275	ret = io_try_cancel(req, &cd);
276	}
277	io_req_set_res(req, ret ?: -ETIME, 0);
278	io_req_complete_post(req);
279	io_put_req(prev);
280	} else {
281	io_req_set_res(req, -ETIME, 0);
282	io_req_complete_post(req);
283	}
284	}
285
286	static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
287	{
288	struct io_timeout_data *data = container_of(timer,
289	struct io_timeout_data, timer);
290	struct io_kiocb prev, req = data->req;
291	struct io_timeout *timeout = io_kiocb_to_cmd(req);
292	struct io_ring_ctx *ctx = req->ctx;
293	unsigned long flags;
294
295	spin_lock_irqsave(&ctx->timeout_lock, flags);
296	prev = timeout->head;
297	timeout->head = NULL;
298
299	/*
300	* We don't expect the list to be empty, that will only happen if we
301	* race with the completion of the linked work.
302	*/
303	if (prev) {
304	io_remove_next_linked(prev);
305	if (!req_ref_inc_not_zero(prev))
306	prev = NULL;
307	}
308	list_del(&timeout->list);
309	timeout->prev = prev;
310	spin_unlock_irqrestore(&ctx->timeout_lock, flags);
311
312	req->io_task_work.func = io_req_task_link_timeout;
313	io_req_task_work_add(req);
314	return HRTIMER_NORESTART;
315	}
316
317	static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
318	{
319	switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
320	case IORING_TIMEOUT_BOOTTIME:
321	return CLOCK_BOOTTIME;
322	case IORING_TIMEOUT_REALTIME:
323	return CLOCK_REALTIME;
324	default:
325	/* can't happen, vetted at prep time */
326	WARN_ON_ONCE(1);
327	fallthrough;
328	case 0:
329	return CLOCK_MONOTONIC;
330	}
331	}
332
333	static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
334	struct timespec64 *ts, enum hrtimer_mode mode)
335	__must_hold(&ctx->timeout_lock)
336	{
337	struct io_timeout_data *io;
338	struct io_timeout *timeout;
339	struct io_kiocb *req = NULL;
340
341	list_for_each_entry(timeout, &ctx->ltimeout_list, list) {
342	struct io_kiocb *tmp = cmd_to_io_kiocb(timeout);
343
344	if (user_data == tmp->cqe.user_data) {
345	req = tmp;
346	break;
347	}
348	}
349	if (!req)
350	return -ENOENT;
351
352	io = req->async_data;
353	if (hrtimer_try_to_cancel(&io->timer) == -1)
354	return -EALREADY;
355	hrtimer_init(&io->timer, io_timeout_get_clock(io), mode);
356	io->timer.function = io_link_timeout_fn;
357	hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
358	return 0;
359	}
360
361	static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
362	struct timespec64 *ts, enum hrtimer_mode mode)
363	__must_hold(&ctx->timeout_lock)
364	{
365	struct io_cancel_data cd = { .data = user_data, };
366	struct io_kiocb *req = io_timeout_extract(ctx, &cd);
367	struct io_timeout *timeout = io_kiocb_to_cmd(req);
368	struct io_timeout_data *data;
369
370	if (IS_ERR(req))
371	return PTR_ERR(req);
372
373	timeout->off = 0; /* noseq */
374	data = req->async_data;
375	list_add_tail(&timeout->list, &ctx->timeout_list);
376	hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
377	data->timer.function = io_timeout_fn;
378	hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
379	return 0;
380	}
381
382	int io_timeout_remove_prep(struct io_kiocb req, const struct io_uring_sqe sqe)
383	{
384	struct io_timeout_rem *tr = io_kiocb_to_cmd(req);
385
386	if (unlikely(req->flags & (REQ_F_FIXED_FILE \| REQ_F_BUFFER_SELECT)))
387	return -EINVAL;
388	if (sqe->buf_index \|\| sqe->len \|\| sqe->splice_fd_in)
389	return -EINVAL;
390
391	tr->ltimeout = false;
392	tr->addr = READ_ONCE(sqe->addr);
393	tr->flags = READ_ONCE(sqe->timeout_flags);
394	if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) {
395	if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
396	return -EINVAL;
397	if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
398	tr->ltimeout = true;
399	if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK\|IORING_TIMEOUT_ABS))
400	return -EINVAL;
401	if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
402	return -EFAULT;
403	if (tr->ts.tv_sec < 0 \|\| tr->ts.tv_nsec < 0)
404	return -EINVAL;
405	} else if (tr->flags) {
406	/* timeout removal doesn't support flags */
407	return -EINVAL;
408	}
409
410	return 0;
411	}
412
413	static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
414	{
415	return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
416	: HRTIMER_MODE_REL;
417	}
418
419	/*
420	* Remove or update an existing timeout command
421	*/
422	int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
423	{
424	struct io_timeout_rem *tr = io_kiocb_to_cmd(req);
425	struct io_ring_ctx *ctx = req->ctx;
426	int ret;
427
428	if (!(tr->flags & IORING_TIMEOUT_UPDATE)) {
429	struct io_cancel_data cd = { .data = tr->addr, };
430
431	spin_lock(&ctx->completion_lock);
432	ret = io_timeout_cancel(ctx, &cd);
433	spin_unlock(&ctx->completion_lock);
434	} else {
435	enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);
436
437	spin_lock_irq(&ctx->timeout_lock);
438	if (tr->ltimeout)
439	ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode);
440	else
441	ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
442	spin_unlock_irq(&ctx->timeout_lock);
443	}
444
445	if (ret < 0)
446	req_set_fail(req);
447	io_req_set_res(req, ret, 0);
448	return IOU_OK;
449	}
450
451	static int __io_timeout_prep(struct io_kiocb *req,
452	const struct io_uring_sqe *sqe,
453	bool is_timeout_link)
454	{
455	struct io_timeout *timeout = io_kiocb_to_cmd(req);
456	struct io_timeout_data *data;
457	unsigned flags;
458	u32 off = READ_ONCE(sqe->off);
459
460	if (sqe->buf_index \|\| sqe->len != 1 \|\| sqe->splice_fd_in)
461	return -EINVAL;
462	if (off && is_timeout_link)
463	return -EINVAL;
464	flags = READ_ONCE(sqe->timeout_flags);
465	if (flags & ~(IORING_TIMEOUT_ABS \| IORING_TIMEOUT_CLOCK_MASK \|
466	IORING_TIMEOUT_ETIME_SUCCESS))
467	return -EINVAL;
468	/* more than one clock specified is invalid, obviously */
469	if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
470	return -EINVAL;
471
472	INIT_LIST_HEAD(&timeout->list);
473	timeout->off = off;
474	if (unlikely(off && !req->ctx->off_timeout_used))
475	req->ctx->off_timeout_used = true;
476
477	if (WARN_ON_ONCE(req_has_async_data(req)))
478	return -EFAULT;
479	if (io_alloc_async_data(req))
480	return -ENOMEM;
481
482	data = req->async_data;
483	data->req = req;
484	data->flags = flags;
485
486	if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
487	return -EFAULT;
488
489	if (data->ts.tv_sec < 0 \|\| data->ts.tv_nsec < 0)
490	return -EINVAL;
491
492	INIT_LIST_HEAD(&timeout->list);
493	data->mode = io_translate_timeout_mode(flags);
494	hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
495
496	if (is_timeout_link) {
497	struct io_submit_link *link = &req->ctx->submit_state.link;
498
499	if (!link->head)
500	return -EINVAL;
501	if (link->last->opcode == IORING_OP_LINK_TIMEOUT)
502	return -EINVAL;
503	timeout->head = link->last;
504	link->last->flags \|= REQ_F_ARM_LTIMEOUT;
505	}
506	return 0;
507	}
508
509	int io_timeout_prep(struct io_kiocb req, const struct io_uring_sqe sqe)
510	{
511	return __io_timeout_prep(req, sqe, false);
512	}
513
514	int io_link_timeout_prep(struct io_kiocb req, const struct io_uring_sqe sqe)
515	{
516	return __io_timeout_prep(req, sqe, true);
517	}
518
519	int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
520	{
521	struct io_timeout *timeout = io_kiocb_to_cmd(req);
522	struct io_ring_ctx *ctx = req->ctx;
523	struct io_timeout_data *data = req->async_data;
524	struct list_head *entry;
525	u32 tail, off = timeout->off;
526
527	spin_lock_irq(&ctx->timeout_lock);
528
529	/*
530	* sqe->off holds how many events that need to occur for this
531	* timeout event to be satisfied. If it isn't set, then this is
532	* a pure timeout request, sequence isn't used.
533	*/
534	if (io_is_timeout_noseq(req)) {
535	entry = ctx->timeout_list.prev;
536	goto add;
537	}
538
539	tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
540	timeout->target_seq = tail + off;
541
542	/* Update the last seq here in case io_flush_timeouts() hasn't.
543	* This is safe because ->completion_lock is held, and submissions
544	* and completions are never mixed in the same ->completion_lock section.
545	*/
546	ctx->cq_last_tm_flush = tail;
547
548	/*
549	* Insertion sort, ensuring the first entry in the list is always
550	* the one we need first.
551	*/
552	list_for_each_prev(entry, &ctx->timeout_list) {
553	struct io_timeout *nextt = list_entry(entry, struct io_timeout, list);
554	struct io_kiocb *nxt = cmd_to_io_kiocb(nextt);
555
556	if (io_is_timeout_noseq(nxt))
557	continue;
558	/* nxt.seq is behind @tail, otherwise would've been completed */
559	if (off >= nextt->target_seq - tail)
560	break;
561	}
562	add:
563	list_add(&timeout->list, entry);
564	data->timer.function = io_timeout_fn;
565	hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
566	spin_unlock_irq(&ctx->timeout_lock);
567	return IOU_ISSUE_SKIP_COMPLETE;
568	}
569
570	void io_queue_linked_timeout(struct io_kiocb *req)
571	{
572	struct io_timeout *timeout = io_kiocb_to_cmd(req);
573	struct io_ring_ctx *ctx = req->ctx;
574
575	spin_lock_irq(&ctx->timeout_lock);
576	/*
577	* If the back reference is NULL, then our linked request finished
578	* before we got a chance to setup the timer
579	*/
580	if (timeout->head) {
581	struct io_timeout_data *data = req->async_data;
582
583	data->timer.function = io_link_timeout_fn;
584	hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
585	data->mode);
586	list_add_tail(&timeout->list, &ctx->ltimeout_list);
587	}
588	spin_unlock_irq(&ctx->timeout_lock);
589	/* drop submission reference */
590	io_put_req(req);
591	}
592
593	static bool io_match_task(struct io_kiocb head, struct task_struct task,
594	bool cancel_all)
595	__must_hold(&req->ctx->timeout_lock)
596	{
597	struct io_kiocb *req;
598
599	if (task && head->task != task)
600	return false;
601	if (cancel_all)
602	return true;
603
604	io_for_each_link(req, head) {
605	if (req->flags & REQ_F_INFLIGHT)
606	return true;
607	}
608	return false;
609	}
610
611	/* Returns true if we found and killed one or more timeouts */
612	__cold bool io_kill_timeouts(struct io_ring_ctx ctx, struct task_struct tsk,
613	bool cancel_all)
614	{
615	struct io_timeout timeout, tmp;
616	int canceled = 0;
617
618	spin_lock(&ctx->completion_lock);
619	spin_lock_irq(&ctx->timeout_lock);
620	list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) {
621	struct io_kiocb *req = cmd_to_io_kiocb(timeout);
622
623	if (io_match_task(req, tsk, cancel_all)) {
624	io_kill_timeout(req, -ECANCELED);
625	canceled++;
626	}
627	}
628	spin_unlock_irq(&ctx->timeout_lock);
629	io_commit_cqring(ctx);
630	spin_unlock(&ctx->completion_lock);
631	if (canceled != 0)
632	io_cqring_ev_posted(ctx);
633	return canceled != 0;
634	}