io_uring/rsrc: protect node refs with uring_lock
io_uring/rsrc.c
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"

struct io_rsrc_update {
	struct file *file;
	u64 arg;
	u32 nr_args;
	u32 offset;
};

static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage);

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

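/*
 * Charge nr_pages of pinned memory against the user's RLIMIT_MEMLOCK.
 * The lock-free cmpxchg loop retries if ->locked_vm changed under us;
 * returns -ENOMEM if the new total would exceed the limit.
 */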
int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}

static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

	return 0;
}

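/*
 * Copy the index'th iovec from userspace into *dst, handling the 32-bit
 * compat layout when the ring was created by a compat task.
 */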
static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
		       void __user *arg, unsigned index)
{
	struct iovec __user *src;

#ifdef CONFIG_COMPAT
	if (ctx->compat) {
		struct compat_iovec __user *ciovs;
		struct compat_iovec ciov;

		ciovs = (struct compat_iovec __user *) arg;
		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
			return -EFAULT;

		dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
		dst->iov_len = ciov.iov_len;
		return 0;
	}
#endif
	src = (struct iovec __user *) arg;
	if (copy_from_user(dst, &src[index], sizeof(*dst)))
		return -EFAULT;
	return 0;
}

static int io_buffer_validate(struct iovec *iov)
{
	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);

	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;
	if (!iov->iov_len)
		return -EFAULT;

	/* arbitrary limit, but we need something */
	if (iov->iov_len > SZ_1G)
		return -EFAULT;

	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
		return -EOVERFLOW;

	return 0;
}

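/*
 * Unpin the pages backing a registered buffer, drop its memory accounting
 * and free the io_mapped_ubuf, unless the slot holds the shared dummy
 * placeholder. The slot is cleared in either case.
 */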
static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
{
	struct io_mapped_ubuf *imu = *slot;
	unsigned int i;

	if (imu != ctx->dummy_ubuf) {
		for (i = 0; i < imu->nr_bvecs; i++)
			unpin_user_page(imu->bvec[i].bv_page);
		if (imu->acct_pages)
			io_unaccount_mem(ctx, imu->acct_pages);
		kvfree(imu);
	}
	*slot = NULL;
}

static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
{
	struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
	struct io_ring_ctx *ctx = rsrc_data->ctx;
	struct io_rsrc_put *prsrc, *tmp;

	list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
		list_del(&prsrc->list);

		if (prsrc->tag) {
			if (ctx->flags & IORING_SETUP_IOPOLL) {
				mutex_lock(&ctx->uring_lock);
				io_post_aux_cqe(ctx, prsrc->tag, 0, 0);
				mutex_unlock(&ctx->uring_lock);
			} else {
				io_post_aux_cqe(ctx, prsrc->tag, 0, 0);
			}
		}

		rsrc_data->do_put(ctx, prsrc);
		kfree(prsrc);
	}

	io_rsrc_node_destroy(ref_node);
	if (atomic_dec_and_test(&rsrc_data->refs))
		complete(&rsrc_data->done);
}

void io_rsrc_put_work(struct work_struct *work)
{
	struct io_ring_ctx *ctx;
	struct llist_node *node;

	ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
	node = llist_del_all(&ctx->rsrc_put_llist);

	while (node) {
		struct io_rsrc_node *ref_node;
		struct llist_node *next = node->next;

		ref_node = llist_entry(node, struct io_rsrc_node, llist);
		__io_rsrc_put_work(ref_node);
		node = next;
	}
}

void io_rsrc_put_tw(struct callback_head *cb)
{
	struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx,
					       rsrc_put_tw);

	io_rsrc_put_work(&ctx->rsrc_put_work.work);
}

void io_wait_rsrc_data(struct io_rsrc_data *data)
{
	if (data && !atomic_dec_and_test(&data->refs))
		wait_for_completion(&data->done);
}

void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
{
	kfree(ref_node);
}

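/*
 * Called when a node's reference count hits zero. Mark the node done and
 * move the leading run of completed nodes, in order, from
 * ctx->rsrc_ref_list to the put llist, then kick the put work: via
 * task_work on the submitter task if possible, else the delayed workqueue.
 */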
void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
	__must_hold(&node->rsrc_data->ctx->uring_lock)
{
	struct io_ring_ctx *ctx = node->rsrc_data->ctx;
	unsigned long flags;
	bool first_add = false;
	unsigned long delay = HZ;

	spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
	node->done = true;

	/* if we are mid-quiesce then do not delay */
	if (node->rsrc_data->quiesce)
		delay = 0;

	while (!list_empty(&ctx->rsrc_ref_list)) {
		node = list_first_entry(&ctx->rsrc_ref_list,
					struct io_rsrc_node, node);
		/* recycle ref nodes in order */
		if (!node->done)
			break;
		list_del(&node->node);
		first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
	}
	spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);

	if (!first_add)
		return;

	if (ctx->submitter_task) {
		if (!task_work_add(ctx->submitter_task, &ctx->rsrc_put_tw,
				   ctx->notify_method))
			return;
	}
	mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
}

static struct io_rsrc_node *io_rsrc_node_alloc(void)
{
	struct io_rsrc_node *ref_node;

	ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
	if (!ref_node)
		return NULL;

	ref_node->refs = 1;
	INIT_LIST_HEAD(&ref_node->node);
	INIT_LIST_HEAD(&ref_node->rsrc_list);
	ref_node->done = false;
	return ref_node;
}

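/*
 * Install the preallocated backup node as the current rsrc node if needed.
 * When data_to_kill is given, the current node is first retired: it is
 * attached to that rsrc data, queued on ctx->rsrc_ref_list and its master
 * reference dropped, so it is put once its last request completes.
 */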
void io_rsrc_node_switch(struct io_ring_ctx *ctx,
			 struct io_rsrc_data *data_to_kill)
	__must_hold(&ctx->uring_lock)
{
	WARN_ON_ONCE(!ctx->rsrc_backup_node);
	WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);

	if (data_to_kill) {
		struct io_rsrc_node *rsrc_node = ctx->rsrc_node;

		rsrc_node->rsrc_data = data_to_kill;
		spin_lock_irq(&ctx->rsrc_ref_lock);
		list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
		spin_unlock_irq(&ctx->rsrc_ref_lock);

		atomic_inc(&data_to_kill->refs);
		/* put master ref */
		io_put_rsrc_node(rsrc_node);
		ctx->rsrc_node = NULL;
	}

	if (!ctx->rsrc_node) {
		ctx->rsrc_node = ctx->rsrc_backup_node;
		ctx->rsrc_backup_node = NULL;
	}
}

int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
{
	if (ctx->rsrc_backup_node)
		return 0;
	ctx->rsrc_backup_node = io_rsrc_node_alloc();
	return ctx->rsrc_backup_node ? 0 : -ENOMEM;
}

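/*
 * Wait for all outstanding references to @data to be dropped. The ring
 * lock is released while waiting; concurrent quiesce attempts see
 * data->quiesce set and fail with -ENXIO. Returns 0 once quiesced, or a
 * negative error if interrupted by a signal.
 */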
__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
				      struct io_ring_ctx *ctx)
{
	int ret;

	/* As we may drop ->uring_lock, other task may have started quiesce */
	if (data->quiesce)
		return -ENXIO;
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		return ret;
	io_rsrc_node_switch(ctx, data);

	/* kill initial ref, already quiesced if zero */
	if (atomic_dec_and_test(&data->refs))
		return 0;

	data->quiesce = true;
	mutex_unlock(&ctx->uring_lock);
	do {
		ret = io_run_task_work_sig(ctx);
		if (ret < 0) {
			atomic_inc(&data->refs);
			/* wait for all works potentially completing data->done */
			flush_delayed_work(&ctx->rsrc_put_work);
			reinit_completion(&data->done);
			mutex_lock(&ctx->uring_lock);
			break;
		}

		flush_delayed_work(&ctx->rsrc_put_work);
		ret = wait_for_completion_interruptible(&data->done);
		if (!ret) {
			mutex_lock(&ctx->uring_lock);
			if (atomic_read(&data->refs) <= 0)
				break;
			/*
			 * it has been revived by another thread while
			 * we were unlocked
			 */
			mutex_unlock(&ctx->uring_lock);
		}
	} while (1);
	data->quiesce = false;

	return ret;
}

static void io_free_page_table(void **table, size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);

	for (i = 0; i < nr_tables; i++)
		kfree(table[i]);
	kfree(table);
}

static void io_rsrc_data_free(struct io_rsrc_data *data)
{
	size_t size = data->nr * sizeof(data->tags[0][0]);

	if (data->tags)
		io_free_page_table((void **)data->tags, size);
	kfree(data);
}

static __cold void **io_alloc_page_table(size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
	size_t init_size = size;
	void **table;

	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
	if (!table)
		return NULL;

	for (i = 0; i < nr_tables; i++) {
		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);

		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
		if (!table[i]) {
			io_free_page_table(table, init_size);
			return NULL;
		}
		size -= this_size;
	}
	return table;
}

__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx,
				     rsrc_put_fn *do_put, u64 __user *utags,
				     unsigned nr, struct io_rsrc_data **pdata)
{
	struct io_rsrc_data *data;
	int ret = 0;
	unsigned i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;
	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
	if (!data->tags) {
		kfree(data);
		return -ENOMEM;
	}

	data->nr = nr;
	data->ctx = ctx;
	data->do_put = do_put;
	if (utags) {
		ret = -EFAULT;
		for (i = 0; i < nr; i++) {
			u64 *tag_slot = io_get_tag_slot(data, i);

			if (copy_from_user(tag_slot, &utags[i],
					   sizeof(*tag_slot)))
				goto fail;
		}
	}

	atomic_set(&data->refs, 1);
	init_completion(&data->done);
	*pdata = data;
	return 0;
fail:
	io_rsrc_data_free(data);
	return ret;
}

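/*
 * Apply an update to the registered file table: for each entry, queue the
 * old file (if any) for deferred removal and install the new one, skipping
 * entries set to IORING_REGISTER_FILES_SKIP. Returns the number of entries
 * processed, or an error if nothing was updated.
 */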
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	struct io_rsrc_data *data = ctx->file_data;
	struct io_fixed_file *file_slot;
	struct file *file;
	int fd, i, err = 0;
	unsigned int done;
	bool needs_switch = false;

	if (!ctx->file_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_files)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
		file_slot = io_fixed_file_slot(&ctx->file_table, i);

		if (file_slot->file_ptr) {
			file = (struct file *)(file_slot->file_ptr & FFS_MASK);
			err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
			if (err)
				break;
			file_slot->file_ptr = 0;
			io_file_bitmap_clear(&ctx->file_table, i);
			needs_switch = true;
		}
		if (fd != -1) {
			file = fget(fd);
			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered. If
			 * UNIX isn't enabled, then this causes a reference
			 * cycle and this instance can never get freed. If UNIX
			 * is enabled we'll handle it just fine, but there's
			 * still no point in allowing a ring fd as it doesn't
			 * support regular read/write anyway.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			err = io_scm_file_account(ctx, file);
			if (err) {
				fput(file);
				break;
			}
			*io_get_tag_slot(data, i) = tag;
			io_fixed_file_set(file_slot, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}

	if (needs_switch)
		io_rsrc_node_switch(ctx, data);
	return done ? done : err;
}

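/*
 * Apply an update to the registered buffer table: pin and map each new
 * iovec, queue the previously registered buffer (if not the dummy entry)
 * for deferred removal, and store the user-supplied tag. Returns the
 * number of entries processed, or an error if nothing was updated.
 */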
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
	struct page *last_hpage = NULL;
	bool needs_switch = false;
	__u32 done;
	int i, err;

	if (!ctx->buf_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_bufs)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_mapped_ubuf *imu;
		int offset = up->offset + done;
		u64 tag = 0;

		err = io_copy_iov(ctx, &iov, iovs, done);
		if (err)
			break;
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(&iov);
		if (err)
			break;
		if (!iov.iov_base && tag) {
			err = -EINVAL;
			break;
		}
		err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
		if (err)
			break;

		i = array_index_nospec(offset, ctx->nr_user_bufs);
		if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
			err = io_queue_rsrc_removal(ctx->buf_data, i,
						    ctx->rsrc_node, ctx->user_bufs[i]);
			if (unlikely(err)) {
				io_buffer_unmap(ctx, &imu);
				break;
			}
			ctx->user_bufs[i] = ctx->dummy_ubuf;
			needs_switch = true;
		}

		ctx->user_bufs[i] = imu;
		*io_get_tag_slot(ctx->buf_data, offset) = tag;
	}

	if (needs_switch)
		io_rsrc_node_switch(ctx, ctx->buf_data);
	return done ? done : err;
}

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;
	int err;

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;
	err = io_rsrc_node_switch_start(ctx);
	if (err)
		return err;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}

int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}

static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_data)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

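/*
 * Queue a resource (file or buffer) for deferred put on the given rsrc
 * node; the actual release happens from the rsrc put work once the node's
 * references are gone. The slot's tag is moved to the removal entry so a
 * CQE can be posted when the resource is finally released.
 */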
int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
			  struct io_rsrc_node *node, void *rsrc)
{
	u64 *tag_slot = io_get_tag_slot(data, idx);
	struct io_rsrc_put *prsrc;

	prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
	if (!prsrc)
		return -ENOMEM;

	prsrc->tag = *tag_slot;
	*tag_slot = 0;
	prsrc->rsrc = rsrc;
	list_add(&prsrc->list, &node->rsrc_list);
	return 0;
}

void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	int i;

	for (i = 0; i < ctx->nr_user_files; i++) {
		struct file *file = io_file_from_index(&ctx->file_table, i);

		/* skip scm accounted files, they'll be freed by ->ring_sock */
		if (!file || io_file_need_scm(file))
			continue;
		io_file_bitmap_clear(&ctx->file_table, i);
		fput(file);
	}

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		struct sock *sock = ctx->ring_sock->sk;
		struct sk_buff *skb;

		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
			kfree_skb(skb);
	}
#endif
	io_free_file_tables(&ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	io_rsrc_data_free(ctx->file_data);
	ctx->file_data = NULL;
	ctx->nr_user_files = 0;
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_files;
	int ret;

	if (!ctx->file_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_files = 0;
	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
	ctx->nr_user_files = nr;
	if (!ret)
		__io_sqe_files_unregister(ctx);
	return ret;
}

/*
 * Ensure the UNIX gc is aware of our file set, so we are certain that
 * the io_uring can be safely unregistered on process exit, even if we have
 * loops in the file referencing. We account only files that can hold other
 * files because otherwise they can't form a loop and so are not interesting
 * for GC.
 */
int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
	struct sock *sk = ctx->ring_sock->sk;
	struct sk_buff_head *head = &sk->sk_receive_queue;
	struct scm_fp_list *fpl;
	struct sk_buff *skb;

	if (likely(!io_file_need_scm(file)))
		return 0;

	/*
	 * See if we can merge this file into an existing skb SCM_RIGHTS
	 * file set. If there's no room, fall back to allocating a new skb
	 * and filling it in.
	 */
	spin_lock_irq(&head->lock);
	skb = skb_peek(head);
	if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
		__skb_unlink(skb, head);
	else
		skb = NULL;
	spin_unlock_irq(&head->lock);

	if (!skb) {
		fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
		if (!fpl)
			return -ENOMEM;

		skb = alloc_skb(0, GFP_KERNEL);
		if (!skb) {
			kfree(fpl);
			return -ENOMEM;
		}

		fpl->user = get_uid(current_user());
		fpl->max = SCM_MAX_FD;
		fpl->count = 0;

		UNIXCB(skb).fp = fpl;
		skb->sk = sk;
		skb->scm_io_uring = 1;
		skb->destructor = unix_destruct_scm;
		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
	}

	fpl = UNIXCB(skb).fp;
	fpl->fp[fpl->count++] = get_file(file);
	unix_inflight(fpl->user, file);
	skb_queue_head(head, skb);
	fput(file);
#endif
	return 0;
}

static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	struct file *file = prsrc->file;
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head list, *head = &sock->sk_receive_queue;
	struct sk_buff *skb;
	int i;

	if (!io_file_need_scm(file)) {
		fput(file);
		return;
	}

	__skb_queue_head_init(&list);

	/*
	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
	 * remove this entry and rearrange the file array.
	 */
	skb = skb_dequeue(head);
	while (skb) {
		struct scm_fp_list *fp;

		fp = UNIXCB(skb).fp;
		for (i = 0; i < fp->count; i++) {
			int left;

			if (fp->fp[i] != file)
				continue;

			unix_notinflight(fp->user, fp->fp[i]);
			left = fp->count - 1 - i;
			if (left) {
				memmove(&fp->fp[i], &fp->fp[i + 1],
					left * sizeof(struct file *));
			}
			fp->count--;
			if (!fp->count) {
				kfree_skb(skb);
				skb = NULL;
			} else {
				__skb_queue_tail(&list, skb);
			}
			fput(file);
			file = NULL;
			break;
		}

		if (!file)
			break;

		__skb_queue_tail(&list, skb);

		skb = skb_dequeue(head);
	}

	if (skb_peek(&list)) {
		spin_lock_irq(&head->lock);
		while ((skb = __skb_dequeue(&list)) != NULL)
			__skb_queue_tail(head, skb);
		spin_unlock_irq(&head->lock);
	}
#else
	fput(file);
#endif
}

int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_data)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		return ret;
	ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
				 &ctx->file_data);
	if (ret)
		return ret;

	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
		io_rsrc_data_free(ctx->file_data);
		ctx->file_data = NULL;
		return -ENOMEM;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
		struct io_fixed_file *file_slot;

		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
			ret = -EFAULT;
			goto fail;
		}
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered. If UNIX
		 * isn't enabled, then this causes a reference cycle and this
		 * instance can never get freed. If UNIX is enabled we'll
		 * handle it just fine, but there's still no point in allowing
		 * a ring fd as it doesn't support regular read/write anyway.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = io_scm_file_account(ctx, file);
		if (ret) {
			fput(file);
			goto fail;
		}
		file_slot = io_fixed_file_slot(&ctx->file_table, i);
		io_fixed_file_set(file_slot, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
	io_rsrc_node_switch(ctx, NULL);
	return 0;
fail:
	__io_sqe_files_unregister(ctx);
	return ret;
}

static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	io_buffer_unmap(ctx, &prsrc->buf);
	prsrc->buf = NULL;
}

void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned int i;

	for (i = 0; i < ctx->nr_user_bufs; i++)
		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
	kfree(ctx->user_bufs);
	io_rsrc_data_free(ctx->buf_data);
	ctx->user_bufs = NULL;
	ctx->buf_data = NULL;
	ctx->nr_user_bufs = 0;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_bufs;
	int ret;

	if (!ctx->buf_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_bufs = 0;
	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
	ctx->nr_user_bufs = nr;
	if (!ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}

/*
 * Not super efficient, but this is just a registration time. And we do cache
 * the last compound head, so generally we'll only do a full search if we don't
 * match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->nr_user_bufs; i++) {
		struct io_mapped_ubuf *imu = ctx->user_bufs[i];

		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

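/*
 * Work out how many pages to charge for this buffer. Normal pages count
 * one each; for a compound (huge) page the whole head page is charged
 * once, with *last_hpage and headpage_already_acct() used to avoid double
 * accounting.
 */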
static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}

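/*
 * Pin the user pages backing [ubuf, ubuf + len). All VMAs in the range
 * must share the same backing file, and only anonymous, shmem or hugetlb
 * memory is accepted; other file-backed mappings fail with -EOPNOTSUPP.
 * Returns the pinned page array and stores the count in *npages, or an
 * ERR_PTR() on failure.
 */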
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
{
	unsigned long start, end, nr_pages;
	struct vm_area_struct **vmas = NULL;
	struct page **pages = NULL;
	int i, pret, ret = -ENOMEM;

	end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	start = ubuf >> PAGE_SHIFT;
	nr_pages = end - start;

	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		goto done;

	vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
			      GFP_KERNEL);
	if (!vmas)
		goto done;

	ret = 0;
	mmap_read_lock(current->mm);
	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
			      pages, vmas);
	if (pret == nr_pages) {
		struct file *file = vmas[0]->vm_file;

		/* don't support file backed memory */
		for (i = 0; i < nr_pages; i++) {
			if (vmas[i]->vm_file != file) {
				ret = -EINVAL;
				break;
			}
			if (!file)
				continue;
			if (!vma_is_shmem(vmas[i]) && !is_file_hugepages(file)) {
				ret = -EOPNOTSUPP;
				break;
			}
		}
		*npages = nr_pages;
	} else {
		ret = pret < 0 ? pret : -EFAULT;
	}
	mmap_read_unlock(current->mm);
	if (ret) {
		/*
		 * if we did partial map, or found file backed vmas,
		 * release any pages we did get
		 */
		if (pret > 0)
			unpin_user_pages(pages, pret);
		goto done;
	}
	ret = 0;
done:
	kvfree(vmas);
	if (ret < 0) {
		kvfree(pages);
		pages = ERR_PTR(ret);
	}
	return pages;
}

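/*
 * Pin a single user iovec and build the io_mapped_ubuf that describes it.
 * A buffer whose pages all sit in one folio (e.g. a huge page) is
 * coalesced into a single bvec entry. A NULL iov_base installs the dummy
 * placeholder instead.
 */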
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct folio *folio = NULL;

	*pimu = ctx->dummy_ubuf;
	if (!iov->iov_base)
		return 0;

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
			     &nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's a huge page, try to coalesce them into a single bvec entry */
	if (nr_pages > 1) {
		folio = page_folio(pages[0]);
		for (i = 1; i < nr_pages; i++) {
			if (page_folio(pages[i]) != folio) {
				folio = NULL;
				break;
			}
		}
		if (folio) {
			/*
			 * The pages are bound to the folio, it doesn't
			 * actually unpin them but drops all but one reference,
			 * which is usually put down by io_buffer_unmap().
			 * Note, needs a better helper.
			 */
			unpin_user_pages(&pages[1], nr_pages - 1);
			nr_pages = 1;
		}
	}

	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
	if (!imu)
		goto done;

	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret) {
		unpin_user_pages(pages, nr_pages);
		goto done;
	}

	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->ubuf_end = imu->ubuf + iov->iov_len;
	imu->nr_bvecs = nr_pages;
	*pimu = imu;
	ret = 0;

	if (folio) {
		bvec_set_page(&imu->bvec[0], pages[0], size, off);
		goto done;
	}
	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, PAGE_SIZE - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret)
		kvfree(imu);
	kvfree(pages);
	return ret;
}

static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
{
	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
	return ctx->user_bufs ? 0 : -ENOMEM;
}

int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data *data;
	int i, ret;
	struct iovec iov;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->user_bufs)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		return ret;
	ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
	if (ret)
		return ret;
	ret = io_buffers_map_alloc(ctx, nr_args);
	if (ret) {
		io_rsrc_data_free(data);
		return ret;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
		if (arg) {
			ret = io_copy_iov(ctx, &iov, arg, i);
			if (ret)
				break;
			ret = io_buffer_validate(&iov);
			if (ret)
				break;
		} else {
			memset(&iov, 0, sizeof(iov));
		}

		if (!iov.iov_base && *io_get_tag_slot(data, i)) {
			ret = -EINVAL;
			break;
		}

		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
					     &last_hpage);
		if (ret)
			break;
	}

	WARN_ON_ONCE(ctx->buf_data);

	ctx->buf_data = data;
	if (ret)
		__io_sqe_buffers_unregister(ctx);
	else
		io_rsrc_node_switch(ctx, NULL);
	return ret;
}

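/*
 * Set up @iter over the registered buffer described by @imu for an I/O
 * that spans [buf_addr, buf_addr + len); the range must lie entirely
 * within the mapped region.
 */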
int io_import_fixed(int ddir, struct iov_iter *iter,
		    struct io_mapped_ubuf *imu,
		    u64 buf_addr, size_t len)
{
	u64 buf_end;
	size_t offset;

	if (WARN_ON_ONCE(!imu))
		return -EFAULT;
	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
		return -EFAULT;

	/*
	 * Might not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or the whole first
		 * bvec), just use iov_iter_advance(). This makes it easier
		 * since we can just skip the first segment, which may not
		 * be PAGE_SIZE aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		if (offset <= bvec->bv_len) {
			/*
			 * Note, huge page buffers consist of one large
			 * bvec entry and should always go this way. The other
			 * branch doesn't expect non PAGE_SIZE'd chunks.
			 */
			iter->bvec = bvec;
			iter->nr_segs = bvec->bv_len;
			iter->count -= offset;
			iter->iov_offset = offset;
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> PAGE_SHIFT);

			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
			iter->count -= bvec->bv_len + offset;
			iter->iov_offset = offset & ~PAGE_MASK;
		}
	}

	return 0;
}