/*
 * io_uring/rsrc.c — fixed file and registered buffer (resource) management.
 * Imported from linux-2.6-block.git, commit 73572984 ("io_uring: deprecate
 * epoll_ctl support" tree); gitweb viewer artifacts removed.
 */
1// SPDX-License-Identifier: GPL-2.0
2#include <linux/kernel.h>
3#include <linux/errno.h>
4#include <linux/fs.h>
5#include <linux/file.h>
6#include <linux/mm.h>
7#include <linux/slab.h>
8#include <linux/nospec.h>
9#include <linux/hugetlb.h>
10#include <linux/compat.h>
11#include <linux/io_uring.h>
12
13#include <uapi/linux/io_uring.h>
14
15#include "io_uring_types.h"
16#include "io_uring.h"
17#include "openclose.h"
18#include "rsrc.h"
19
/*
 * Per-request state for IORING_OP_FILES_UPDATE, filled in by
 * io_files_update_prep() from the SQE.
 */
struct io_rsrc_update {
	struct file *file;
	u64 arg;	/* user address of the fd array to apply */
	u32 nr_args;	/* number of fds in the array */
	u32 offset;	/* first fixed-file slot to update */
};
26
/* Pins and maps one user iovec as a fixed buffer; defined later in this file. */
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage);

/* How many node refs io_rsrc_refs_refill() grabs at once to amortize cost. */
#define IO_RSRC_REF_BATCH 100

/* only define max */
#define IORING_MAX_FIXED_FILES (1U << 20)
#define IORING_MAX_REG_BUFFERS (1U << 14)
36
/*
 * Return all locally cached references back to the current rsrc node.
 * The cache (refilled in io_rsrc_refs_refill()) lets the submission path
 * hand out node refs without touching the percpu refcount each time.
 */
void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	if (ctx->rsrc_cached_refs) {
		io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs);
		ctx->rsrc_cached_refs = 0;
	}
}
45
/* Undo __io_account_mem(): release pages from the user's locked_vm count. */
static inline void __io_unaccount_mem(struct user_struct *user,
				      unsigned long nr_pages)
{
	atomic_long_sub(nr_pages, &user->locked_vm);
}
51
52static inline int __io_account_mem(struct user_struct *user,
53 unsigned long nr_pages)
54{
55 unsigned long page_limit, cur_pages, new_pages;
56
57 /* Don't allow more pages than we can safely lock */
58 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
59
60 do {
61 cur_pages = atomic_long_read(&user->locked_vm);
62 new_pages = cur_pages + nr_pages;
63 if (new_pages > page_limit)
64 return -ENOMEM;
65 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
66 new_pages) != cur_pages);
67
68 return 0;
69}
70
/*
 * Release the accounting taken by io_account_mem(): per-user locked_vm
 * (RLIMIT_MEMLOCK) and per-mm pinned_vm bookkeeping.
 */
static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}
79
80static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
81{
82 int ret;
83
84 if (ctx->user) {
85 ret = __io_account_mem(ctx->user, nr_pages);
86 if (ret)
87 return ret;
88 }
89
90 if (ctx->mm_account)
91 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
92
93 return 0;
94}
95
/*
 * Copy the @index'th iovec out of the user array @arg into @dst, decoding
 * the 32-bit compat_iovec layout when the ring belongs to a compat task.
 * Returns 0 on success, -EFAULT on a failed user copy.
 */
static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
		       void __user *arg, unsigned index)
{
	struct iovec __user *src;

#ifdef CONFIG_COMPAT
	if (ctx->compat) {
		struct compat_iovec __user *ciovs;
		struct compat_iovec ciov;

		ciovs = (struct compat_iovec __user *) arg;
		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
			return -EFAULT;

		/* widen the 32-bit compat pointer/length to native */
		dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
		dst->iov_len = ciov.iov_len;
		return 0;
	}
#endif
	src = (struct iovec __user *) arg;
	if (copy_from_user(dst, &src[index], sizeof(*dst)))
		return -EFAULT;
	return 0;
}
120
121static int io_buffer_validate(struct iovec *iov)
122{
123 unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
124
125 /*
126 * Don't impose further limits on the size and buffer
127 * constraints here, we'll -EINVAL later when IO is
128 * submitted if they are wrong.
129 */
130 if (!iov->iov_base)
131 return iov->iov_len ? -EFAULT : 0;
132 if (!iov->iov_len)
133 return -EFAULT;
134
135 /* arbitrary limit, but we need something */
136 if (iov->iov_len > SZ_1G)
137 return -EFAULT;
138
139 if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
140 return -EOVERFLOW;
141
142 return 0;
143}
144
/*
 * Tear down one registered-buffer slot: unpin its pages, return the memory
 * accounting and free the mapping. The shared ctx->dummy_ubuf marks a
 * sparse (empty) slot and must never be freed. The slot is always NULLed.
 */
static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
{
	struct io_mapped_ubuf *imu = *slot;
	unsigned int i;

	if (imu != ctx->dummy_ubuf) {
		for (i = 0; i < imu->nr_bvecs; i++)
			unpin_user_page(imu->bvec[i].bv_page);
		if (imu->acct_pages)
			io_unaccount_mem(ctx, imu->acct_pages);
		kvfree(imu);
	}
	*slot = NULL;
}
159
/*
 * Top up the local reference cache by taking a batch of refs on the
 * current rsrc node in one percpu operation; io_rsrc_refs_drop() returns
 * whatever is left unused.
 */
void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
	percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
}
166
/*
 * Process one retired rsrc node: for every queued resource, post a CQE if
 * the user attached a tag, then hand the resource to the data's put
 * callback (fput / buffer unmap). Finally destroy the node and, if this
 * was the last node of the data set, signal quiesce completion.
 */
static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
{
	struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
	struct io_ring_ctx *ctx = rsrc_data->ctx;
	struct io_rsrc_put *prsrc, *tmp;

	list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
		list_del(&prsrc->list);

		if (prsrc->tag) {
			/*
			 * IOPOLL rings protect the CQ ring with uring_lock
			 * instead of completion_lock alone, so take it here.
			 */
			if (ctx->flags & IORING_SETUP_IOPOLL)
				mutex_lock(&ctx->uring_lock);

			spin_lock(&ctx->completion_lock);
			io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);
			io_commit_cqring(ctx);
			spin_unlock(&ctx->completion_lock);
			io_cqring_ev_posted(ctx);

			if (ctx->flags & IORING_SETUP_IOPOLL)
				mutex_unlock(&ctx->uring_lock);
		}

		rsrc_data->do_put(ctx, prsrc);
		kfree(prsrc);
	}

	io_rsrc_node_destroy(ref_node);
	if (atomic_dec_and_test(&rsrc_data->refs))
		complete(&rsrc_data->done);
}
198
199void io_rsrc_put_work(struct work_struct *work)
200{
201 struct io_ring_ctx *ctx;
202 struct llist_node *node;
203
204 ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
205 node = llist_del_all(&ctx->rsrc_put_llist);
206
207 while (node) {
208 struct io_rsrc_node *ref_node;
209 struct llist_node *next = node->next;
210
211 ref_node = llist_entry(node, struct io_rsrc_node, llist);
212 __io_rsrc_put_work(ref_node);
213 node = next;
214 }
215}
216
/*
 * Drop the initial reference on @data and, if outstanding node references
 * remain, block until the last put completes data->done.
 */
void io_wait_rsrc_data(struct io_rsrc_data *data)
{
	if (data && !atomic_dec_and_test(&data->refs))
		wait_for_completion(&data->done);
}
222
/* Free an rsrc node after its percpu refcount has been torn down. */
void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
{
	percpu_ref_exit(&ref_node->refs);
	kfree(ref_node);
}
228
/*
 * percpu_ref release callback, invoked (possibly from irq context, hence
 * the irqsave lock) when the last reference to a killed rsrc node drops.
 * Nodes must be retired in the order they were switched out, so only mark
 * this node done and then move the leading run of completed nodes onto
 * the put list; processing is punted to rsrc_put_work.
 */
static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
{
	struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
	struct io_ring_ctx *ctx = node->rsrc_data->ctx;
	unsigned long flags;
	bool first_add = false;
	unsigned long delay = HZ;

	spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
	node->done = true;

	/* if we are mid-quiesce then do not delay */
	if (node->rsrc_data->quiesce)
		delay = 0;

	while (!list_empty(&ctx->rsrc_ref_list)) {
		node = list_first_entry(&ctx->rsrc_ref_list,
					struct io_rsrc_node, node);
		/* recycle ref nodes in order */
		if (!node->done)
			break;
		list_del(&node->node);
		first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
	}
	spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);

	/* only kick the worker if we queued the first entry */
	if (first_add)
		mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
}
258
259static struct io_rsrc_node *io_rsrc_node_alloc(void)
260{
261 struct io_rsrc_node *ref_node;
262
263 ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
264 if (!ref_node)
265 return NULL;
266
267 if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
268 0, GFP_KERNEL)) {
269 kfree(ref_node);
270 return NULL;
271 }
272 INIT_LIST_HEAD(&ref_node->node);
273 INIT_LIST_HEAD(&ref_node->rsrc_list);
274 ref_node->done = false;
275 return ref_node;
276}
277
/*
 * Retire the current rsrc node (attributing its queued puts to
 * @data_to_kill) and install the pre-allocated backup node as the new
 * current one. io_rsrc_node_switch_start() must have been called first so
 * a backup node is guaranteed to exist.
 */
void io_rsrc_node_switch(struct io_ring_ctx *ctx,
			 struct io_rsrc_data *data_to_kill)
	__must_hold(&ctx->uring_lock)
{
	WARN_ON_ONCE(!ctx->rsrc_backup_node);
	WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);

	/* cached refs would be leaked once the node's ref is killed */
	io_rsrc_refs_drop(ctx);

	if (data_to_kill) {
		struct io_rsrc_node *rsrc_node = ctx->rsrc_node;

		rsrc_node->rsrc_data = data_to_kill;
		spin_lock_irq(&ctx->rsrc_ref_lock);
		/* tail insert keeps retirement in switch order */
		list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
		spin_unlock_irq(&ctx->rsrc_ref_lock);

		/* dropped by __io_rsrc_put_work() when the node retires */
		atomic_inc(&data_to_kill->refs);
		percpu_ref_kill(&rsrc_node->refs);
		ctx->rsrc_node = NULL;
	}

	if (!ctx->rsrc_node) {
		ctx->rsrc_node = ctx->rsrc_backup_node;
		ctx->rsrc_backup_node = NULL;
	}
}
305
306int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
307{
308 if (ctx->rsrc_backup_node)
309 return 0;
310 ctx->rsrc_backup_node = io_rsrc_node_alloc();
311 return ctx->rsrc_backup_node ? 0 : -ENOMEM;
312}
313
/*
 * Wait until every outstanding reference to @data has been dropped, so
 * the resource table can be torn down safely. Retires the current node,
 * drops the initial data ref, and sleeps on data->done with ->uring_lock
 * released; handles the data being revived by another thread while
 * unlocked, and interruption by signals/task_work. Returns 0 once fully
 * quiesced, or a negative error (e.g. -ENXIO if already quiescing).
 */
__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
				      struct io_ring_ctx *ctx)
{
	int ret;

	/* As we may drop ->uring_lock, other task may have started quiesce */
	if (data->quiesce)
		return -ENXIO;

	data->quiesce = true;
	do {
		ret = io_rsrc_node_switch_start(ctx);
		if (ret)
			break;
		io_rsrc_node_switch(ctx, data);

		/* kill initial ref, already quiesced if zero */
		if (atomic_dec_and_test(&data->refs))
			break;
		mutex_unlock(&ctx->uring_lock);
		flush_delayed_work(&ctx->rsrc_put_work);
		ret = wait_for_completion_interruptible(&data->done);
		if (!ret) {
			mutex_lock(&ctx->uring_lock);
			if (atomic_read(&data->refs) > 0) {
				/*
				 * it has been revived by another thread while
				 * we were unlocked
				 */
				mutex_unlock(&ctx->uring_lock);
			} else {
				break;
			}
		}

		/* interrupted or revived: restore the ref and retry */
		atomic_inc(&data->refs);
		/* wait for all works potentially completing data->done */
		flush_delayed_work(&ctx->rsrc_put_work);
		reinit_completion(&data->done);

		ret = io_run_task_work_sig();
		mutex_lock(&ctx->uring_lock);
	} while (ret >= 0);
	data->quiesce = false;

	return ret;
}
361
362static void io_free_page_table(void **table, size_t size)
363{
364 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
365
366 for (i = 0; i < nr_tables; i++)
367 kfree(table[i]);
368 kfree(table);
369}
370
/* Free an io_rsrc_data and its paged tag table. */
static void io_rsrc_data_free(struct io_rsrc_data *data)
{
	size_t size = data->nr * sizeof(data->tags[0][0]);

	if (data->tags)
		io_free_page_table((void **)data->tags, size);
	kfree(data);
}
379
380static __cold void **io_alloc_page_table(size_t size)
381{
382 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
383 size_t init_size = size;
384 void **table;
385
386 table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
387 if (!table)
388 return NULL;
389
390 for (i = 0; i < nr_tables; i++) {
391 unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
392
393 table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
394 if (!table[i]) {
395 io_free_page_table(table, init_size);
396 return NULL;
397 }
398 size -= this_size;
399 }
400 return table;
401}
402
/*
 * Allocate an io_rsrc_data for @nr resources with put callback @do_put,
 * optionally copying per-slot tags from the user array @utags. On success
 * stores the result in *pdata with an initial reference held; returns 0,
 * -ENOMEM, or -EFAULT on a failed tag copy.
 */
__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx,
				     rsrc_put_fn *do_put, u64 __user *utags,
				     unsigned nr, struct io_rsrc_data **pdata)
{
	struct io_rsrc_data *data;
	int ret = -ENOMEM;
	unsigned i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;
	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
	if (!data->tags) {
		kfree(data);
		return -ENOMEM;
	}

	data->nr = nr;
	data->ctx = ctx;
	data->do_put = do_put;
	if (utags) {
		ret = -EFAULT;
		for (i = 0; i < nr; i++) {
			u64 *tag_slot = io_get_tag_slot(data, i);

			if (copy_from_user(tag_slot, &utags[i],
					   sizeof(*tag_slot)))
				goto fail;
		}
	}

	/* initial ref, dropped by quiesce / io_wait_rsrc_data() */
	atomic_set(&data->refs, 1);
	init_completion(&data->done);
	*pdata = data;
	return 0;
fail:
	io_rsrc_data_free(data);
	return ret;
}
442
/*
 * Apply an IORING_REGISTER_FILES_UPDATE2 request: for each entry replace,
 * clear (fd == -1) or skip a fixed-file slot, queueing displaced files on
 * the current rsrc node for deferred fput. Returns the number of entries
 * processed, or an error if none were.
 */
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	struct io_rsrc_data *data = ctx->file_data;
	struct io_fixed_file *file_slot;
	struct file *file;
	int fd, i, err = 0;
	unsigned int done;
	bool needs_switch = false;

	if (!ctx->file_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_files)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		/* tags are meaningless on slots that aren't being filled */
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
		file_slot = io_fixed_file_slot(&ctx->file_table, i);

		if (file_slot->file_ptr) {
			/* defer fput of the old file to the rsrc node */
			file = (struct file *)(file_slot->file_ptr & FFS_MASK);
			err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
			if (err)
				break;
			file_slot->file_ptr = 0;
			io_file_bitmap_clear(&ctx->file_table, i);
			needs_switch = true;
		}
		if (fd != -1) {
			file = fget(fd);
			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered. If
			 * UNIX isn't enabled, then this causes a reference
			 * cycle and this instance can never get freed. If UNIX
			 * is enabled we'll handle it just fine, but there's
			 * still no point in allowing a ring fd as it doesn't
			 * support regular read/write anyway.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			err = io_scm_file_account(ctx, file);
			if (err) {
				fput(file);
				break;
			}
			*io_get_tag_slot(data, i) = tag;
			io_fixed_file_set(file_slot, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}

	/* retire the node holding the displaced files */
	if (needs_switch)
		io_rsrc_node_switch(ctx, data);
	return done ? done : err;
}
522
/*
 * Apply an IORING_REGISTER_BUFFERS_UPDATE2 request: register each new
 * iovec and swap it into its slot, queueing any previously registered
 * buffer on the current rsrc node for deferred unmap. Returns the number
 * of entries processed, or an error if none were.
 */
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
	struct page *last_hpage = NULL;
	bool needs_switch = false;
	__u32 done;
	int i, err;

	if (!ctx->buf_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_bufs)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_mapped_ubuf *imu;
		int offset = up->offset + done;
		u64 tag = 0;

		err = io_copy_iov(ctx, &iov, iovs, done);
		if (err)
			break;
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(&iov);
		if (err)
			break;
		/* a tag on an empty (sparse) slot makes no sense */
		if (!iov.iov_base && tag) {
			err = -EINVAL;
			break;
		}
		err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
		if (err)
			break;

		i = array_index_nospec(offset, ctx->nr_user_bufs);
		if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
			/* defer unmap of the old buffer to the rsrc node */
			err = io_queue_rsrc_removal(ctx->buf_data, i,
						    ctx->rsrc_node, ctx->user_bufs[i]);
			if (unlikely(err)) {
				io_buffer_unmap(ctx, &imu);
				break;
			}
			ctx->user_bufs[i] = NULL;
			needs_switch = true;
		}

		ctx->user_bufs[i] = imu;
		*io_get_tag_slot(ctx->buf_data, offset) = tag;
	}

	if (needs_switch)
		io_rsrc_node_switch(ctx, ctx->buf_data);
	return done ? done : err;
}
582
583static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
584 struct io_uring_rsrc_update2 *up,
585 unsigned nr_args)
586{
587 __u32 tmp;
588 int err;
589
590 if (check_add_overflow(up->offset, nr_args, &tmp))
591 return -EOVERFLOW;
592 err = io_rsrc_node_switch_start(ctx);
593 if (err)
594 return err;
595
596 switch (type) {
597 case IORING_RSRC_FILE:
598 return __io_sqe_files_update(ctx, up, nr_args);
599 case IORING_RSRC_BUFFER:
600 return __io_sqe_buffers_update(ctx, up, nr_args);
601 }
602 return -EINVAL;
603}
604
/*
 * Legacy IORING_REGISTER_FILES_UPDATE: the user passes the smaller
 * io_uring_rsrc_update struct, so only that many bytes are copied and the
 * remainder of the v2 struct stays zeroed from the memset.
 */
int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}
619
620int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
621 unsigned size, unsigned type)
622{
623 struct io_uring_rsrc_update2 up;
624
625 if (size != sizeof(up))
626 return -EINVAL;
627 if (copy_from_user(&up, arg, sizeof(up)))
628 return -EFAULT;
629 if (!up.nr || up.resv || up.resv2)
630 return -EINVAL;
631 return __io_register_rsrc_update(ctx, type, &up, up.nr);
632}
633
/*
 * IORING_REGISTER_FILES2 / BUFFERS2: validate the registration struct and
 * hand off to the file or buffer register path. A SPARSE registration
 * must not also pass a data array — that combination falls through to
 * -EINVAL via the break.
 */
__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}
665
666int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
667{
668 struct io_rsrc_update *up = io_kiocb_to_cmd(req);
669
670 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
671 return -EINVAL;
672 if (sqe->rw_flags || sqe->splice_fd_in)
673 return -EINVAL;
674
675 up->offset = READ_ONCE(sqe->off);
676 up->nr_args = READ_ONCE(sqe->len);
677 if (!up->nr_args)
678 return -EINVAL;
679 up->arg = READ_ONCE(sqe->addr);
680 return 0;
681}
682
/*
 * IORING_OP_FILES_UPDATE with offset == IORING_FILE_INDEX_ALLOC: install
 * each fd into a kernel-chosen free fixed-file slot and write the chosen
 * slot index back into the user's fd array. If the write-back faults, the
 * just-installed slot is closed again before bailing. Returns the number
 * of fds installed, or an error if none were.
 */
static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_data)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			/* can't report the slot, so undo the install */
			__io_close_fixed(req, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}
721
722int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
723{
724 struct io_rsrc_update *up = io_kiocb_to_cmd(req);
725 struct io_ring_ctx *ctx = req->ctx;
726 struct io_uring_rsrc_update2 up2;
727 int ret;
728
729 up2.offset = up->offset;
730 up2.data = up->arg;
731 up2.nr = 0;
732 up2.tags = 0;
733 up2.resv = 0;
734 up2.resv2 = 0;
735
736 if (up->offset == IORING_FILE_INDEX_ALLOC) {
737 ret = io_files_update_with_index_alloc(req, issue_flags);
738 } else {
739 io_ring_submit_lock(ctx, issue_flags);
740 ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
741 &up2, up->nr_args);
742 io_ring_submit_unlock(ctx, issue_flags);
743 }
744
745 if (ret < 0)
746 req_set_fail(req);
747 io_req_set_res(req, ret, 0);
748 return IOU_OK;
749}
750
751int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
752 struct io_rsrc_node *node, void *rsrc)
753{
754 u64 *tag_slot = io_get_tag_slot(data, idx);
755 struct io_rsrc_put *prsrc;
756
757 prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
758 if (!prsrc)
759 return -ENOMEM;
760
761 prsrc->tag = *tag_slot;
762 *tag_slot = 0;
763 prsrc->rsrc = rsrc;
764 list_add(&prsrc->list, &node->rsrc_list);
765 return 0;
766}
767
/*
 * Final teardown of the fixed-file table. Files accounted to the SCM/UNIX
 * gc (FFS_SCM) are released via the ring socket's receive queue; the rest
 * are fput here. Frees the table and the rsrc data.
 */
void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
#if !defined(IO_URING_SCM_ALL)
	int i;

	for (i = 0; i < ctx->nr_user_files; i++) {
		struct file *file = io_file_from_index(&ctx->file_table, i);

		if (!file)
			continue;
		/* SCM-accounted files are dropped via the socket below */
		if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM)
			continue;
		io_file_bitmap_clear(&ctx->file_table, i);
		fput(file);
	}
#endif

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		struct sock *sock = ctx->ring_sock->sk;
		struct sk_buff *skb;

		/* skb destructors release the SCM-held file references */
		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
			kfree_skb(skb);
	}
#endif
	io_free_file_tables(&ctx->file_table);
	io_rsrc_data_free(ctx->file_data);
	ctx->file_data = NULL;
	ctx->nr_user_files = 0;
}
799
/*
 * IORING_UNREGISTER_FILES: quiesce all references to the file table, then
 * tear it down. nr_user_files is zeroed across the quiesce (which may
 * drop ->uring_lock) so no new request can index the table meanwhile.
 */
int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_files;
	int ret;

	if (!ctx->file_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_files = 0;
	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
	ctx->nr_user_files = nr;
	if (!ret)
		__io_sqe_files_unregister(ctx);
	return ret;
}
819
/*
 * Ensure the UNIX gc is aware of our file set, so we are certain that
 * the io_uring can be safely unregistered on process exit, even if we have
 * loops in the file referencing. We account only files that can hold other
 * files because otherwise they can't form a loop and so are not interesting
 * for GC.
 */
int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
	struct sock *sk = ctx->ring_sock->sk;
	struct sk_buff_head *head = &sk->sk_receive_queue;
	struct scm_fp_list *fpl;
	struct sk_buff *skb;

	if (likely(!io_file_need_scm(file)))
		return 0;

	/*
	 * See if we can merge this file into an existing skb SCM_RIGHTS
	 * file set. If there's no room, fall back to allocating a new skb
	 * and filling it in.
	 */
	spin_lock_irq(&head->lock);
	skb = skb_peek(head);
	if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
		__skb_unlink(skb, head);
	else
		skb = NULL;
	spin_unlock_irq(&head->lock);

	if (!skb) {
		fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
		if (!fpl)
			return -ENOMEM;

		skb = alloc_skb(0, GFP_KERNEL);
		if (!skb) {
			kfree(fpl);
			return -ENOMEM;
		}

		fpl->user = get_uid(current_user());
		fpl->max = SCM_MAX_FD;
		fpl->count = 0;

		UNIXCB(skb).fp = fpl;
		skb->sk = sk;
		skb->destructor = unix_destruct_scm;
		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
	}

	/* the skb's fp list takes its own reference on the file */
	fpl = UNIXCB(skb).fp;
	fpl->fp[fpl->count++] = get_file(file);
	unix_inflight(fpl->user, file);
	skb_queue_head(head, skb);
	/* drop the caller's ref; the SCM ref keeps the file alive */
	fput(file);
#endif
	return 0;
}
880
/*
 * rsrc put callback for fixed files. With CONFIG_UNIX, an SCM-accounted
 * file must also be removed from whichever SCM_RIGHTS skb on the ring
 * socket references it: scan the queue, drop the entry, compact the fp
 * array (freeing the skb if it becomes empty) and requeue the untouched
 * skbs in their original order. Without UNIX it is a plain fput.
 */
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	struct file *file = prsrc->file;
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head list, *head = &sock->sk_receive_queue;
	struct sk_buff *skb;
	int i;

	if (!io_file_need_scm(file)) {
		fput(file);
		return;
	}

	__skb_queue_head_init(&list);

	/*
	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
	 * remove this entry and rearrange the file array.
	 */
	skb = skb_dequeue(head);
	while (skb) {
		struct scm_fp_list *fp;

		fp = UNIXCB(skb).fp;
		for (i = 0; i < fp->count; i++) {
			int left;

			if (fp->fp[i] != file)
				continue;

			unix_notinflight(fp->user, fp->fp[i]);
			/* close the gap left in the fp array */
			left = fp->count - 1 - i;
			if (left) {
				memmove(&fp->fp[i], &fp->fp[i + 1],
					left * sizeof(struct file *));
			}
			fp->count--;
			if (!fp->count) {
				kfree_skb(skb);
				skb = NULL;
			} else {
				__skb_queue_tail(&list, skb);
			}
			fput(file);
			file = NULL;	/* found: stop scanning */
			break;
		}

		if (!file)
			break;

		__skb_queue_tail(&list, skb);

		skb = skb_dequeue(head);
	}

	/* put the skbs we set aside back on the socket, order preserved */
	if (skb_peek(&list)) {
		spin_lock_irq(&head->lock);
		while ((skb = __skb_dequeue(&list)) != NULL)
			__skb_queue_tail(head, skb);
		spin_unlock_irq(&head->lock);
	}
#else
	fput(file);
#endif
}
948
/*
 * IORING_REGISTER_FILES(2): build the fixed-file table from the user fd
 * array (@fds may be NULL for a fully sparse set; fd == -1 leaves a slot
 * empty), optionally with per-slot tags. On any failure the partially
 * built table is torn down. Returns 0 or a negative error.
 */
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_data)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		return ret;
	ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
				 &ctx->file_data);
	if (ret)
		return ret;

	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
		io_rsrc_data_free(ctx->file_data);
		ctx->file_data = NULL;
		return -ENOMEM;
	}

	/* nr_user_files tracks progress so teardown only touches valid slots */
	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
		struct io_fixed_file *file_slot;

		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
			ret = -EFAULT;
			goto fail;
		}
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			/* a tag on an empty slot is rejected */
			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered. If UNIX
		 * isn't enabled, then this causes a reference cycle and this
		 * instance can never get freed. If UNIX is enabled we'll
		 * handle it just fine, but there's still no point in allowing
		 * a ring fd as it doesn't support regular read/write anyway.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = io_scm_file_account(ctx, file);
		if (ret) {
			fput(file);
			goto fail;
		}
		file_slot = io_fixed_file_slot(&ctx->file_table, i);
		io_fixed_file_set(file_slot, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	io_rsrc_node_switch(ctx, NULL);
	return 0;
fail:
	__io_sqe_files_unregister(ctx);
	return ret;
}
1026
/* rsrc put callback for registered buffers: unmap and clear the entry. */
static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	io_buffer_unmap(ctx, &prsrc->buf);
	prsrc->buf = NULL;
}
1032
1033void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
1034{
1035 unsigned int i;
1036
1037 for (i = 0; i < ctx->nr_user_bufs; i++)
1038 io_buffer_unmap(ctx, &ctx->user_bufs[i]);
1039 kfree(ctx->user_bufs);
1040 io_rsrc_data_free(ctx->buf_data);
1041 ctx->user_bufs = NULL;
1042 ctx->buf_data = NULL;
1043 ctx->nr_user_bufs = 0;
1044}
1045
/*
 * IORING_UNREGISTER_BUFFERS: quiesce all references to the buffer table,
 * then tear it down. nr_user_bufs is zeroed across the quiesce (which may
 * drop ->uring_lock) so no new request can index the table meanwhile.
 */
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_bufs;
	int ret;

	if (!ctx->buf_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_bufs = 0;
	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
	ctx->nr_user_bufs = nr;
	if (!ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}
1065
1066/*
1067 * Not super efficient, but this is just a registration time. And we do cache
1068 * the last compound head, so generally we'll only do a full search if we don't
1069 * match that one.
1070 *
1071 * We check if the given compound head page has already been accounted, to
1072 * avoid double accounting it. This allows us to account the full size of the
1073 * page, not just the constituent pages of a huge page.
1074 */
1075static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
1076 int nr_pages, struct page *hpage)
1077{
1078 int i, j;
1079
1080 /* check current page array */
1081 for (i = 0; i < nr_pages; i++) {
1082 if (!PageCompound(pages[i]))
1083 continue;
1084 if (compound_head(pages[i]) == hpage)
1085 return true;
1086 }
1087
1088 /* check previously registered pages */
1089 for (i = 0; i < ctx->nr_user_bufs; i++) {
1090 struct io_mapped_ubuf *imu = ctx->user_bufs[i];
1091
1092 for (j = 0; j < imu->nr_bvecs; j++) {
1093 if (!PageCompound(imu->bvec[j].bv_page))
1094 continue;
1095 if (compound_head(imu->bvec[j].bv_page) == hpage)
1096 return true;
1097 }
1098 }
1099
1100 return false;
1101}
1102
1103static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
1104 int nr_pages, struct io_mapped_ubuf *imu,
1105 struct page **last_hpage)
1106{
1107 int i, ret;
1108
1109 imu->acct_pages = 0;
1110 for (i = 0; i < nr_pages; i++) {
1111 if (!PageCompound(pages[i])) {
1112 imu->acct_pages++;
1113 } else {
1114 struct page *hpage;
1115
1116 hpage = compound_head(pages[i]);
1117 if (hpage == *last_hpage)
1118 continue;
1119 *last_hpage = hpage;
1120 if (headpage_already_acct(ctx, pages, i, hpage))
1121 continue;
1122 imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
1123 }
1124 }
1125
1126 if (!imu->acct_pages)
1127 return 0;
1128
1129 ret = io_account_mem(ctx, imu->acct_pages);
1130 if (ret)
1131 imu->acct_pages = 0;
1132 return ret;
1133}
1134
/*
 * Pin the user range [ubuf, ubuf+len) with FOLL_LONGTERM and return the
 * page array (count in *npages). File-backed VMAs other than shmem and
 * hugetlbfs are rejected with -EOPNOTSUPP; any partially pinned pages are
 * released on failure. Returns the array or an ERR_PTR.
 */
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
{
	unsigned long start, end, nr_pages;
	struct vm_area_struct **vmas = NULL;
	struct page **pages = NULL;
	int i, pret, ret = -ENOMEM;

	end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	start = ubuf >> PAGE_SHIFT;
	nr_pages = end - start;

	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		goto done;

	vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
			      GFP_KERNEL);
	if (!vmas)
		goto done;

	ret = 0;
	mmap_read_lock(current->mm);
	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
			      pages, vmas);
	if (pret == nr_pages) {
		/* don't support file backed memory */
		for (i = 0; i < nr_pages; i++) {
			struct vm_area_struct *vma = vmas[i];

			if (vma_is_shmem(vma))
				continue;
			if (vma->vm_file &&
			    !is_file_hugepages(vma->vm_file)) {
				ret = -EOPNOTSUPP;
				break;
			}
		}
		*npages = nr_pages;
	} else {
		ret = pret < 0 ? pret : -EFAULT;
	}
	mmap_read_unlock(current->mm);
	if (ret) {
		/*
		 * if we did partial map, or found file backed vmas,
		 * release any pages we did get
		 */
		if (pret > 0)
			unpin_user_pages(pages, pret);
		goto done;
	}
	ret = 0;
done:
	kvfree(vmas);
	if (ret < 0) {
		kvfree(pages);
		pages = ERR_PTR(ret);
	}
	return pages;
}
1195
/*
 * Pin and map one user iovec as a fixed buffer. A NULL iov_base installs
 * the shared dummy_ubuf (sparse slot). On success *pimu holds the new
 * mapping with one bvec per pinned page; memory accounting is charged via
 * io_buffer_account_pin(). Returns 0 or a negative error.
 */
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;

	if (!iov->iov_base) {
		*pimu = ctx->dummy_ubuf;
		return 0;
	}

	*pimu = NULL;
	ret = -ENOMEM;

	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
			     &nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
	if (!imu)
		goto done;

	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret) {
		unpin_user_pages(pages, nr_pages);
		goto done;
	}

	/* split the buffer into per-page bvecs; only the first has an offset */
	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
	size = iov->iov_len;
	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, PAGE_SIZE - off);
		imu->bvec[i].bv_page = pages[i];
		imu->bvec[i].bv_len = vec_len;
		imu->bvec[i].bv_offset = off;
		off = 0;
		size -= vec_len;
	}
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->ubuf_end = imu->ubuf + iov->iov_len;
	imu->nr_bvecs = nr_pages;
	*pimu = imu;
	ret = 0;
done:
	if (ret)
		kvfree(imu);
	kvfree(pages);
	return ret;
}
1256
1257static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
1258{
1259 ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
1260 return ctx->user_bufs ? 0 : -ENOMEM;
1261}
1262
/*
 * IORING_REGISTER_BUFFERS(2): register up to IORING_MAX_REG_BUFFERS user
 * iovecs as fixed buffers (@arg may be NULL for a fully sparse set),
 * optionally with per-slot tags. On any failure the partially built table
 * is torn down. Returns 0 or a negative error.
 */
int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data *data;
	int i, ret;
	struct iovec iov;

	/* buffer indices must fit in 16 bits elsewhere in the ABI */
	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->user_bufs)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		return ret;
	ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
	if (ret)
		return ret;
	ret = io_buffers_map_alloc(ctx, nr_args);
	if (ret) {
		io_rsrc_data_free(data);
		return ret;
	}

	/* nr_user_bufs tracks progress so teardown only touches valid slots */
	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
		if (arg) {
			ret = io_copy_iov(ctx, &iov, arg, i);
			if (ret)
				break;
			ret = io_buffer_validate(&iov);
			if (ret)
				break;
		} else {
			/* NULL arg: fully sparse registration */
			memset(&iov, 0, sizeof(iov));
		}

		/* a tag on an empty (sparse) slot is rejected */
		if (!iov.iov_base && *io_get_tag_slot(data, i)) {
			ret = -EINVAL;
			break;
		}

		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
					     &last_hpage);
		if (ret)
			break;
	}

	WARN_ON_ONCE(ctx->buf_data);

	ctx->buf_data = data;
	if (ret)
		__io_sqe_buffers_unregister(ctx);
	else
		io_rsrc_node_switch(ctx, NULL);
	return ret;
}