// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "opdef.h"
#include "kbuf.h"

#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))

#define BGID_ARRAY	64

struct io_provide_buf {
	struct file			*file;
	__u64				addr;
	__u32				len;
	__u32				bgid;
	__u16				nbufs;
	__u16				bid;
};

static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
							unsigned int bgid)
{
	if (ctx->io_bl && bgid < BGID_ARRAY)
		return &ctx->io_bl[bgid];

	return xa_load(&ctx->io_bl_xa, bgid);
}

static int io_buffer_add_list(struct io_ring_ctx *ctx,
			      struct io_buffer_list *bl, unsigned int bgid)
{
	bl->bgid = bgid;
	if (bgid < BGID_ARRAY)
		return 0;

	return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}

void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	struct io_buffer *buf;

	/*
	 * For legacy provided buffer mode, don't recycle if we already did
	 * IO to this buffer. For ring-mapped provided buffer mode, we should
	 * increment ring->head to explicitly monopolize the buffer to avoid
	 * multiple use.
	 */
	if (req->flags & REQ_F_PARTIAL_IO)
		return;

	io_ring_submit_lock(ctx, issue_flags);

	buf = req->kbuf;
	bl = io_buffer_get_list(ctx, buf->bgid);
	list_add(&buf->list, &bl->buf_list);
	req->flags &= ~REQ_F_BUFFER_SELECTED;
	req->buf_index = buf->bgid;

	io_ring_submit_unlock(ctx, issue_flags);
	return;
}

unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
{
	unsigned int cflags;

	/*
	 * We can add this buffer back to two lists:
	 *
	 * 1) The io_buffers_cache list. This one is protected by the
	 *    ctx->uring_lock. If we already hold this lock, add back to this
	 *    list as we can grab it from issue as well.
	 * 2) The io_buffers_comp list. This one is protected by the
	 *    ctx->completion_lock.
	 *
	 * We migrate buffers from the comp_list to the issue cache list
	 * when we need one.
	 */
	if (req->flags & REQ_F_BUFFER_RING) {
		/* no buffers to recycle for this case */
		cflags = __io_put_kbuf_list(req, NULL);
	} else if (issue_flags & IO_URING_F_UNLOCKED) {
		struct io_ring_ctx *ctx = req->ctx;

		spin_lock(&ctx->completion_lock);
		cflags = __io_put_kbuf_list(req, &ctx->io_buffers_comp);
		spin_unlock(&ctx->completion_lock);
	} else {
		lockdep_assert_held(&req->ctx->uring_lock);

		cflags = __io_put_kbuf_list(req, &req->ctx->io_buffers_cache);
	}
	return cflags;
}

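/*
 * Editor's sketch (not part of kbuf.c): the cflags returned above are folded
 * into cqe->flags by the completion path. A consumer recovers the buffer ID
 * with the uapi constants from <linux/io_uring.h>; "my_bufs" is a hypothetical
 * application-side table mapping bid to buffer address.
 */
static inline void *example_cqe_to_buffer(const struct io_uring_cqe *cqe,
					  void *my_bufs[])
{
	if (!(cqe->flags & IORING_CQE_F_BUFFER))
		return NULL;	/* no provided buffer was consumed */
	return my_bufs[cqe->flags >> IORING_CQE_BUFFER_SHIFT];
}
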
static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
					      struct io_buffer_list *bl)
{
	if (!list_empty(&bl->buf_list)) {
		struct io_buffer *kbuf;

		kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
		list_del(&kbuf->list);
		if (*len == 0 || *len > kbuf->len)
			*len = kbuf->len;
		req->flags |= REQ_F_BUFFER_SELECTED;
		req->kbuf = kbuf;
		req->buf_index = kbuf->bid;
		return u64_to_user_ptr(kbuf->addr);
	}
	return NULL;
}

static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
					  struct io_buffer_list *bl,
					  unsigned int issue_flags)
{
	struct io_uring_buf_ring *br = bl->buf_ring;
	struct io_uring_buf *buf;
	__u16 head = bl->head;

	if (unlikely(smp_load_acquire(&br->tail) == head))
		return NULL;

	head &= bl->mask;
	/* mmaped buffers are always contig */
	if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) {
		buf = &br->bufs[head];
	} else {
		int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
		int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;
		buf = page_address(bl->buf_pages[index]);
		buf += off;
	}
	if (*len == 0 || *len > buf->len)
		*len = buf->len;
	req->flags |= REQ_F_BUFFER_RING;
	req->buf_list = bl;
	req->buf_index = buf->bid;

	if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) {
		/*
		 * If we came in unlocked, we have no choice but to consume the
		 * buffer here, otherwise nothing ensures that the buffer won't
		 * get used by others. This does mean it'll be pinned until the
		 * IO completes; coming in unlocked means we're being called from
		 * io-wq context and there may be further retries in async hybrid
		 * mode. For the locked case, the caller must call commit when
		 * the transfer completes (or if we get -EAGAIN and must poll or
		 * retry).
		 */
		req->buf_list = NULL;
		bl->head++;
	}
	return u64_to_user_ptr(buf->addr);
}

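/*
 * Editor's sketch (not part of kbuf.c): the userspace producer that pairs
 * with the consumer above. The application fills the slot at (tail & mask)
 * and then publishes it with a release store to br->tail, which is what the
 * smp_load_acquire() on the tail in io_ring_buffer_select() pairs with.
 * Field names come from the uapi <linux/io_uring.h>; the helper itself is
 * hypothetical (liburing's io_uring_buf_ring_add()/io_uring_buf_ring_advance()
 * perform the same steps).
 */
static inline void example_buf_ring_publish(struct io_uring_buf_ring *br,
					    __u64 addr, __u32 len, __u16 bid,
					    __u16 mask)
{
	struct io_uring_buf *buf = &br->bufs[br->tail & mask];

	buf->addr = addr;
	buf->len = len;
	buf->bid = bid;
	/* make the filled slot visible before bumping the tail */
	smp_store_release(&br->tail, (__u16)(br->tail + 1));
}
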
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
			      unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	void __user *ret = NULL;

	io_ring_submit_lock(req->ctx, issue_flags);

	bl = io_buffer_get_list(ctx, req->buf_index);
	if (likely(bl)) {
		if (bl->is_mapped)
			ret = io_ring_buffer_select(req, len, bl, issue_flags);
		else
			ret = io_provided_buffer_select(req, len, bl);
	}
	io_ring_submit_unlock(req->ctx, issue_flags);
	return ret;
}

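/*
 * Editor's sketch (not part of kbuf.c): how a request reaches
 * io_buffer_select(). Userspace sets IOSQE_BUFFER_SELECT and puts the buffer
 * group ID in sqe->buf_group, and the kernel picks a buffer from that group
 * at issue time. The liburing helpers are an assumption about the caller's
 * toolkit, not something this file mandates.
 */
static void example_submit_recv_with_buffer_select(struct io_uring *ring,
						   int sockfd, __u16 bgid,
						   unsigned int max_len)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	io_uring_prep_recv(sqe, sockfd, NULL, max_len, 0);
	io_uring_sqe_set_flags(sqe, IOSQE_BUFFER_SELECT);
	sqe->buf_group = bgid;		/* matched against bl->bgid above */
	io_uring_submit(ring);
}
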
static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
{
	int i;

	ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list),
				GFP_KERNEL);
	if (!ctx->io_bl)
		return -ENOMEM;

	for (i = 0; i < BGID_ARRAY; i++) {
		INIT_LIST_HEAD(&ctx->io_bl[i].buf_list);
		ctx->io_bl[i].bgid = i;
	}

	return 0;
}

static int __io_remove_buffers(struct io_ring_ctx *ctx,
			       struct io_buffer_list *bl, unsigned nbufs)
{
	unsigned i = 0;

	/* shouldn't happen */
	if (!nbufs)
		return 0;

	if (bl->is_mapped) {
		i = bl->buf_ring->tail - bl->head;
		if (bl->is_mmap) {
			struct page *page;

			page = virt_to_head_page(bl->buf_ring);
			if (put_page_testzero(page))
				free_compound_page(page);
			bl->buf_ring = NULL;
			bl->is_mmap = 0;
		} else if (bl->buf_nr_pages) {
			int j;

			for (j = 0; j < bl->buf_nr_pages; j++)
				unpin_user_page(bl->buf_pages[j]);
			kvfree(bl->buf_pages);
			bl->buf_pages = NULL;
			bl->buf_nr_pages = 0;
		}
		/* make sure it's seen as empty */
		INIT_LIST_HEAD(&bl->buf_list);
		bl->is_mapped = 0;
		return i;
	}

	/* protects io_buffers_cache */
	lockdep_assert_held(&ctx->uring_lock);

	while (!list_empty(&bl->buf_list)) {
		struct io_buffer *nxt;

		nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
		list_move(&nxt->list, &ctx->io_buffers_cache);
		if (++i == nbufs)
			return i;
		cond_resched();
	}

	return i;
}

void io_destroy_buffers(struct io_ring_ctx *ctx)
{
	struct io_buffer_list *bl;
	unsigned long index;
	int i;

	for (i = 0; i < BGID_ARRAY; i++) {
		if (!ctx->io_bl)
			break;
		__io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
	}

	xa_for_each(&ctx->io_bl_xa, index, bl) {
		xa_erase(&ctx->io_bl_xa, bl->bgid);
		__io_remove_buffers(ctx, bl, -1U);
		kfree(bl);
	}

	while (!list_empty(&ctx->io_buffers_pages)) {
		struct page *page;

		page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
		list_del_init(&page->lru);
		__free_page(page);
	}
}

int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	u64 tmp;

	if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
	    sqe->splice_fd_in)
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
	if (!tmp || tmp > USHRT_MAX)
		return -EINVAL;

	memset(p, 0, sizeof(*p));
	p->nbufs = tmp;
	p->bgid = READ_ONCE(sqe->buf_group);
	return 0;
}

int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);

	ret = -ENOENT;
	bl = io_buffer_get_list(ctx, p->bgid);
	if (bl) {
		ret = -EINVAL;
		/* can't use provide/remove buffers command on mapped buffers */
		if (!bl->is_mapped)
			ret = __io_remove_buffers(ctx, bl, p->nbufs);
	}
	io_ring_submit_unlock(ctx, issue_flags);
	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	unsigned long size, tmp_check;
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	u64 tmp;

	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
	if (!tmp || tmp > USHRT_MAX)
		return -E2BIG;
	p->nbufs = tmp;
	p->addr = READ_ONCE(sqe->addr);
	p->len = READ_ONCE(sqe->len);

	if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
				&size))
		return -EOVERFLOW;
	if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
		return -EOVERFLOW;

	size = (unsigned long)p->len * p->nbufs;
	if (!access_ok(u64_to_user_ptr(p->addr), size))
		return -EFAULT;

	p->bgid = READ_ONCE(sqe->buf_group);
	tmp = READ_ONCE(sqe->off);
	if (tmp > USHRT_MAX)
		return -E2BIG;
	if (tmp + p->nbufs >= USHRT_MAX)
		return -EINVAL;
	p->bid = tmp;
	return 0;
}

static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
{
	struct io_buffer *buf;
	struct page *page;
	int bufs_in_page;

	/*
	 * Completions that don't happen inline (eg not under uring_lock) will
	 * add to ->io_buffers_comp. If we don't have any free buffers, check
	 * the completion list and splice those entries first.
	 */
	if (!list_empty_careful(&ctx->io_buffers_comp)) {
		spin_lock(&ctx->completion_lock);
		if (!list_empty(&ctx->io_buffers_comp)) {
			list_splice_init(&ctx->io_buffers_comp,
						&ctx->io_buffers_cache);
			spin_unlock(&ctx->completion_lock);
			return 0;
		}
		spin_unlock(&ctx->completion_lock);
	}

	/*
	 * No free buffers and no completion entries either. Allocate a new
	 * page worth of buffer entries and add those to our freelist.
	 */
	page = alloc_page(GFP_KERNEL_ACCOUNT);
	if (!page)
		return -ENOMEM;

	list_add(&page->lru, &ctx->io_buffers_pages);

	buf = page_address(page);
	bufs_in_page = PAGE_SIZE / sizeof(*buf);
	while (bufs_in_page) {
		list_add_tail(&buf->list, &ctx->io_buffers_cache);
		buf++;
		bufs_in_page--;
	}

	return 0;
}

static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
			  struct io_buffer_list *bl)
{
	struct io_buffer *buf;
	u64 addr = pbuf->addr;
	int i, bid = pbuf->bid;

	for (i = 0; i < pbuf->nbufs; i++) {
		if (list_empty(&ctx->io_buffers_cache) &&
		    io_refill_buffer_cache(ctx))
			break;
		buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
					list);
		list_move_tail(&buf->list, &bl->buf_list);
		buf->addr = addr;
		buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
		buf->bid = bid;
		buf->bgid = pbuf->bgid;
		addr += pbuf->len;
		bid++;
		cond_resched();
	}

	return i ? 0 : -ENOMEM;
}

int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);

	if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
		ret = io_init_bl_list(ctx);
		if (ret)
			goto err;
	}

	bl = io_buffer_get_list(ctx, p->bgid);
	if (unlikely(!bl)) {
		bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
		if (!bl) {
			ret = -ENOMEM;
			goto err;
		}
		INIT_LIST_HEAD(&bl->buf_list);
		ret = io_buffer_add_list(ctx, bl, p->bgid);
		if (ret) {
			kfree(bl);
			goto err;
		}
	}
	/* can't add buffers via this command for a mapped buffer ring */
	if (bl->is_mapped) {
		ret = -EINVAL;
		goto err;
	}

	ret = io_add_buffers(ctx, p, bl);
err:
	io_ring_submit_unlock(ctx, issue_flags);

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

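/*
 * Editor's sketch (not part of kbuf.c): the userspace half of the legacy
 * (non-ring) flow handled by io_provide_buffers(). One contiguous allocation
 * of nr * len bytes is carved into nr buffers registered under group bgid
 * with IDs starting at bid; the sqe fields map onto struct io_provide_buf
 * above. liburing usage and the simplified error handling are assumptions.
 */
static int example_provide_buffers(struct io_uring *ring, void *base,
				   unsigned int len, unsigned int nr,
				   __u16 bgid, __u16 bid)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	io_uring_prep_provide_buffers(sqe, base, len, nr, bgid, bid);
	io_uring_submit(ring);

	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	ret = cqe->res;		/* negative errno on failure, see above */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}
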
static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
			    struct io_buffer_list *bl)
{
	struct io_uring_buf_ring *br;
	struct page **pages;
	int nr_pages;

	pages = io_pin_pages(reg->ring_addr,
			     flex_array_size(br, bufs, reg->ring_entries),
			     &nr_pages);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	br = page_address(pages[0]);
#ifdef SHM_COLOUR
	/*
	 * On platforms that have specific aliasing requirements, SHM_COLOUR
	 * is set and we must guarantee that the kernel and user side align
	 * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
	 * the application mmap's the provided ring buffer. Fail the request
	 * if we, by chance, don't end up with aligned addresses. The app
	 * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
	 * this transparently.
	 */
	if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) {
		int i;

		for (i = 0; i < nr_pages; i++)
			unpin_user_page(pages[i]);
		return -EINVAL;
	}
#endif
	bl->buf_pages = pages;
	bl->buf_nr_pages = nr_pages;
	bl->buf_ring = br;
	bl->is_mapped = 1;
	bl->is_mmap = 0;
	return 0;
}

static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
			      struct io_buffer_list *bl)
{
	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
	size_t ring_size;
	void *ptr;

	ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
	ptr = (void *) __get_free_pages(gfp, get_order(ring_size));
	if (!ptr)
		return -ENOMEM;

	bl->buf_ring = ptr;
	bl->is_mapped = 1;
	bl->is_mmap = 1;
	return 0;
}

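/*
 * Editor's sketch (not part of kbuf.c): the IOU_PBUF_RING_MMAP flow served by
 * io_alloc_pbuf_ring(). The application registers with ring_addr == 0 and
 * then mmap()s the kernel-allocated ring through the io_uring fd, using the
 * IORING_OFF_PBUF_RING/IORING_OFF_PBUF_SHIFT offset encoding from the uapi
 * header; io_pbuf_get_address() at the end of this file resolves that mmap
 * back to bl->buf_ring. liburing usage and error handling are simplified
 * assumptions.
 */
static struct io_uring_buf_ring *example_mmap_pbuf_ring(struct io_uring *ring,
							__u16 bgid, __u32 entries)
{
	struct io_uring_buf_reg reg = {
		.ring_entries	= entries,
		.bgid		= bgid,
		.flags		= IOU_PBUF_RING_MMAP,
	};
	size_t size = entries * sizeof(struct io_uring_buf);
	off_t off = IORING_OFF_PBUF_RING |
		    ((off_t)bgid << IORING_OFF_PBUF_SHIFT);
	void *ptr;

	if (io_uring_register_buf_ring(ring, &reg, 0))
		return NULL;

	ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
		   MAP_SHARED | MAP_POPULATE, ring->ring_fd, off);
	return ptr == MAP_FAILED ? NULL : ptr;
}
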
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_reg reg;
	struct io_buffer_list *bl, *free_bl = NULL;
	int ret;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;

	if (reg.resv[0] || reg.resv[1] || reg.resv[2])
		return -EINVAL;
	if (reg.flags & ~IOU_PBUF_RING_MMAP)
		return -EINVAL;
	if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
		if (!reg.ring_addr)
			return -EFAULT;
		if (reg.ring_addr & ~PAGE_MASK)
			return -EINVAL;
	} else {
		if (reg.ring_addr)
			return -EINVAL;
	}

	if (!is_power_of_2(reg.ring_entries))
		return -EINVAL;

	/* cannot disambiguate full vs empty due to head/tail size */
	if (reg.ring_entries >= 65536)
		return -EINVAL;

	if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
		int ret = io_init_bl_list(ctx);
		if (ret)
			return ret;
	}

	bl = io_buffer_get_list(ctx, reg.bgid);
	if (bl) {
		/* if mapped buffer ring OR classic exists, don't allow */
		if (bl->is_mapped || !list_empty(&bl->buf_list))
			return -EEXIST;
	} else {
		free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
		if (!bl)
			return -ENOMEM;
	}

	if (!(reg.flags & IOU_PBUF_RING_MMAP))
		ret = io_pin_pbuf_ring(&reg, bl);
	else
		ret = io_alloc_pbuf_ring(&reg, bl);

	if (!ret) {
		bl->nr_entries = reg.ring_entries;
		bl->mask = reg.ring_entries - 1;

		io_buffer_add_list(ctx, bl, reg.bgid);
		return 0;
	}

	kfree(free_bl);
	return ret;
}

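/*
 * Editor's sketch (not part of kbuf.c): registering a user-allocated ring,
 * the path that goes through io_pin_pbuf_ring() above. The ring memory must
 * be page-aligned (reg.ring_addr is checked against ~PAGE_MASK) and
 * ring_entries must be a power of two below 65536. liburing helper names are
 * an assumption about the caller; the page size is hard-coded for brevity.
 */
static struct io_uring_buf_ring *example_register_user_ring(struct io_uring *ring,
							    __u16 bgid, __u32 entries)
{
	struct io_uring_buf_reg reg = { .bgid = bgid, .ring_entries = entries };
	struct io_uring_buf_ring *br;

	if (posix_memalign((void **)&br, 4096, entries * sizeof(struct io_uring_buf)))
		return NULL;

	io_uring_buf_ring_init(br);	/* clears the shared tail */
	reg.ring_addr = (unsigned long)br;
	if (io_uring_register_buf_ring(ring, &reg, 0)) {
		free(br);
		return NULL;
	}
	return br;
}
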
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_reg reg;
	struct io_buffer_list *bl;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (reg.resv[0] || reg.resv[1] || reg.resv[2])
		return -EINVAL;
	if (reg.flags)
		return -EINVAL;

	bl = io_buffer_get_list(ctx, reg.bgid);
	if (!bl)
		return -ENOENT;
	if (!bl->is_mapped)
		return -EINVAL;

	__io_remove_buffers(ctx, bl, -1U);
	if (bl->bgid >= BGID_ARRAY) {
		xa_erase(&ctx->io_bl_xa, bl->bgid);
		kfree(bl);
	}
	return 0;
}

void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
{
	struct io_buffer_list *bl;

	bl = io_buffer_get_list(ctx, bgid);
	if (!bl || !bl->is_mmap)
		return NULL;

	return bl->buf_ring;
}