io_uring/kbuf: use region api for pbuf rings
author     Pavel Begunkov <asml.silence@gmail.com>
           Fri, 29 Nov 2024 13:34:38 +0000 (13:34 +0000)
committer  Jens Axboe <axboe@kernel.dk>
           Mon, 23 Dec 2024 15:17:16 +0000 (08:17 -0700)
Convert internal parts of the provided buffer ring management to the
region API. It's the last non-region mapped ring we have, so it also
kills a bunch of now-unused memmap.c helpers.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/6c40cf7beaa648558acd4d84bc0fb3279a35d74b.1732886067.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
io_uring/kbuf.c
io_uring/kbuf.h
io_uring/memmap.c
io_uring/memmap.h
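
For context, a minimal userspace sketch of the IOU_PBUF_RING_MMAP path (not part of the patch; the helper name map_pbuf_ring() is made up for illustration). On the kernel side this memory is now backed by a region created with io_create_region_mmap_safe(), and the application reaches it by mmap()ing the per-bgid offset that the new io_pbuf_get_region() lookup decodes in the hunks below.

/*
 * Hypothetical userspace helper, for illustration only: register a
 * kernel-allocated provided buffer ring and mmap() it via the per-bgid
 * offset. Assumes the io_uring uapi headers are available.
 */
#include <linux/io_uring.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static struct io_uring_buf_ring *map_pbuf_ring(int ring_fd, unsigned int bgid,
					       unsigned int entries)
{
	size_t ring_size = entries * sizeof(struct io_uring_buf);
	struct io_uring_buf_reg reg;
	off_t off;
	void *p;

	memset(&reg, 0, sizeof(reg));
	reg.ring_entries = entries;		/* power of 2, below 65536 */
	reg.bgid = bgid;
	reg.flags = IOU_PBUF_RING_MMAP;		/* kernel allocates the memory */

	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PBUF_RING,
		    &reg, 1) < 0)
		return NULL;

	/* the mmap offset encodes the buffer group id, as decoded in memmap.c */
	off = IORING_OFF_PBUF_RING | (off_t)bgid << IORING_OFF_PBUF_SHIFT;
	p = mmap(NULL, ring_size, PROT_READ | PROT_WRITE,
		 MAP_SHARED | MAP_POPULATE, ring_fd, off);
	return p == MAP_FAILED ? NULL : p;
}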

diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 644f61445ec980d8a3bf4d0e876bf58f8d3f04a6..2dfb9f9419a0578acf2ef2480f36951680a73d34 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -351,17 +351,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
 
        if (bl->flags & IOBL_BUF_RING) {
                i = bl->buf_ring->tail - bl->head;
-               if (bl->buf_nr_pages) {
-                       int j;
-
-                       if (!(bl->flags & IOBL_MMAP)) {
-                               for (j = 0; j < bl->buf_nr_pages; j++)
-                                       unpin_user_page(bl->buf_pages[j]);
-                       }
-                       io_pages_unmap(bl->buf_ring, &bl->buf_pages,
-                                       &bl->buf_nr_pages, bl->flags & IOBL_MMAP);
-                       bl->flags &= ~IOBL_MMAP;
-               }
+               io_free_region(ctx, &bl->region);
                /* make sure it's seen as empty */
                INIT_LIST_HEAD(&bl->buf_list);
                bl->flags &= ~IOBL_BUF_RING;
@@ -614,75 +604,14 @@ err:
        return IOU_OK;
 }
 
-static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
-                           struct io_buffer_list *bl)
-{
-       struct io_uring_buf_ring *br = NULL;
-       struct page **pages;
-       int nr_pages, ret;
-
-       pages = io_pin_pages(reg->ring_addr,
-                            flex_array_size(br, bufs, reg->ring_entries),
-                            &nr_pages);
-       if (IS_ERR(pages))
-               return PTR_ERR(pages);
-
-       br = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
-       if (!br) {
-               ret = -ENOMEM;
-               goto error_unpin;
-       }
-
-#ifdef SHM_COLOUR
-       /*
-        * On platforms that have specific aliasing requirements, SHM_COLOUR
-        * is set and we must guarantee that the kernel and user side align
-        * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
-        * the application mmap's the provided ring buffer. Fail the request
-        * if we, by chance, don't end up with aligned addresses. The app
-        * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
-        * this transparently.
-        */
-       if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) {
-               ret = -EINVAL;
-               goto error_unpin;
-       }
-#endif
-       bl->buf_pages = pages;
-       bl->buf_nr_pages = nr_pages;
-       bl->buf_ring = br;
-       bl->flags |= IOBL_BUF_RING;
-       bl->flags &= ~IOBL_MMAP;
-       return 0;
-error_unpin:
-       unpin_user_pages(pages, nr_pages);
-       kvfree(pages);
-       vunmap(br);
-       return ret;
-}
-
-static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
-                             struct io_uring_buf_reg *reg,
-                             struct io_buffer_list *bl)
-{
-       size_t ring_size;
-
-       ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
-
-       bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size);
-       if (IS_ERR(bl->buf_ring)) {
-               bl->buf_ring = NULL;
-               return -ENOMEM;
-       }
-
-       bl->flags |= (IOBL_BUF_RING | IOBL_MMAP);
-       return 0;
-}
-
 int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 {
        struct io_uring_buf_reg reg;
        struct io_buffer_list *bl, *free_bl = NULL;
+       struct io_uring_region_desc rd;
+       struct io_uring_buf_ring *br;
+       unsigned long mmap_offset;
+       unsigned long ring_size;
        int ret;
 
        lockdep_assert_held(&ctx->uring_lock);
@@ -694,19 +623,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
                return -EINVAL;
        if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC))
                return -EINVAL;
-       if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
-               if (!reg.ring_addr)
-                       return -EFAULT;
-               if (reg.ring_addr & ~PAGE_MASK)
-                       return -EINVAL;
-       } else {
-               if (reg.ring_addr)
-                       return -EINVAL;
-       }
-
        if (!is_power_of_2(reg.ring_entries))
                return -EINVAL;
-
        /* cannot disambiguate full vs empty due to head/tail size */
        if (reg.ring_entries >= 65536)
                return -EINVAL;
@@ -722,21 +640,47 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
                        return -ENOMEM;
        }
 
-       if (!(reg.flags & IOU_PBUF_RING_MMAP))
-               ret = io_pin_pbuf_ring(&reg, bl);
-       else
-               ret = io_alloc_pbuf_ring(ctx, &reg, bl);
+       mmap_offset = reg.bgid << IORING_OFF_PBUF_SHIFT;
+       ring_size = flex_array_size(br, bufs, reg.ring_entries);
 
-       if (!ret) {
-               bl->nr_entries = reg.ring_entries;
-               bl->mask = reg.ring_entries - 1;
-               if (reg.flags & IOU_PBUF_RING_INC)
-                       bl->flags |= IOBL_INC;
+       memset(&rd, 0, sizeof(rd));
+       rd.size = PAGE_ALIGN(ring_size);
+       if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
+               rd.user_addr = reg.ring_addr;
+               rd.flags |= IORING_MEM_REGION_TYPE_USER;
+       }
+       ret = io_create_region_mmap_safe(ctx, &bl->region, &rd, mmap_offset);
+       if (ret)
+               goto fail;
+       br = io_region_get_ptr(&bl->region);
 
-               io_buffer_add_list(ctx, bl, reg.bgid);
-               return 0;
+#ifdef SHM_COLOUR
+       /*
+        * On platforms that have specific aliasing requirements, SHM_COLOUR
+        * is set and we must guarantee that the kernel and user side align
+        * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
+        * the application mmap's the provided ring buffer. Fail the request
+        * if we, by chance, don't end up with aligned addresses. The app
+        * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
+        * this transparently.
+        */
+       if (!(reg.flags & IOU_PBUF_RING_MMAP) &&
+           ((reg.ring_addr | (unsigned long)br) & (SHM_COLOUR - 1))) {
+               ret = -EINVAL;
+               goto fail;
        }
+#endif
 
+       bl->nr_entries = reg.ring_entries;
+       bl->mask = reg.ring_entries - 1;
+       bl->flags |= IOBL_BUF_RING;
+       bl->buf_ring = br;
+       if (reg.flags & IOU_PBUF_RING_INC)
+               bl->flags |= IOBL_INC;
+       io_buffer_add_list(ctx, bl, reg.bgid);
+       return 0;
+fail:
+       io_free_region(ctx, &bl->region);
        kfree(free_bl);
        return ret;
 }
@@ -794,32 +738,18 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
        return 0;
 }
 
-struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
-                                     unsigned long bgid)
-{
-       struct io_buffer_list *bl;
-
-       bl = xa_load(&ctx->io_bl_xa, bgid);
-       /* must be a mmap'able buffer ring and have pages */
-       if (bl && bl->flags & IOBL_MMAP)
-               return bl;
-
-       return ERR_PTR(-EINVAL);
-}
-
-int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma)
+struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx,
+                                           unsigned int bgid)
 {
-       struct io_ring_ctx *ctx = file->private_data;
-       loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT;
        struct io_buffer_list *bl;
-       int bgid;
 
        lockdep_assert_held(&ctx->mmap_lock);
 
-       bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
-       bl = io_pbuf_get_bl(ctx, bgid);
-       if (IS_ERR(bl))
-               return PTR_ERR(bl);
+       bl = xa_load(&ctx->io_bl_xa, bgid);
+       if (!bl || !(bl->flags & IOBL_BUF_RING))
+               return NULL;
+       if (WARN_ON_ONCE(!io_region_is_set(&bl->region)))
+               return NULL;
 
-       return io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages);
+       return &bl->region;
 }
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index dff7444026a639e380385ed0dfef879d8a041a36..bd80c44c5af1e9ca955e766c26c6f39239163452 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -3,15 +3,13 @@
 #define IOU_KBUF_H
 
 #include <uapi/linux/io_uring.h>
+#include <linux/io_uring_types.h>
 
 enum {
        /* ring mapped provided buffers */
        IOBL_BUF_RING   = 1,
-       /* ring mapped provided buffers, but mmap'ed by application */
-       IOBL_MMAP       = 2,
        /* buffers are consumed incrementally rather than always fully */
-       IOBL_INC        = 4,
-
+       IOBL_INC        = 2,
 };
 
 struct io_buffer_list {
@@ -21,10 +19,7 @@ struct io_buffer_list {
         */
        union {
                struct list_head buf_list;
-               struct {
-                       struct page **buf_pages;
-                       struct io_uring_buf_ring *buf_ring;
-               };
+               struct io_uring_buf_ring *buf_ring;
        };
        __u16 bgid;
 
@@ -35,6 +30,8 @@ struct io_buffer_list {
        __u16 mask;
 
        __u16 flags;
+
+       struct io_mapped_region region;
 };
 
 struct io_buffer {
@@ -81,9 +78,8 @@ void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags);
 
 bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
 
-struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
-                                     unsigned long bgid);
-int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma);
+struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx,
+                                           unsigned int bgid);
 
 static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
 {
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index 73b73f4ea1bd4b6af1e07b8af23418cc3f1e2a35..6d8a98bd9caccd4e02bd93aff349976c839a480f 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -36,90 +36,6 @@ static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
        return page_address(page);
 }
 
-static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size,
-                                gfp_t gfp)
-{
-       void *ret;
-       int i;
-
-       for (i = 0; i < nr_pages; i++) {
-               pages[i] = alloc_page(gfp);
-               if (!pages[i])
-                       goto err;
-       }
-
-       ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
-       if (ret)
-               return ret;
-err:
-       while (i--)
-               put_page(pages[i]);
-       return ERR_PTR(-ENOMEM);
-}
-
-void *io_pages_map(struct page ***out_pages, unsigned short *npages,
-                  size_t size)
-{
-       gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
-       struct page **pages;
-       int nr_pages;
-       void *ret;
-
-       nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
-       pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp);
-       if (!pages)
-               return ERR_PTR(-ENOMEM);
-
-       ret = io_mem_alloc_compound(pages, nr_pages, size, gfp);
-       if (!IS_ERR(ret))
-               goto done;
-       if (nr_pages == 1)
-               goto fail;
-
-       ret = io_mem_alloc_single(pages, nr_pages, size, gfp);
-       if (!IS_ERR(ret)) {
-done:
-               *out_pages = pages;
-               *npages = nr_pages;
-               return ret;
-       }
-fail:
-       kvfree(pages);
-       *out_pages = NULL;
-       *npages = 0;
-       return ret;
-}
-
-void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
-                   bool put_pages)
-{
-       bool do_vunmap = false;
-
-       if (!ptr)
-               return;
-
-       if (put_pages && *npages) {
-               struct page **to_free = *pages;
-               int i;
-
-               /*
-                * Only did vmap for the non-compound multiple page case.
-                * For the compound page, we just need to put the head.
-                */
-               if (PageCompound(to_free[0]))
-                       *npages = 1;
-               else if (*npages > 1)
-                       do_vunmap = true;
-               for (i = 0; i < *npages; i++)
-                       put_page(to_free[i]);
-       }
-       if (do_vunmap)
-               vunmap(ptr);
-       kvfree(*pages);
-       *pages = NULL;
-       *npages = 0;
-}
-
 struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
 {
        unsigned long start, end, nr_pages;
@@ -374,16 +290,14 @@ static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
                        return ERR_PTR(-EFAULT);
                return ctx->sq_sqes;
        case IORING_OFF_PBUF_RING: {
-               struct io_buffer_list *bl;
+               struct io_mapped_region *region;
                unsigned int bgid;
-               void *ptr;
 
                bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
-               bl = io_pbuf_get_bl(ctx, bgid);
-               if (IS_ERR(bl))
-                       return bl;
-               ptr = bl->buf_ring;
-               return ptr;
+               region = io_pbuf_get_region(ctx, bgid);
+               if (!region)
+                       return ERR_PTR(-EINVAL);
+               return io_region_validate_mmap(ctx, region);
                }
        case IORING_MAP_OFF_PARAM_REGION:
                return io_region_validate_mmap(ctx, &ctx->param_region);
@@ -392,15 +306,6 @@ static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
        return ERR_PTR(-EINVAL);
 }
 
-int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
-                       struct page **pages, int npages)
-{
-       unsigned long nr_pages = npages;
-
-       vm_flags_set(vma, VM_DONTEXPAND);
-       return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
-}
-
 #ifdef CONFIG_MMU
 
 static int io_region_mmap(struct io_ring_ctx *ctx,
@@ -435,8 +340,17 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
                return io_region_mmap(ctx, &ctx->ring_region, vma, page_limit);
        case IORING_OFF_SQES:
                return io_region_mmap(ctx, &ctx->sq_region, vma, UINT_MAX);
-       case IORING_OFF_PBUF_RING:
-               return io_pbuf_mmap(file, vma);
+       case IORING_OFF_PBUF_RING: {
+               struct io_mapped_region *region;
+               unsigned int bgid;
+
+               bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
+               region = io_pbuf_get_region(ctx, bgid);
+               if (!region)
+                       return -EINVAL;
+
+               return io_region_mmap(ctx, region, vma, UINT_MAX);
+       }
        case IORING_MAP_OFF_PARAM_REGION:
                return io_region_mmap(ctx, &ctx->param_region, vma, UINT_MAX);
        }
diff --git a/io_uring/memmap.h b/io_uring/memmap.h
index 7395996eb35376a3b9c554e1272d334922f1c46d..c898dcba2b4ecceaec95b52c84481be6589e3016 100644
--- a/io_uring/memmap.h
+++ b/io_uring/memmap.h
@@ -4,13 +4,6 @@
 #define IORING_MAP_OFF_PARAM_REGION            0x20000000ULL
 
 struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
-int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
-                       struct page **pages, int npages);
-
-void *io_pages_map(struct page ***out_pages, unsigned short *npages,
-                  size_t size);
-void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
-                   bool put_pages);
 
 #ifndef CONFIG_MMU
 unsigned int io_uring_nommu_mmap_capabilities(struct file *file);
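
Complementary sketch of the application-provided memory path (again not part of the patch; register_user_pbuf_ring() is a hypothetical name). Here reg.ring_addr points at page-aligned user memory, which the converted io_register_pbuf_ring() now registers as an IORING_MEM_REGION_TYPE_USER region instead of pinning it with the removed io_pin_pbuf_ring():

/*
 * Hypothetical userspace helper, for illustration only: hand the kernel
 * page-aligned application memory for the buffer ring. On SHM_COLOUR
 * architectures this path may fail with EINVAL if kernel and user
 * mappings cannot be aligned; IOU_PBUF_RING_MMAP avoids that.
 */
#include <linux/io_uring.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static struct io_uring_buf_ring *register_user_pbuf_ring(int ring_fd,
							  unsigned int bgid,
							  unsigned int entries)
{
	size_t ring_size = entries * sizeof(struct io_uring_buf);
	struct io_uring_buf_reg reg;
	void *mem;

	/* anonymous mmap() gives naturally page-aligned memory */
	mem = mmap(NULL, ring_size, PROT_READ | PROT_WRITE,
		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (mem == MAP_FAILED)
		return NULL;

	memset(&reg, 0, sizeof(reg));
	reg.ring_addr = (unsigned long)mem;
	reg.ring_entries = entries;
	reg.bgid = bgid;

	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PBUF_RING,
		    &reg, 1) < 0) {
		munmap(mem, ring_size);
		return NULL;
	}
	return mem;
}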