io_uring/kbuf.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 #include <linux/kernel.h>
   3 #include <linux/errno.h>
   4 #include <linux/fs.h>
   5 #include <linux/file.h>
   6 #include <linux/mm.h>
   7 #include <linux/slab.h>
   8 #include <linux/namei.h>
   9 #include <linux/poll.h>
  10 #include <linux/io_uring.h>
  11
  12 #include <uapi/linux/io_uring.h>
  13
  14 #include "io_uring.h"
  15 #include "opdef.h"
  16 #include "kbuf.h"
  17
  18 #define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))   /* ring entries per page */
  19
  20 #define BGID_ARRAY      64      /* group IDs below this index ctx->io_bl, the rest live in ctx->io_bl_xa */
  21
  22 /* BIDs are addressed by a 16-bit field in a CQE */
  23 #define MAX_BIDS_PER_BGID (1 << 16)
  24
  25 struct kmem_cache *io_buf_cachep;       /* slab cache that struct io_buffer entries come from and return to */
  26
  27 struct io_provide_buf {        /* per-request arguments of the provide/remove buffers commands */
  28         struct file                     *file;
  29         __u64                           addr;   /* user address of the first buffer (sqe->addr) */
  30         __u32                           len;    /* length of each buffer (sqe->len) */
  31         __u32                           bgid;   /* buffer group ID (sqe->buf_group) */
  32         __u32                           nbufs;  /* number of buffers (sqe->fd) */
  33         __u16                           bid;    /* ID given to the first buffer (sqe->off) */
  34 };
  35
  36 struct io_buf_free {   /* one kernel-allocated ring region, kept until io_kbuf_mmap_list_free() */
  37         struct hlist_node               list;   /* entry in ctx->io_buf_list */
  38         void                            *mem;   /* region returned by io_mem_alloc() */
  39         size_t                          size;   /* size of @mem in bytes */
  40         int                             inuse;  /* 1 while a buffer list uses @mem, 0 once it may be reused */
  41 };
  42
  43 static struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
  44                                                    struct io_buffer_list *bl,
  45                                                    unsigned int bgid)
  46 {
  47         if (bl && bgid < BGID_ARRAY)
  48                 return &bl[bgid];
  49
  50         return xa_load(&ctx->io_bl_xa, bgid);
  51 }
  52
  53 static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
  54                                                         unsigned int bgid)
  55 {
  56         lockdep_assert_held(&ctx->uring_lock);
  57
  58         return __io_buffer_get_list(ctx, ctx->io_bl, bgid);
  59 }
  60
  61 static int io_buffer_add_list(struct io_ring_ctx *ctx,
  62                               struct io_buffer_list *bl, unsigned int bgid)
  63 {
  64         /*
  65          * Store buffer group ID and finally mark the list as visible.
  66          * The normal lookup doesn't care about the visibility as we're
  67          * always under the ->uring_lock, but the RCU lookup from mmap does.
  68          */
  69         bl->bgid = bgid;
  70         smp_store_release(&bl->is_ready, 1);
  71
  72         if (bgid < BGID_ARRAY)
  73                 return 0;
  74
  75         return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
  76 }
  77
  78 bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
  79 {
  80         struct io_ring_ctx *ctx = req->ctx;
  81         struct io_buffer_list *bl;
  82         struct io_buffer *buf;
  83
  84         io_ring_submit_lock(ctx, issue_flags);
  85
  86         buf = req->kbuf;
  87         bl = io_buffer_get_list(ctx, buf->bgid);
  88         list_add(&buf->list, &bl->buf_list);
  89         req->flags &= ~REQ_F_BUFFER_SELECTED;
  90         req->buf_index = buf->bgid;
  91
  92         io_ring_submit_unlock(ctx, issue_flags);
  93         return true;
  94 }
  95
  96 void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
  97 {
  98         /*
  99          * We can add this buffer back to two lists:
 100          *
 101          * 1) The io_buffers_cache list. This one is protected by the
 102          *    ctx->uring_lock. If we already hold this lock, add back to this
 103          *    list as we can grab it from issue as well.
 104          * 2) The io_buffers_comp list. This one is protected by the
 105          *    ctx->completion_lock.
 106          *
 107          * We migrate buffers from the comp_list to the issue cache list
 108          * when we need one.
 109          */
 110         if (issue_flags & IO_URING_F_UNLOCKED) {
 111                 struct io_ring_ctx *ctx = req->ctx;
 112
 113                 spin_lock(&ctx->completion_lock);
 114                 __io_put_kbuf_list(req, &ctx->io_buffers_comp);
 115                 spin_unlock(&ctx->completion_lock);
 116         } else {
 117                 lockdep_assert_held(&req->ctx->uring_lock);
 118
 119                 __io_put_kbuf_list(req, &req->ctx->io_buffers_cache);
 120         }
 121 }
 122
 123 static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
 124                                               struct io_buffer_list *bl)
 125 {
 126         if (!list_empty(&bl->buf_list)) {
 127                 struct io_buffer *kbuf;
 128
 129                 kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
 130                 list_del(&kbuf->list);
 131                 if (*len == 0 || *len > kbuf->len)
 132                         *len = kbuf->len;
 133                 if (list_empty(&bl->buf_list))
 134                         req->flags |= REQ_F_BL_EMPTY;
 135                 req->flags |= REQ_F_BUFFER_SELECTED;
 136                 req->kbuf = kbuf;
 137                 req->buf_index = kbuf->bid;
 138                 return u64_to_user_ptr(kbuf->addr);
 139         }
 140         return NULL;
 141 }
 142
 143 static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
 144                                           struct io_buffer_list *bl,
 145                                           unsigned int issue_flags)
 146 {
 147         struct io_uring_buf_ring *br = bl->buf_ring;
 148         __u16 tail, head = bl->head;
 149         struct io_uring_buf *buf;
 150
 151         tail = smp_load_acquire(&br->tail);
 152         if (unlikely(tail == head))
 153                 return NULL;
 154
 155         if (head + 1 == tail)
 156                 req->flags |= REQ_F_BL_EMPTY;
 157
 158         head &= bl->mask;
 159         /* mmaped buffers are always contig */
 160         if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) {
 161                 buf = &br->bufs[head];
 162         } else {
 163                 int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
 164                 int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;
 165                 buf = page_address(bl->buf_pages[index]);
 166                 buf += off;
 167         }
 168         if (*len == 0 || *len > buf->len)
 169                 *len = buf->len;
 170         req->flags |= REQ_F_BUFFER_RING;
 171         req->buf_list = bl;
 172         req->buf_index = buf->bid;
 173
 174         if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) {
 175                 /*
 176                  * If we came in unlocked, we have no choice but to consume the
 177                  * buffer here, otherwise nothing ensures that the buffer won't
 178                  * get used by others. This does mean it'll be pinned until the
 179                  * IO completes, coming in unlocked means we're being called from
 180                  * io-wq context and there may be further retries in async hybrid
 181                  * mode. For the locked case, the caller must call commit when
 182                  * the transfer completes (or if we get -EAGAIN and must poll of
 183                  * retry).
 184                  */
 185                 req->buf_list = NULL;
 186                 bl->head++;
 187         }
 188         return u64_to_user_ptr(buf->addr);
 189 }
 190
 191 void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
 192                               unsigned int issue_flags)
 193 {
 194         struct io_ring_ctx *ctx = req->ctx;
 195         struct io_buffer_list *bl;
 196         void __user *ret = NULL;
 197
 198         io_ring_submit_lock(req->ctx, issue_flags);
 199
 200         bl = io_buffer_get_list(ctx, req->buf_index);
 201         if (likely(bl)) {
 202                 if (bl->is_buf_ring)
 203                         ret = io_ring_buffer_select(req, len, bl, issue_flags);
 204                 else
 205                         ret = io_provided_buffer_select(req, len, bl);
 206         }
 207         io_ring_submit_unlock(req->ctx, issue_flags);
 208         return ret;
 209 }
 210
 211 static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
 212 {
 213         struct io_buffer_list *bl;
 214         int i;
 215
 216         bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), GFP_KERNEL);
 217         if (!bl)
 218                 return -ENOMEM;
 219
 220         for (i = 0; i < BGID_ARRAY; i++) {
 221                 INIT_LIST_HEAD(&bl[i].buf_list);
 222                 bl[i].bgid = i;
 223         }
 224
 225         smp_store_release(&ctx->io_bl, bl);
 226         return 0;
 227 }
 228
 229 /*
 230  * Mark the given mapped range as free for reuse
 231  */
 232 static void io_kbuf_mark_free(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
 233 {
 234         struct io_buf_free *ibf;
 235
 236         hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
 237                 if (bl->buf_ring == ibf->mem) {
 238                         ibf->inuse = 0;
 239                         return;
 240                 }
 241         }
 242
 243         /* can't happen... */
 244         WARN_ON_ONCE(1);
 245 }
 246
 247 static int __io_remove_buffers(struct io_ring_ctx *ctx,
 248                                struct io_buffer_list *bl, unsigned nbufs)
 249 {
 250         unsigned i = 0;
 251
 252         /* shouldn't happen */
 253         if (!nbufs)
 254                 return 0;
 255
 256         if (bl->is_buf_ring) {
 257                 i = bl->buf_ring->tail - bl->head;
 258                 if (bl->is_mmap) {
 259                         /*
 260                          * io_kbuf_list_free() will free the page(s) at
 261                          * ->release() time.
 262                          */
 263                         io_kbuf_mark_free(ctx, bl);
 264                         bl->buf_ring = NULL;
 265                         bl->is_mmap = 0;
 266                 } else if (bl->buf_nr_pages) {
 267                         int j;
 268
 269                         for (j = 0; j < bl->buf_nr_pages; j++)
 270                                 unpin_user_page(bl->buf_pages[j]);
 271                         kvfree(bl->buf_pages);
 272                         bl->buf_pages = NULL;
 273                         bl->buf_nr_pages = 0;
 274                 }
 275                 /* make sure it's seen as empty */
 276                 INIT_LIST_HEAD(&bl->buf_list);
 277                 bl->is_buf_ring = 0;
 278                 return i;
 279         }
 280
 281         /* protects io_buffers_cache */
 282         lockdep_assert_held(&ctx->uring_lock);
 283
 284         while (!list_empty(&bl->buf_list)) {
 285                 struct io_buffer *nxt;
 286
 287                 nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
 288                 list_move(&nxt->list, &ctx->io_buffers_cache);
 289                 if (++i == nbufs)
 290                         return i;
 291                 cond_resched();
 292         }
 293
 294         return i;
 295 }
 296
 297 void io_destroy_buffers(struct io_ring_ctx *ctx)
 298 {
 299         struct io_buffer_list *bl;
 300         struct list_head *item, *tmp;
 301         struct io_buffer *buf;
 302         unsigned long index;
 303         int i;
 304
 305         for (i = 0; i < BGID_ARRAY; i++) {
 306                 if (!ctx->io_bl)
 307                         break;
 308                 __io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
 309         }
 310
 311         xa_for_each(&ctx->io_bl_xa, index, bl) {
 312                 xa_erase(&ctx->io_bl_xa, bl->bgid);
 313                 __io_remove_buffers(ctx, bl, -1U);
 314                 kfree_rcu(bl, rcu);
 315         }
 316
 317         /*
 318          * Move deferred locked entries to cache before pruning
 319          */
 320         spin_lock(&ctx->completion_lock);
 321         if (!list_empty(&ctx->io_buffers_comp))
 322                 list_splice_init(&ctx->io_buffers_comp, &ctx->io_buffers_cache);
 323         spin_unlock(&ctx->completion_lock);
 324
 325         list_for_each_safe(item, tmp, &ctx->io_buffers_cache) {
 326                 buf = list_entry(item, struct io_buffer, list);
 327                 kmem_cache_free(io_buf_cachep, buf);
 328         }
 329 }
 330
 331 int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 332 {
 333         struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
 334         u64 tmp;
 335
 336         if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
 337             sqe->splice_fd_in)
 338                 return -EINVAL;
 339
 340         tmp = READ_ONCE(sqe->fd);
 341         if (!tmp || tmp > MAX_BIDS_PER_BGID)
 342                 return -EINVAL;
 343
 344         memset(p, 0, sizeof(*p));
 345         p->nbufs = tmp;
 346         p->bgid = READ_ONCE(sqe->buf_group);
 347         return 0;
 348 }
 349
 350 int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
 351 {
 352         struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
 353         struct io_ring_ctx *ctx = req->ctx;
 354         struct io_buffer_list *bl;
 355         int ret = 0;
 356
 357         io_ring_submit_lock(ctx, issue_flags);
 358
 359         ret = -ENOENT;
 360         bl = io_buffer_get_list(ctx, p->bgid);
 361         if (bl) {
 362                 ret = -EINVAL;
 363                 /* can't use provide/remove buffers command on mapped buffers */
 364                 if (!bl->is_buf_ring)
 365                         ret = __io_remove_buffers(ctx, bl, p->nbufs);
 366         }
 367         io_ring_submit_unlock(ctx, issue_flags);
 368         if (ret < 0)
 369                 req_set_fail(req);
 370         io_req_set_res(req, ret, 0);
 371         return IOU_OK;
 372 }
 373
 374 int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 375 {
 376         unsigned long size, tmp_check;
 377         struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
 378         u64 tmp;
 379
 380         if (sqe->rw_flags || sqe->splice_fd_in)
 381                 return -EINVAL;
 382
 383         tmp = READ_ONCE(sqe->fd);
 384         if (!tmp || tmp > MAX_BIDS_PER_BGID)
 385                 return -E2BIG;
 386         p->nbufs = tmp;
 387         p->addr = READ_ONCE(sqe->addr);
 388         p->len = READ_ONCE(sqe->len);
 389
 390         if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
 391                                 &size))
 392                 return -EOVERFLOW;
 393         if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
 394                 return -EOVERFLOW;
 395
 396         size = (unsigned long)p->len * p->nbufs;
 397         if (!access_ok(u64_to_user_ptr(p->addr), size))
 398                 return -EFAULT;
 399
 400         p->bgid = READ_ONCE(sqe->buf_group);
 401         tmp = READ_ONCE(sqe->off);
 402         if (tmp > USHRT_MAX)
 403                 return -E2BIG;
 404         if (tmp + p->nbufs > MAX_BIDS_PER_BGID)
 405                 return -EINVAL;
 406         p->bid = tmp;
 407         return 0;
 408 }
 409
 410 #define IO_BUFFER_ALLOC_BATCH 64
 411
 412 static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
 413 {
 414         struct io_buffer *bufs[IO_BUFFER_ALLOC_BATCH];
 415         int allocated;
 416
 417         /*
 418          * Completions that don't happen inline (eg not under uring_lock) will
 419          * add to ->io_buffers_comp. If we don't have any free buffers, check
 420          * the completion list and splice those entries first.
 421          */
 422         if (!list_empty_careful(&ctx->io_buffers_comp)) {
 423                 spin_lock(&ctx->completion_lock);
 424                 if (!list_empty(&ctx->io_buffers_comp)) {
 425                         list_splice_init(&ctx->io_buffers_comp,
 426                                                 &ctx->io_buffers_cache);
 427                         spin_unlock(&ctx->completion_lock);
 428                         return 0;
 429                 }
 430                 spin_unlock(&ctx->completion_lock);
 431         }
 432
 433         /*
 434          * No free buffers and no completion entries either. Allocate a new
 435          * batch of buffer entries and add those to our freelist.
 436          */
 437
 438         allocated = kmem_cache_alloc_bulk(io_buf_cachep, GFP_KERNEL_ACCOUNT,
 439                                           ARRAY_SIZE(bufs), (void **) bufs);
 440         if (unlikely(!allocated)) {
 441                 /*
 442                  * Bulk alloc is all-or-nothing. If we fail to get a batch,
 443                  * retry single alloc to be on the safe side.
 444                  */
 445                 bufs[0] = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL);
 446                 if (!bufs[0])
 447                         return -ENOMEM;
 448                 allocated = 1;
 449         }
 450
 451         while (allocated)
 452                 list_add_tail(&bufs[--allocated]->list, &ctx->io_buffers_cache);
 453
 454         return 0;
 455 }
 456
 457 static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
 458                           struct io_buffer_list *bl)
 459 {
 460         struct io_buffer *buf;
 461         u64 addr = pbuf->addr;
 462         int i, bid = pbuf->bid;
 463
 464         for (i = 0; i < pbuf->nbufs; i++) {
 465                 if (list_empty(&ctx->io_buffers_cache) &&
 466                     io_refill_buffer_cache(ctx))
 467                         break;
 468                 buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
 469                                         list);
 470                 list_move_tail(&buf->list, &bl->buf_list);
 471                 buf->addr = addr;
 472                 buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
 473                 buf->bid = bid;
 474                 buf->bgid = pbuf->bgid;
 475                 addr += pbuf->len;
 476                 bid++;
 477                 cond_resched();
 478         }
 479
 480         return i ? 0 : -ENOMEM;
 481 }
 482
 483 int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
 484 {
 485         struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
 486         struct io_ring_ctx *ctx = req->ctx;
 487         struct io_buffer_list *bl;
 488         int ret = 0;
 489
 490         io_ring_submit_lock(ctx, issue_flags);
 491
 492         if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
 493                 ret = io_init_bl_list(ctx);
 494                 if (ret)
 495                         goto err;
 496         }
 497
 498         bl = io_buffer_get_list(ctx, p->bgid);
 499         if (unlikely(!bl)) {
 500                 bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
 501                 if (!bl) {
 502                         ret = -ENOMEM;
 503                         goto err;
 504                 }
 505                 INIT_LIST_HEAD(&bl->buf_list);
 506                 ret = io_buffer_add_list(ctx, bl, p->bgid);
 507                 if (ret) {
 508                         /*
 509                          * Doesn't need rcu free as it was never visible, but
 510                          * let's keep it consistent throughout. Also can't
 511                          * be a lower indexed array group, as adding one
 512                          * where lookup failed cannot happen.
 513                          */
 514                         if (p->bgid >= BGID_ARRAY)
 515                                 kfree_rcu(bl, rcu);
 516                         else
 517                                 WARN_ON_ONCE(1);
 518                         goto err;
 519                 }
 520         }
 521         /* can't add buffers via this command for a mapped buffer ring */
 522         if (bl->is_buf_ring) {
 523                 ret = -EINVAL;
 524                 goto err;
 525         }
 526
 527         ret = io_add_buffers(ctx, p, bl);
 528 err:
 529         io_ring_submit_unlock(ctx, issue_flags);
 530
 531         if (ret < 0)
 532                 req_set_fail(req);
 533         io_req_set_res(req, ret, 0);
 534         return IOU_OK;
 535 }
 536
 537 static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
 538                             struct io_buffer_list *bl)
 539 {
 540         struct io_uring_buf_ring *br;
 541         struct page **pages;
 542         int i, nr_pages;
 543
 544         pages = io_pin_pages(reg->ring_addr,
 545                              flex_array_size(br, bufs, reg->ring_entries),
 546                              &nr_pages);
 547         if (IS_ERR(pages))
 548                 return PTR_ERR(pages);
 549
 550         /*
 551          * Apparently some 32-bit boxes (ARM) will return highmem pages,
 552          * which then need to be mapped. We could support that, but it'd
 553          * complicate the code and slowdown the common cases quite a bit.
 554          * So just error out, returning -EINVAL just like we did on kernels
 555          * that didn't support mapped buffer rings.
 556          */
 557         for (i = 0; i < nr_pages; i++)
 558                 if (PageHighMem(pages[i]))
 559                         goto error_unpin;
 560
 561         br = page_address(pages[0]);
 562 #ifdef SHM_COLOUR
 563         /*
 564          * On platforms that have specific aliasing requirements, SHM_COLOUR
 565          * is set and we must guarantee that the kernel and user side align
 566          * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
 567          * the application mmap's the provided ring buffer. Fail the request
 568          * if we, by chance, don't end up with aligned addresses. The app
 569          * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
 570          * this transparently.
 571          */
 572         if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1))
 573                 goto error_unpin;
 574 #endif
 575         bl->buf_pages = pages;
 576         bl->buf_nr_pages = nr_pages;
 577         bl->buf_ring = br;
 578         bl->is_buf_ring = 1;
 579         bl->is_mmap = 0;
 580         return 0;
 581 error_unpin:
 582         for (i = 0; i < nr_pages; i++)
 583                 unpin_user_page(pages[i]);
 584         kvfree(pages);
 585         return -EINVAL;
 586 }
 587
 588 /*
 589  * See if we have a suitable region that we can reuse, rather than allocate
 590  * both a new io_buf_free and mem region again. We leave it on the list as
 591  * even a reused entry will need freeing at ring release.
 592  */
 593 static struct io_buf_free *io_lookup_buf_free_entry(struct io_ring_ctx *ctx,
 594                                                     size_t ring_size)
 595 {
 596         struct io_buf_free *ibf, *best = NULL;
 597         size_t best_dist;
 598
 599         hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
 600                 size_t dist;
 601
 602                 if (ibf->inuse || ibf->size < ring_size)
 603                         continue;
 604                 dist = ibf->size - ring_size;
 605                 if (!best || dist < best_dist) {
 606                         best = ibf;
 607                         if (!dist)
 608                                 break;
 609                         best_dist = dist;
 610                 }
 611         }
 612
 613         return best;
 614 }
 615
 616 static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
 617                               struct io_uring_buf_reg *reg,
 618                               struct io_buffer_list *bl)
 619 {
 620         struct io_buf_free *ibf;
 621         size_t ring_size;
 622         void *ptr;
 623
 624         ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
 625
 626         /* Reuse existing entry, if we can */
 627         ibf = io_lookup_buf_free_entry(ctx, ring_size);
 628         if (!ibf) {
 629                 ptr = io_mem_alloc(ring_size);
 630                 if (IS_ERR(ptr))
 631                         return PTR_ERR(ptr);
 632
 633                 /* Allocate and store deferred free entry */
 634                 ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT);
 635                 if (!ibf) {
 636                         io_mem_free(ptr);
 637                         return -ENOMEM;
 638                 }
 639                 ibf->mem = ptr;
 640                 ibf->size = ring_size;
 641                 hlist_add_head(&ibf->list, &ctx->io_buf_list);
 642         }
 643         ibf->inuse = 1;
 644         bl->buf_ring = ibf->mem;
 645         bl->is_buf_ring = 1;
 646         bl->is_mmap = 1;
 647         return 0;
 648 }
 649
 650 int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 651 {
 652         struct io_uring_buf_reg reg;
 653         struct io_buffer_list *bl, *free_bl = NULL;
 654         int ret;
 655
 656         lockdep_assert_held(&ctx->uring_lock);
 657
 658         if (copy_from_user(&reg, arg, sizeof(reg)))
 659                 return -EFAULT;
 660
 661         if (reg.resv[0] || reg.resv[1] || reg.resv[2])
 662                 return -EINVAL;
 663         if (reg.flags & ~IOU_PBUF_RING_MMAP)
 664                 return -EINVAL;
 665         if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
 666                 if (!reg.ring_addr)
 667                         return -EFAULT;
 668                 if (reg.ring_addr & ~PAGE_MASK)
 669                         return -EINVAL;
 670         } else {
 671                 if (reg.ring_addr)
 672                         return -EINVAL;
 673         }
 674
 675         if (!is_power_of_2(reg.ring_entries))
 676                 return -EINVAL;
 677
 678         /* cannot disambiguate full vs empty due to head/tail size */
 679         if (reg.ring_entries >= 65536)
 680                 return -EINVAL;
 681
 682         if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
 683                 int ret = io_init_bl_list(ctx);
 684                 if (ret)
 685                         return ret;
 686         }
 687
 688         bl = io_buffer_get_list(ctx, reg.bgid);
 689         if (bl) {
 690                 /* if mapped buffer ring OR classic exists, don't allow */
 691                 if (bl->is_buf_ring || !list_empty(&bl->buf_list))
 692                         return -EEXIST;
 693         } else {
 694                 free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
 695                 if (!bl)
 696                         return -ENOMEM;
 697         }
 698
 699         if (!(reg.flags & IOU_PBUF_RING_MMAP))
 700                 ret = io_pin_pbuf_ring(&reg, bl);
 701         else
 702                 ret = io_alloc_pbuf_ring(ctx, &reg, bl);
 703
 704         if (!ret) {
 705                 bl->nr_entries = reg.ring_entries;
 706                 bl->mask = reg.ring_entries - 1;
 707
 708                 io_buffer_add_list(ctx, bl, reg.bgid);
 709                 return 0;
 710         }
 711
 712         kfree_rcu(free_bl, rcu);
 713         return ret;
 714 }
 715
 716 int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 717 {
 718         struct io_uring_buf_reg reg;
 719         struct io_buffer_list *bl;
 720
 721         lockdep_assert_held(&ctx->uring_lock);
 722
 723         if (copy_from_user(&reg, arg, sizeof(reg)))
 724                 return -EFAULT;
 725         if (reg.resv[0] || reg.resv[1] || reg.resv[2])
 726                 return -EINVAL;
 727         if (reg.flags)
 728                 return -EINVAL;
 729
 730         bl = io_buffer_get_list(ctx, reg.bgid);
 731         if (!bl)
 732                 return -ENOENT;
 733         if (!bl->is_buf_ring)
 734                 return -EINVAL;
 735
 736         __io_remove_buffers(ctx, bl, -1U);
 737         if (bl->bgid >= BGID_ARRAY) {
 738                 xa_erase(&ctx->io_bl_xa, bl->bgid);
 739                 kfree_rcu(bl, rcu);
 740         }
 741         return 0;
 742 }
 743
 744 int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
 745 {
 746         struct io_uring_buf_status buf_status;
 747         struct io_buffer_list *bl;
 748         int i;
 749
 750         if (copy_from_user(&buf_status, arg, sizeof(buf_status)))
 751                 return -EFAULT;
 752
 753         for (i = 0; i < ARRAY_SIZE(buf_status.resv); i++)
 754                 if (buf_status.resv[i])
 755                         return -EINVAL;
 756
 757         bl = io_buffer_get_list(ctx, buf_status.buf_group);
 758         if (!bl)
 759                 return -ENOENT;
 760         if (!bl->is_buf_ring)
 761                 return -EINVAL;
 762
 763         buf_status.head = bl->head;
 764         if (copy_to_user(arg, &buf_status, sizeof(buf_status)))
 765                 return -EFAULT;
 766
 767         return 0;
 768 }
 769
 770 void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
 771 {
 772         struct io_buffer_list *bl;
 773
 774         bl = __io_buffer_get_list(ctx, smp_load_acquire(&ctx->io_bl), bgid);
 775
 776         if (!bl || !bl->is_mmap)
 777                 return NULL;
 778         /*
 779          * Ensure the list is fully setup. Only strictly needed for RCU lookup
 780          * via mmap, and in that case only for the array indexed groups. For
 781          * the xarray lookups, it's either visible and ready, or not at all.
 782          */
 783         if (!smp_load_acquire(&bl->is_ready))
 784                 return NULL;
 785
 786         return bl->buf_ring;
 787 }
 788
 789 /*
 790  * Called at or after ->release(), free the mmap'ed buffers that we used
 791  * for memory mapped provided buffer rings.
 792  */
 793 void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx)
 794 {
 795         struct io_buf_free *ibf;
 796         struct hlist_node *tmp;
 797
 798         hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) {
 799                 hlist_del(&ibf->list);
 800                 io_mem_free(ibf->mem);
 801                 kfree(ibf);
 802         }
 803 }