// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "opdef.h"
#include "kbuf.h"

#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))

/* Buffer group IDs below this live in the flat ctx->io_bl array */
#define BGID_ARRAY	64

/* BIDs are addressed by a 16-bit field in a CQE */
#define MAX_BIDS_PER_BGID (1 << 16)

struct kmem_cache *io_buf_cachep;

/* Decoded SQE payload for IORING_OP_PROVIDE_BUFFERS / IORING_OP_REMOVE_BUFFERS */
struct io_provide_buf {
	struct file			*file;
	__u64				addr;
	__u32				len;
	__u32				bgid;
	__u32				nbufs;
	__u16				bid;
};

/* Tracks a mapped buffer ring region whose memory is freed at ring release */
struct io_buf_free {
	struct hlist_node		list;
	void				*mem;
	size_t				size;
	int				inuse;
};

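/*
 * Illustrative userspace flow for classic provided buffers (not kernel code;
 * assumes liburing, error handling omitted). The prep helper fills the SQE
 * fields that io_provide_buffers_prep() below decodes:
 *
 *	void *base = malloc(8 * 4096);
 *
 *	// publish 8 buffers of 4K each as group 1, buffer IDs starting at 0
 *	io_uring_prep_provide_buffers(sqe, base, 4096, 8, 1, 0);
 *
 *	// a later recv/read with IOSQE_BUFFER_SELECT and buf_group = 1 will
 *	// have the kernel pick one of these buffers via io_buffer_select().
 */
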
static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
							unsigned int bgid)
{
	/* low group IDs live in the flat array, the rest in the xarray */
	if (ctx->io_bl && bgid < BGID_ARRAY)
		return &ctx->io_bl[bgid];

	return xa_load(&ctx->io_bl_xa, bgid);
}

static int io_buffer_add_list(struct io_ring_ctx *ctx,
			      struct io_buffer_list *bl, unsigned int bgid)
{
	bl->bgid = bgid;
	if (bgid < BGID_ARRAY)
		return 0;

	return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}

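/*
 * Note on lifetime: entries stored in ctx->io_bl_xa are individually
 * allocated and must be freed by whoever erases them (see io_destroy_buffers()
 * and io_unregister_pbuf_ring()), while slots with bgid < BGID_ARRAY live
 * inside the ctx->io_bl allocation itself.
 */
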
bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	struct io_buffer *buf;

	/*
	 * For legacy provided buffer mode, don't recycle if we already did
	 * IO to this buffer. For ring-mapped provided buffer mode, we should
	 * increment ring->head to explicitly monopolize the buffer to avoid
	 * multiple use.
	 */
	if (req->flags & REQ_F_PARTIAL_IO)
		return false;

	io_ring_submit_lock(ctx, issue_flags);

	buf = req->kbuf;
	bl = io_buffer_get_list(ctx, buf->bgid);
	list_add(&buf->list, &bl->buf_list);
	req->flags &= ~REQ_F_BUFFER_SELECTED;
	req->buf_index = buf->bgid;

	io_ring_submit_unlock(ctx, issue_flags);
	return true;
}

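/*
 * Callers normally go through the io_kbuf_recycle() wrapper (in kbuf.h),
 * which dispatches to the legacy path above or the ring-mapped variant
 * depending on REQ_F_BUFFER_SELECTED vs REQ_F_BUFFER_RING.
 */
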
unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
{
	unsigned int cflags;

	/*
	 * We can add this buffer back to two lists:
	 *
	 * 1) The io_buffers_cache list. This one is protected by the
	 *    ctx->uring_lock. If we already hold this lock, add back to this
	 *    list as we can grab it from issue as well.
	 * 2) The io_buffers_comp list. This one is protected by the
	 *    ctx->completion_lock.
	 *
	 * We migrate buffers from the comp_list to the issue cache list
	 * when we need one.
	 */
	if (req->flags & REQ_F_BUFFER_RING) {
		/* no buffers to recycle for this case */
		cflags = __io_put_kbuf_list(req, NULL);
	} else if (issue_flags & IO_URING_F_UNLOCKED) {
		struct io_ring_ctx *ctx = req->ctx;

		spin_lock(&ctx->completion_lock);
		cflags = __io_put_kbuf_list(req, &ctx->io_buffers_comp);
		spin_unlock(&ctx->completion_lock);
	} else {
		lockdep_assert_held(&req->ctx->uring_lock);

		cflags = __io_put_kbuf_list(req, &req->ctx->io_buffers_cache);
	}
	return cflags;
}

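/*
 * The returned cflags end up in the CQE: the selected buffer ID is packed
 * into the upper bits and IORING_CQE_F_BUFFER is set, i.e. roughly
 *
 *	cqe->flags = IORING_CQE_F_BUFFER | (bid << IORING_CQE_BUFFER_SHIFT);
 *
 * which is why a group is limited to MAX_BIDS_PER_BGID buffer IDs.
 */
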
static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
					      struct io_buffer_list *bl)
{
	if (!list_empty(&bl->buf_list)) {
		struct io_buffer *kbuf;

		kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
		list_del(&kbuf->list);
		if (*len == 0 || *len > kbuf->len)
			*len = kbuf->len;
		req->flags |= REQ_F_BUFFER_SELECTED;
		req->kbuf = kbuf;
		req->buf_index = kbuf->bid;
		return u64_to_user_ptr(kbuf->addr);
	}

	return NULL;
}

static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
					  struct io_buffer_list *bl,
					  unsigned int issue_flags)
{
	struct io_uring_buf_ring *br = bl->buf_ring;
	struct io_uring_buf *buf;
	__u16 head = bl->head;

	if (unlikely(smp_load_acquire(&br->tail) == head))
		return NULL;

	head &= bl->mask;
	/* mmap'ed buffers are always contiguous */
	if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) {
		buf = &br->bufs[head];
	} else {
		int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
		int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;
		buf = page_address(bl->buf_pages[index]);
		buf += off;
	}
	if (*len == 0 || *len > buf->len)
		*len = buf->len;
	req->flags |= REQ_F_BUFFER_RING;
	req->buf_list = bl;
	req->buf_index = buf->bid;

	if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) {
		/*
		 * If we came in unlocked, we have no choice but to consume the
		 * buffer here, otherwise nothing ensures that the buffer won't
		 * get used by others. This does mean it'll be pinned until the
		 * IO completes, coming in unlocked means we're being called from
		 * io-wq context and there may be further retries in async hybrid
		 * mode. For the locked case, the caller must call commit when
		 * the transfer completes (or if we get -EAGAIN and must poll or
		 * retry).
		 */
		req->buf_list = NULL;
		bl->head++;
	}
	return u64_to_user_ptr(buf->addr);
}

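/*
 * Illustrative userspace counterpart (not kernel code; assumes liburing >= 2.4,
 * error handling omitted). The application owns the tail side of the ring that
 * io_ring_buffer_select() consumes from:
 *
 *	int ret;
 *	struct io_uring_buf_ring *br;
 *
 *	br = io_uring_setup_buf_ring(&ring, 8, 1, 0, &ret);
 *	for (int i = 0; i < 8; i++)
 *		io_uring_buf_ring_add(br, bufs[i], 4096, i,
 *				      io_uring_buf_ring_mask(8), i);
 *	io_uring_buf_ring_advance(br, 8);
 *
 * Requests submitted with IOSQE_BUFFER_SELECT and buf_group = 1 then pick
 * buffers from this ring.
 */
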
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
			      unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	void __user *ret = NULL;

	io_ring_submit_lock(req->ctx, issue_flags);

	bl = io_buffer_get_list(ctx, req->buf_index);
	if (likely(bl)) {
		if (bl->is_mapped)
			ret = io_ring_buffer_select(req, len, bl, issue_flags);
		else
			ret = io_provided_buffer_select(req, len, bl);
	}
	io_ring_submit_unlock(req->ctx, issue_flags);
	return ret;
}

static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
{
	int i;

	ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list),
				GFP_KERNEL);
	if (!ctx->io_bl)
		return -ENOMEM;

	for (i = 0; i < BGID_ARRAY; i++) {
		INIT_LIST_HEAD(&ctx->io_bl[i].buf_list);
		ctx->io_bl[i].bgid = i;
	}

	return 0;
}

/*
 * Mark the given mapped range as free for reuse
 */
static void io_kbuf_mark_free(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
{
	struct io_buf_free *ibf;

	hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
		if (bl->buf_ring == ibf->mem) {
			ibf->inuse = 0;
			return;
		}
	}

	/* can't happen... */
	WARN_ON_ONCE(1);
}

static int __io_remove_buffers(struct io_ring_ctx *ctx,
			       struct io_buffer_list *bl, unsigned nbufs)
{
	unsigned i = 0;

	/* shouldn't happen */
	if (!nbufs)
		return 0;

	if (bl->is_mapped) {
		i = bl->buf_ring->tail - bl->head;
		if (bl->is_mmap) {
			/*
			 * io_kbuf_mmap_list_free() will free the page(s) at
			 * ->release() time.
			 */
			io_kbuf_mark_free(ctx, bl);
			bl->buf_ring = NULL;
			bl->is_mmap = 0;
		} else if (bl->buf_nr_pages) {
			int j;

			for (j = 0; j < bl->buf_nr_pages; j++)
				unpin_user_page(bl->buf_pages[j]);
			kvfree(bl->buf_pages);
			bl->buf_pages = NULL;
			bl->buf_nr_pages = 0;
		}
		/* make sure it's seen as empty */
		INIT_LIST_HEAD(&bl->buf_list);
		bl->is_mapped = 0;
		return i;
	}

	/* protects io_buffers_cache */
	lockdep_assert_held(&ctx->uring_lock);

	while (!list_empty(&bl->buf_list)) {
		struct io_buffer *nxt;

		nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
		list_move(&nxt->list, &ctx->io_buffers_cache);
		if (++i == nbufs)
			return i;
		cond_resched();
	}

	return i;
}

void io_destroy_buffers(struct io_ring_ctx *ctx)
{
	struct io_buffer_list *bl;
	struct list_head *item, *tmp;
	struct io_buffer *buf;
	unsigned long index;
	int i;

	for (i = 0; i < BGID_ARRAY; i++) {
		if (!ctx->io_bl)
			break;
		__io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
	}

	xa_for_each(&ctx->io_bl_xa, index, bl) {
		xa_erase(&ctx->io_bl_xa, bl->bgid);
		__io_remove_buffers(ctx, bl, -1U);
		kfree(bl);
	}

	list_for_each_safe(item, tmp, &ctx->io_buffers_cache) {
		buf = list_entry(item, struct io_buffer, list);
		kmem_cache_free(io_buf_cachep, buf);
	}
}

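/*
 * io_destroy_buffers() runs from ring teardown (io_ring_ctx_free() in
 * io_uring.c), once no requests can still be consuming buffers, with
 * ctx->uring_lock held to satisfy the lockdep assertion above.
 */
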
int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	u64 tmp;

	if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
	    sqe->splice_fd_in)
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
	if (!tmp || tmp > MAX_BIDS_PER_BGID)
		return -EINVAL;

	memset(p, 0, sizeof(*p));
	p->nbufs = tmp;
	p->bgid = READ_ONCE(sqe->buf_group);
	return 0;
}

int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);

	ret = -ENOENT;
	bl = io_buffer_get_list(ctx, p->bgid);
	if (bl) {
		ret = -EINVAL;
		/* can't use provide/remove buffers command on mapped buffers */
		if (!bl->is_mapped)
			ret = __io_remove_buffers(ctx, bl, p->nbufs);
	}
	io_ring_submit_unlock(ctx, issue_flags);
	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	unsigned long size, tmp_check;
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	u64 tmp;

	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
	if (!tmp || tmp > MAX_BIDS_PER_BGID)
		return -E2BIG;
	p->nbufs = tmp;
	p->addr = READ_ONCE(sqe->addr);
	p->len = READ_ONCE(sqe->len);

	if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
				&size))
		return -EOVERFLOW;
	if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
		return -EOVERFLOW;

	size = (unsigned long)p->len * p->nbufs;
	if (!access_ok(u64_to_user_ptr(p->addr), size))
		return -EFAULT;

	p->bgid = READ_ONCE(sqe->buf_group);
	tmp = READ_ONCE(sqe->off);
	if (tmp > USHRT_MAX)
		return -E2BIG;
	if (tmp + p->nbufs > MAX_BIDS_PER_BGID)
		return -EINVAL;
	p->bid = tmp;
	return 0;
}

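/*
 * SQE field reuse for IORING_OP_PROVIDE_BUFFERS, as decoded above:
 *
 *	sqe->addr	base address of the contiguous buffer area
 *	sqe->len	length of each individual buffer
 *	sqe->fd		number of buffers (nbufs)
 *	sqe->buf_group	buffer group ID (bgid)
 *	sqe->off	first buffer ID to assign (bid)
 */
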
#define IO_BUFFER_ALLOC_BATCH 64

static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
{
	struct io_buffer *bufs[IO_BUFFER_ALLOC_BATCH];
	int allocated;

	/*
	 * Completions that don't happen inline (e.g. not under uring_lock) will
	 * add to ->io_buffers_comp. If we don't have any free buffers, check
	 * the completion list and splice those entries first.
	 */
	if (!list_empty_careful(&ctx->io_buffers_comp)) {
		spin_lock(&ctx->completion_lock);
		if (!list_empty(&ctx->io_buffers_comp)) {
			list_splice_init(&ctx->io_buffers_comp,
						&ctx->io_buffers_cache);
			spin_unlock(&ctx->completion_lock);
			return 0;
		}
		spin_unlock(&ctx->completion_lock);
	}

	/*
	 * No free buffers and no completion entries either. Allocate a new
	 * batch of buffer entries and add those to our freelist.
	 */
	allocated = kmem_cache_alloc_bulk(io_buf_cachep, GFP_KERNEL_ACCOUNT,
					  ARRAY_SIZE(bufs), (void **) bufs);
	if (unlikely(!allocated)) {
		/*
		 * Bulk alloc is all-or-nothing. If we fail to get a batch,
		 * retry single alloc to be on the safe side.
		 */
		bufs[0] = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL);
		if (!bufs[0])
			return -ENOMEM;
		allocated = 1;
	}

	while (allocated)
		list_add_tail(&bufs[--allocated]->list, &ctx->io_buffers_cache);

	return 0;
}

static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
			  struct io_buffer_list *bl)
{
	struct io_buffer *buf;
	u64 addr = pbuf->addr;
	int i, bid = pbuf->bid;

	for (i = 0; i < pbuf->nbufs; i++) {
		if (list_empty(&ctx->io_buffers_cache) &&
		    io_refill_buffer_cache(ctx))
			break;
		buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
					list);
		list_move_tail(&buf->list, &bl->buf_list);
		buf->addr = addr;
		buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
		buf->bid = bid;
		buf->bgid = pbuf->bgid;
		addr += pbuf->len;
		bid++;
		cond_resched();
	}

	return i ? 0 : -ENOMEM;
}

int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);

	if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
		ret = io_init_bl_list(ctx);
		if (ret)
			goto err;
	}

	bl = io_buffer_get_list(ctx, p->bgid);
	if (unlikely(!bl)) {
		bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
		if (!bl) {
			ret = -ENOMEM;
			goto err;
		}
		INIT_LIST_HEAD(&bl->buf_list);
		ret = io_buffer_add_list(ctx, bl, p->bgid);
		if (ret) {
			kfree(bl);
			goto err;
		}
	}
	/* can't add buffers via this command for a mapped buffer ring */
	if (bl->is_mapped) {
		ret = -EINVAL;
		goto err;
	}

	ret = io_add_buffers(ctx, p, bl);
err:
	io_ring_submit_unlock(ctx, issue_flags);

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
			    struct io_buffer_list *bl)
{
	struct io_uring_buf_ring *br;
	struct page **pages;
	int i, nr_pages;

	pages = io_pin_pages(reg->ring_addr,
			     flex_array_size(br, bufs, reg->ring_entries),
			     &nr_pages);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/*
	 * Apparently some 32-bit boxes (ARM) will return highmem pages,
	 * which then need to be mapped. We could support that, but it'd
	 * complicate the code and slow down the common cases quite a bit.
	 * So just error out, returning -EINVAL just like we did on kernels
	 * that didn't support mapped buffer rings.
	 */
	for (i = 0; i < nr_pages; i++)
		if (PageHighMem(pages[i]))
			goto error_unpin;

	br = page_address(pages[0]);
#ifdef SHM_COLOUR
	/*
	 * On platforms that have specific aliasing requirements, SHM_COLOUR
	 * is set and we must guarantee that the kernel and user side align
	 * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
	 * the application mmap's the provided ring buffer. Fail the request
	 * if we, by chance, don't end up with aligned addresses. The app
	 * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
	 * this transparently.
	 */
	if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1))
		goto error_unpin;
#endif
	bl->buf_pages = pages;
	bl->buf_nr_pages = nr_pages;
	bl->buf_ring = br;
	bl->is_mapped = 1;
	bl->is_mmap = 0;
	return 0;
error_unpin:
	for (i = 0; i < nr_pages; i++)
		unpin_user_page(pages[i]);
	kvfree(pages);
	return -EINVAL;
}

/*
 * See if we have a suitable region that we can reuse, rather than allocate
 * both a new io_buf_free and mem region again. We leave it on the list as
 * even a reused entry will need freeing at ring release.
 */
static struct io_buf_free *io_lookup_buf_free_entry(struct io_ring_ctx *ctx,
						    size_t ring_size)
{
	struct io_buf_free *ibf, *best = NULL;
	size_t best_dist;

	hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
		size_t dist;

		if (ibf->inuse || ibf->size < ring_size)
			continue;
		dist = ibf->size - ring_size;
		if (!best || dist < best_dist) {
			best = ibf;
			best_dist = dist;
		}
	}

	return best;
}

static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
			      struct io_uring_buf_reg *reg,
			      struct io_buffer_list *bl)
{
	struct io_buf_free *ibf;
	size_t ring_size;
	void *ptr;

	ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);

	/* Reuse existing entry, if we can */
	ibf = io_lookup_buf_free_entry(ctx, ring_size);
	if (!ibf) {
		ptr = io_mem_alloc(ring_size);
		if (IS_ERR(ptr))
			return PTR_ERR(ptr);

		/* Allocate and store deferred free entry */
		ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT);
		if (!ibf) {
			io_mem_free(ptr);
			return -ENOMEM;
		}
		ibf->mem = ptr;
		ibf->size = ring_size;
		hlist_add_head(&ibf->list, &ctx->io_buf_list);
	}
	ibf->inuse = 1;
	bl->buf_ring = ibf->mem;
	bl->is_mapped = 1;
	bl->is_mmap = 1;
	return 0;
}

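/*
 * For the IOU_PBUF_RING_MMAP case set up above, userspace maps the ring by
 * mmap()ing the ring fd with an offset that encodes the buffer group,
 * roughly (illustrative only; liburing's io_uring_setup_buf_ring() hides
 * this):
 *
 *	off_t off = IORING_OFF_PBUF_RING |
 *		    ((off_t)bgid << IORING_OFF_PBUF_SHIFT);
 *	br = mmap(NULL, ring_size, PROT_READ | PROT_WRITE,
 *		  MAP_SHARED | MAP_POPULATE, ring_fd, off);
 */
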
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_reg reg;
	struct io_buffer_list *bl, *free_bl = NULL;
	int ret;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;

	if (reg.resv[0] || reg.resv[1] || reg.resv[2])
		return -EINVAL;
	if (reg.flags & ~IOU_PBUF_RING_MMAP)
		return -EINVAL;
	if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
		if (!reg.ring_addr)
			return -EFAULT;
		if (reg.ring_addr & ~PAGE_MASK)
			return -EINVAL;
	} else {
		if (reg.ring_addr)
			return -EINVAL;
	}

	if (!is_power_of_2(reg.ring_entries))
		return -EINVAL;

	/* cannot disambiguate full vs empty due to head/tail size */
	if (reg.ring_entries >= 65536)
		return -EINVAL;

	if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
		int ret = io_init_bl_list(ctx);
		if (ret)
			return ret;
	}

	bl = io_buffer_get_list(ctx, reg.bgid);
	if (bl) {
		/* if mapped buffer ring OR classic exists, don't allow */
		if (bl->is_mapped || !list_empty(&bl->buf_list))
			return -EEXIST;
	} else {
		free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
		if (!bl)
			return -ENOMEM;
	}

	if (!(reg.flags & IOU_PBUF_RING_MMAP))
		ret = io_pin_pbuf_ring(&reg, bl);
	else
		ret = io_alloc_pbuf_ring(ctx, &reg, bl);

	if (!ret) {
		bl->nr_entries = reg.ring_entries;
		bl->mask = reg.ring_entries - 1;

		io_buffer_add_list(ctx, bl, reg.bgid);
		return 0;
	}

	kfree(free_bl);
	return ret;
}

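/*
 * Userspace reaches the registration path above via io_uring_register(2),
 * e.g. (illustrative only, error handling omitted):
 *
 *	struct io_uring_buf_reg reg = {
 *		.ring_entries	= 8,
 *		.bgid		= 1,
 *		.flags		= IOU_PBUF_RING_MMAP,
 *	};
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_PBUF_RING, &reg, 1);
 */
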
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_reg reg;
	struct io_buffer_list *bl;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (reg.resv[0] || reg.resv[1] || reg.resv[2])
		return -EINVAL;
	if (reg.flags)
		return -EINVAL;

	bl = io_buffer_get_list(ctx, reg.bgid);
	if (!bl)
		return -ENOENT;
	if (!bl->is_mapped)
		return -EINVAL;

	__io_remove_buffers(ctx, bl, -1U);
	if (bl->bgid >= BGID_ARRAY) {
		xa_erase(&ctx->io_bl_xa, bl->bgid);
		kfree(bl);
	}
	return 0;
}

void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
{
	struct io_buffer_list *bl;

	bl = io_buffer_get_list(ctx, bgid);
	if (!bl || !bl->is_mmap)
		return NULL;

	return bl->buf_ring;
}

/*
 * Called at or after ->release(), free the mmap'ed buffers that we used
 * for memory mapped provided buffer rings.
 */
void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx)
{
	struct io_buf_free *ibf;
	struct hlist_node *tmp;

	hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) {
		hlist_del(&ibf->list);
		io_mem_free(ibf->mem);
		kfree(ibf);
	}
}