fs/io_uring.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Shared application/kernel submission and completion ring pairs, for
   4  * supporting fast/efficient IO.
   5  *
   6  * A note on the read/write ordering memory barriers that are matched between
   7  * the application and kernel side.
   8  *
   9  * After the application reads the CQ ring tail, it must use an
  10  * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
  11  * before writing the tail (using smp_load_acquire to read the tail will
  12  * do). It also needs a smp_mb() before updating CQ head (ordering the
  13  * entry load(s) with the head store), pairing with an implicit barrier
  14  * through a control-dependency in io_get_cqe (smp_store_release to
  15  * store head will do). Failure to do so could lead to reading invalid
  16  * CQ entries.
  17  *
  18  * Likewise, the application must use an appropriate smp_wmb() before
  19  * writing the SQ tail (ordering SQ entry stores with the tail store),
  20  * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
  21  * to store the tail will do). And it needs a barrier ordering the SQ
  22  * head load before writing new SQ entries (smp_load_acquire to read
  23  * head will do).
  24  *
  25  * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
  26  * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
  27  * updating the SQ tail; a full memory barrier smp_mb() is needed
  28  * between.
  29  *
  30  * Also see the examples in the liburing library:
  31  *
  32  *      git://git.kernel.dk/liburing
  33  *
  34  * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
  35  * from data shared between the kernel and application. This is done both
  36  * for ordering purposes and to ensure that once a value is loaded from
  37  * data that the application could potentially modify, it remains stable.
  38  *
  39  * Copyright (C) 2018-2019 Jens Axboe
  40  * Copyright (c) 2018-2019 Christoph Hellwig
  41  */
  42 #include <linux/kernel.h>
  43 #include <linux/init.h>
  44 #include <linux/errno.h>
  45 #include <linux/syscalls.h>
  46 #include <linux/compat.h>
  47 #include <net/compat.h>
  48 #include <linux/refcount.h>
  49 #include <linux/uio.h>
  50 #include <linux/bits.h>
  51
  52 #include <linux/sched/signal.h>
  53 #include <linux/fs.h>
  54 #include <linux/file.h>
  55 #include <linux/fdtable.h>
  56 #include <linux/mm.h>
  57 #include <linux/mman.h>
  58 #include <linux/percpu.h>
  59 #include <linux/slab.h>
  60 #include <linux/blkdev.h>
  61 #include <linux/bvec.h>
  62 #include <linux/net.h>
  63 #include <net/sock.h>
  64 #include <net/af_unix.h>
  65 #include <net/scm.h>
  66 #include <linux/anon_inodes.h>
  67 #include <linux/sched/mm.h>
  68 #include <linux/uaccess.h>
  69 #include <linux/nospec.h>
  70 #include <linux/sizes.h>
  71 #include <linux/hugetlb.h>
  72 #include <linux/highmem.h>
  73 #include <linux/namei.h>
  74 #include <linux/fsnotify.h>
  75 #include <linux/fadvise.h>
  76 #include <linux/eventpoll.h>
  77 #include <linux/splice.h>
  78 #include <linux/task_work.h>
  79 #include <linux/pagemap.h>
  80 #include <linux/io_uring.h>
  81 #include <linux/tracehook.h>
  82
  83 #define CREATE_TRACE_POINTS
  84 #include <trace/events/io_uring.h>
  85
  86 #include <uapi/linux/io_uring.h>
  87
  88 #include "internal.h"
  89 #include "io-wq.h"
  90
  91 #define IORING_MAX_ENTRIES      32768
     /*
      * Size limits for ring setup. IORING_MAX_ENTRIES above is 2^15; the CQ
      * limit below is defined as twice that, i.e. 65536.
      * NOTE(review): IORING_MAX_ENTRIES is presumably the SQ-side cap -
      * confirm at the use site.
      */
  92 #define IORING_MAX_CQ_ENTRIES   (2 * IORING_MAX_ENTRIES)
  93 #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
  94
  95 /* only define max */
  96 #define IORING_MAX_FIXED_FILES  (1U << 15)
     /* Sum of the three *_LAST enumerators; the expression spans two lines. */
  97 #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
  98                                  IORING_REGISTER_LAST + IORING_OP_LAST)
  99
 100 #define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3)
     /*
      * 1 << (PAGE_SHIFT - 3) equals PAGE_SIZE / 8, i.e. the number of 8-byte
      * slots that fit in one page; MASK is MAX - 1 for extracting the low
      * index bits. NOTE(review): presumably sized for the u64 tags held in
      * struct io_rsrc_data - confirm.
      */
 101 #define IO_RSRC_TAG_TABLE_MAX   (1U << IO_RSRC_TAG_TABLE_SHIFT)
 102 #define IO_RSRC_TAG_TABLE_MASK  (IO_RSRC_TAG_TABLE_MAX - 1)
 103
     /* 2^14 = 16384. */
 104 #define IORING_MAX_REG_BUFFERS  (1U << 14)
 105
 106 #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
 107                           IOSQE_IO_HARDLINK | IOSQE_ASYNC)
 108
     /* SQE_COMMON_FLAGS plus the buffer-select and drain flags. */
 109 #define SQE_VALID_FLAGS (SQE_COMMON_FLAGS|IOSQE_BUFFER_SELECT|IOSQE_IO_DRAIN)
 110
     /*
      * Mask of REQ_F_* bits. NOTE(review): presumably the set of flags that
      * means a request still has state to clean up - confirm at use sites.
      */
 111 #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
 112                                 REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
 113                                 REQ_F_ASYNC_DATA)
 114
 115 #define IO_TCTX_REFS_CACHE_NR   (1U << 10)   /* 1024; presumably the refill size for io_uring_task.cached_refs - confirm */
 116
 117 struct io_uring {
             /*
              * Head/tail index pair for one ring; struct io_rings embeds two
              * of these (sq and cq). Each member is tagged
              * ____cacheline_aligned_in_smp.
              */
 118         u32 head ____cacheline_aligned_in_smp;
 119         u32 tail ____cacheline_aligned_in_smp;
 120 };
 121
 122 /*
 123  * This data is shared with the application through the mmap at offsets
 124  * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 125  *
 126  * The offsets to the member fields are published through struct
 127  * io_sqring_offsets when calling io_uring_setup.
 128  */
 129 struct io_rings {
 130         /*
 131          * Head and tail offsets into the ring; the offsets need to be
 132          * masked to get valid indices.
 133          *
 134          * The kernel controls head of the sq ring and the tail of the cq ring,
 135          * and the application controls tail of the sq ring and the head of the
 136          * cq ring.
 137          */
 138         struct io_uring         sq, cq;
 139         /*
 140          * Bitmasks to apply to head and tail offsets (constant, equals
 141          * ring_entries - 1)
 142          */
 143         u32                     sq_ring_mask, cq_ring_mask;
 144         /* Ring sizes (constant, power of 2) */
 145         u32                     sq_ring_entries, cq_ring_entries;
 146         /*
 147          * Number of invalid entries dropped by the kernel due to
 148          * invalid index stored in array
 149          *
 150          * Written by the kernel, shouldn't be modified by the
 151          * application (i.e. get number of "new events" by comparing to
 152          * cached value).
 153          *
 154          * After a new SQ head value was read by the application this
 155          * counter includes all submissions that were dropped reaching
 156          * the new SQ head (and possibly more).
 157          */
 158         u32                     sq_dropped;
 159         /*
 160          * Runtime SQ flags
 161          *
 162          * Written by the kernel, shouldn't be modified by the
 163          * application.
 164          *
 165          * The application needs a full memory barrier before checking
 166          * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
 167          */
 168         u32                     sq_flags;
 169         /*
 170          * Runtime CQ flags
 171          *
 172          * Written by the application, shouldn't be modified by the
 173          * kernel.
 174          */
 175         u32                     cq_flags;
 176         /*
 177          * Number of completion events lost because the queue was full;
 178          * this should be avoided by the application by making sure
 179          * there are not more requests pending than there is space in
 180          * the completion queue.
 181          *
 182          * Written by the kernel, shouldn't be modified by the
 183          * application (i.e. get number of "new events" by comparing to
 184          * cached value).
 185          *
 186          * As completion events come in out of order this counter is not
 187          * ordered with any other data.
 188          */
 189         u32                     cq_overflow;
 190         /*
 191          * Ring buffer of completion events.
 192          *
 193          * The kernel writes completion events fresh every time they are
 194          * produced, so the application is allowed to modify pending
 195          * entries.
 196          */
             /*
              * Flexible array member: it has no size of its own and must stay
              * the last field of this struct.
              */
 197         struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;
 198 };
 199
 200 enum io_uring_cmd_flags {
             /*
              * Two independent flag values: bit 0, and INT_MIN (only the sign
              * bit set), so the latter can be tested with a signed comparison
              * as the comment below explains.
              */
 201         IO_URING_F_COMPLETE_DEFER       = 1,
 202         /* int's last bit, sign checks are usually faster than a bit test */
 203         IO_URING_F_NONBLOCK             = INT_MIN,
 204 };
 205
 206 struct io_mapped_ubuf {
             /*
              * Descriptor for one mapped user buffer; io_ring_ctx.user_bufs is
              * an array of pointers to these.
              * NOTE(review): ubuf/ubuf_end presumably hold the start and end
              * user addresses, and nr_bvecs the length of bvec[] - confirm.
              */
 207         u64             ubuf;
 208         u64             ubuf_end;
 209         unsigned int    nr_bvecs;
 210         unsigned long   acct_pages;
             /* Flexible array member; must remain the last field. */
 211         struct bio_vec  bvec[];
 212 };
 213
 214 struct io_ring_ctx;     /* forward declaration; the full definition follows later in this file */
 215
 216 struct io_overflow_cqe {
             /*
              * A completion entry paired with a list link.
              * NOTE(review): presumably queued on io_ring_ctx.cq_overflow_list
              * when the CQ ring has no room - confirm.
              */
 217         struct io_uring_cqe cqe;
 218         struct list_head list;
 219 };
 220
 221 struct io_fixed_file {
             /* Element type of the io_file_table.files array. */
 222         /* file * with additional FFS_* flags */
 223         unsigned long file_ptr;
 224 };
 225
 226 struct io_rsrc_put {
             /*
              * One resource handed to an rsrc_put_fn callback (see the
              * rsrc_put_fn typedef), with its u64 tag and a list link.
              */
 227         struct list_head list;
 228         u64 tag;
             /* Three typed views of the same pointer: generic, file, buffer. */
 229         union {
 230                 void *rsrc;
 231                 struct file *file;
 232                 struct io_mapped_ubuf *buf;
 233         };
 234 };
 235
 236 struct io_file_table {
             /* Pointer to io_fixed_file entries; embedded in io_ring_ctx as file_table. */
 237         struct io_fixed_file *files;
 238 };
 239
 240 struct io_rsrc_node {
             /*
              * Reference-counted (percpu_ref) node pointing back at an
              * io_rsrc_data. io_ring_ctx keeps a current one (rsrc_node) and a
              * spare (rsrc_backup_node).
              * NOTE(review): 'node' and 'llist' presumably link into
              * io_ring_ctx.rsrc_ref_list and rsrc_put_llist, and 'rsrc_list'
              * presumably collects io_rsrc_put entries - confirm.
              */
 241         struct percpu_ref               refs;
 242         struct list_head                node;
 243         struct list_head                rsrc_list;
 244         struct io_rsrc_data             *rsrc_data;
 245         struct llist_node               llist;
 246         bool                            done;
 247 };
 248
 249 typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
     /* ^ rsrc_put_fn names a function type, not a pointer; io_rsrc_data stores it as 'rsrc_put_fn *do_put'. */
 250
 251 struct io_rsrc_data {
             /*
              * Bookkeeping for one class of registered resources; io_ring_ctx
              * holds two pointers of this type, file_data and buf_data.
              */
 252         struct io_ring_ctx              *ctx;
 253
             /*
              * Two-level table of u64 tags.
              * NOTE(review): presumably indexed using IO_RSRC_TAG_TABLE_SHIFT
              * and IO_RSRC_TAG_TABLE_MASK, with 'nr' the entry count - confirm.
              */
 254         u64                             **tags;
 255         unsigned int                    nr;
             /* Callback of type rsrc_put_fn, taking the ctx and an io_rsrc_put. */
 256         rsrc_put_fn                     *do_put;
 257         atomic_t                        refs;
 258         struct completion               done;
 259         bool                            quiesce;
 260 };
 261
 262 struct io_buffer {
             /*
              * List-linked buffer record with a 64-bit address, 32-bit length
              * and 16-bit id.
              * NOTE(review): the addr/len/bid fields mirror struct
              * io_provide_buf, so this is presumably one provided buffer -
              * confirm.
              */
 263         struct list_head list;
 264         __u64 addr;
 265         __u32 len;
 266         __u16 bid;
 267 };
 268
 269 struct io_restriction {
             /*
              * Restriction state, embedded in io_ring_ctx as 'restrictions'.
              * The two bitmaps are sized by IORING_REGISTER_LAST and
              * IORING_OP_LAST respectively.
              * NOTE(review): presumably one bit per permitted register opcode
              * and per permitted SQE opcode - confirm.
              */
 270         DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
 271         DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
 272         u8 sqe_flags_allowed;
 273         u8 sqe_flags_required;
 274         bool registered;
 275 };
 276
 277 enum {
             /*
              * Values 0 and 1.
              * NOTE(review): presumably bit numbers within io_sq_data.state
              * (an unsigned long) - confirm at the set_bit/test_bit call sites.
              */
 278         IO_SQ_THREAD_SHOULD_STOP = 0,
 279         IO_SQ_THREAD_SHOULD_PARK,
 280 };
 281
 282 struct io_sq_data {
             /*
              * State for an SQ polling thread; io_ring_ctx.sq_data points at
              * one of these ("if using sq thread polling"). Carries its own
              * refcount and mutex.
              */
 283         refcount_t              refs;
 284         atomic_t                park_pending;
 285         struct mutex            lock;
 286
 287         /* ctx's that are using this sqd */
 288         struct list_head        ctx_list;
 289
 290         struct task_struct      *thread;
 291         struct wait_queue_head  wait;
 292
 293         unsigned                sq_thread_idle;
 294         int                     sq_cpu;
 295         pid_t                   task_pid;
 296         pid_t                   task_tgid;
 297
             /* NOTE(review): presumably holds the IO_SQ_THREAD_SHOULD_* bits - confirm. */
 298         unsigned long           state;
 299         struct completion       exited;
 300 };
 301
 302 #define IO_COMPL_BATCH                  32
     /*
      * NOTE(review): going by the names, these are presumably the completion
      * batch size, the request cache size and the request allocation batch
      * size - confirm at the use sites.
      */
 303 #define IO_REQ_CACHE_SIZE               32
 304 #define IO_REQ_ALLOC_BATCH              8
 305
 306 struct io_submit_link {
             /*
              * Pair of request pointers, embedded in io_submit_state as 'link'.
              * NOTE(review): presumably the first and most recent request of
              * the link chain being assembled - confirm.
              */
 307         struct io_kiocb         *head;
 308         struct io_kiocb         *last;
 309 };
 310
 311 struct io_submit_state {
             /* Submission-time scratch state; embedded in io_ring_ctx as submit_state. */
 312         /* inline/task_work completion list, under ->uring_lock */
 313         struct io_wq_work_node  free_list;
 314         /* batch completion logic */
 315         struct io_wq_work_list  compl_reqs;
 316         struct io_submit_link   link;
 317
             /* NOTE(review): the two bools presumably track use of 'plug' below - confirm. */
 318         bool                    plug_started;
 319         bool                    need_plug;
 320         struct blk_plug         plug;
 321 };
 322
 323 struct io_ring_ctx {
             /*
              * Per-ring context. Members are grouped into anonymous structs;
              * the groups marked ____cacheline_aligned_in_smp keep hot data
              * apart, while the last two groups are described as slow-path
              * state by the comments that introduce them.
              */
 324         /* const or read-mostly hot data */
 325         struct {
 326                 struct percpu_ref       refs;
 327
 328                 struct io_rings         *rings;
 329                 unsigned int            flags;
 330                 unsigned int            compat: 1;
 331                 unsigned int            drain_next: 1;
 332                 unsigned int            eventfd_async: 1;
 333                 unsigned int            restricted: 1;
 334                 unsigned int            off_timeout_used: 1;
 335                 unsigned int            drain_active: 1;
 336         } ____cacheline_aligned_in_smp;
 337
 338         /* submission data */
 339         struct {
 340                 struct mutex            uring_lock;
 341
 342                 /*
 343                  * Ring buffer of indices into array of io_uring_sqe, which is
 344                  * mmapped by the application using the IORING_OFF_SQES offset.
 345                  *
 346                  * This indirection could e.g. be used to assign fixed
 347                  * io_uring_sqe entries to operations and only submit them to
 348                  * the queue when needed.
 349                  *
 350                  * The kernel modifies neither the indices array nor the entries
 351                  * array.
 352                  */
 353                 u32                     *sq_array;
 354                 struct io_uring_sqe     *sq_sqes;
 355                 unsigned                cached_sq_head;
 356                 unsigned                sq_entries;
 357                 struct list_head        defer_list;
 358
 359                 /*
 360                  * Fixed resources fast path, should be accessed only under
 361                  * uring_lock, and updated through io_uring_register(2)
 362                  */
 363                 struct io_rsrc_node     *rsrc_node;
 364                 int                     rsrc_cached_refs;
 365                 struct io_file_table    file_table;
 366                 unsigned                nr_user_files;
 367                 unsigned                nr_user_bufs;
 368                 struct io_mapped_ubuf   **user_bufs;
 369
 370                 struct io_submit_state  submit_state;
 371                 struct list_head        timeout_list;
 372                 struct list_head        ltimeout_list;
 373                 struct list_head        cq_overflow_list;
 374                 struct xarray           io_buffers;
 375                 struct xarray           personalities;
 376                 u32                     pers_next;
 377                 unsigned                sq_thread_idle;
 378         } ____cacheline_aligned_in_smp;
 379
 380         /* IRQ completion list, under ->completion_lock */
 381         struct io_wq_work_list  locked_free_list;
 382         unsigned int            locked_free_nr;
 383
 384         const struct cred       *sq_creds;      /* cred used for __io_sq_thread() */
 385         struct io_sq_data       *sq_data;       /* if using sq thread polling */
 386
 387         struct wait_queue_head  sqo_sq_wait;
 388         struct list_head        sqd_list;
 389
 390         unsigned long           check_cq_overflow;
 391
             /* Completion-queue side state. */
 392         struct {
 393                 unsigned                cached_cq_tail;
 394                 unsigned                cq_entries;
 395                 struct eventfd_ctx      *cq_ev_fd;
 396                 struct wait_queue_head  cq_wait;
 397                 unsigned                cq_extra;
 398                 atomic_t                cq_timeouts;
 399                 unsigned                cq_last_tm_flush;
 400         } ____cacheline_aligned_in_smp;
 401
 402         struct {
 403                 spinlock_t              completion_lock;
 404
 405                 spinlock_t              timeout_lock;
 406
 407                 /*
 408                  * ->iopoll_list is protected by the ctx->uring_lock for
 409                  * io_uring instances that don't use IORING_SETUP_SQPOLL.
 410                  * For SQPOLL, only the single threaded io_sq_thread() will
 411                  * manipulate the list, hence no extra locking is needed there.
 412                  */
 413                 struct io_wq_work_list  iopoll_list;
 414                 struct hlist_head       *cancel_hash;
 415                 unsigned                cancel_hash_bits;
 416                 bool                    poll_multi_queue;
 417         } ____cacheline_aligned_in_smp;
 418
 419         struct io_restriction           restrictions;
 420
 421         /* slow path rsrc auxiliary data, used by update/register */
 422         struct {
 423                 struct io_rsrc_node             *rsrc_backup_node;
 424                 struct io_mapped_ubuf           *dummy_ubuf;
 425                 struct io_rsrc_data             *file_data;
 426                 struct io_rsrc_data             *buf_data;
 427
 428                 struct delayed_work             rsrc_put_work;
 429                 struct llist_head               rsrc_put_llist;
 430                 struct list_head                rsrc_ref_list;
 431                 spinlock_t                      rsrc_ref_lock;
 432         };
 433
 434         /* Keep this last, we don't need it for the fast path */
 435         struct {
                     /* ring_sock exists only when CONFIG_UNIX is defined. */
 436                 #if defined(CONFIG_UNIX)
 437                         struct socket           *ring_sock;
 438                 #endif
 439                 /* hashed buffered write serialization */
 440                 struct io_wq_hash               *hash_map;
 441
 442                 /* Only used for accounting purposes */
 443                 struct user_struct              *user;
 444                 struct mm_struct                *mm_account;
 445
 446                 /* ctx exit and cancelation */
 447                 struct llist_head               fallback_llist;
 448                 struct delayed_work             fallback_work;
 449                 struct work_struct              exit_work;
 450                 struct list_head                tctx_list;
 451                 struct completion               ref_comp;
 452         };
 453 };
 454
 455 struct io_uring_task {
             /*
              * NOTE(review): going by the name, per-task io_uring state -
              * confirm where it is allocated and attached.
              */
 456         /* submission side */
 457         int                     cached_refs;
 458         struct xarray           xa;
 459         struct wait_queue_head  wait;
 460         const struct io_ring_ctx *last;
 461         struct io_wq            *io_wq;
 462         struct percpu_counter   inflight;
 463         atomic_t                inflight_tracked;
 464         atomic_t                in_idle;
 465
             /*
              * Task-work queueing state.
              * NOTE(review): task_lock presumably guards task_list and
              * task_running - confirm.
              */
 466         spinlock_t              task_lock;
 467         struct io_wq_work_list  task_list;
 468         struct callback_head    task_work;
 469         bool                    task_running;
 470 };
 471
 472 /*
 473  * First field must be the file pointer in all the
 474  * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 475  */
 476 struct io_poll_iocb {
             /*
              * 'poll' member of the io_kiocb union; also embedded in struct
              * async_poll. 'file' must stay the first field.
              */
 477         struct file                     *file;
 478         struct wait_queue_head          *head;
 479         __poll_t                        events;
 480         bool                            done;
 481         bool                            canceled;
 482         struct wait_queue_entry         wait;
 483 };
 484
 485 struct io_poll_update {
             /*
              * 'poll_update' member of the io_kiocb union; 'file' must stay
              * first.
              * NOTE(review): the two update_* bools presumably select which of
              * events / new_user_data get applied - confirm.
              */
 486         struct file                     *file;
 487         u64                             old_user_data;
 488         u64                             new_user_data;
 489         __poll_t                        events;
 490         bool                            update_events;
 491         bool                            update_user_data;
 492 };
 493
 494 struct io_close {
             /*
              * 'close' member of the io_kiocb union; 'file' must stay first.
              * NOTE(review): fd vs. file_slot presumably distinguish a normal
              * descriptor from a fixed-file slot - confirm.
              */
 495         struct file                     *file;
 496         int                             fd;
 497         u32                             file_slot;
 498 };
 499
 500 struct io_timeout_data {
             /*
              * Timer state with a back-pointer to its request. Unlike the
              * iocb structs around it, this one does not begin with a file
              * pointer and is not listed in the io_kiocb union.
              */
 501         struct io_kiocb                 *req;
 502         struct hrtimer                  timer;
 503         struct timespec64               ts;
 504         enum hrtimer_mode               mode;
 505         u32                             flags;
 506 };
 507
 508 struct io_accept {
             /*
              * 'accept' member of the io_kiocb union; 'file' must stay first.
              * addr and addr_len are user-space pointers (__user).
              */
 509         struct file                     *file;
 510         struct sockaddr __user          *addr;
 511         int __user                      *addr_len;
 512         int                             flags;
 513         u32                             file_slot;
 514         unsigned long                   nofile;
 515 };
 516
 517 struct io_sync {
             /* 'sync' member of the io_kiocb union; 'file' must stay first. */
 518         struct file                     *file;
 519         loff_t                          len;
 520         loff_t                          off;
 521         int                             flags;
 522         int                             mode;
 523 };
 524
 525 struct io_cancel {
             /*
              * 'cancel' member of the io_kiocb union; 'file' must stay first.
              * NOTE(review): addr presumably identifies the target request
              * (e.g. by user_data) - confirm.
              */
 526         struct file                     *file;
 527         u64                             addr;
 528 };
 529
 530 struct io_timeout {
             /* 'timeout' member of the io_kiocb union; 'file' must stay first. */
 531         struct file                     *file;
 532         u32                             off;
 533         u32                             target_seq;
 534         struct list_head                list;
 535         /* head of the link, used by linked timeouts only */
 536         struct io_kiocb                 *head;
 537         /* for linked completions */
 538         struct io_kiocb                 *prev;
 539 };
 540
 541 struct io_timeout_rem {
             /*
              * 'timeout_rem' member of the io_kiocb union; 'file' must stay
              * first.
              */
 542         struct file                     *file;
 543         u64                             addr;
 544
 545         /* timeout update */
 546         struct timespec64               ts;
 547         u32                             flags;
 548         bool                            ltimeout;
 549 };
 550
 551 struct io_rw {
             /*
              * 'rw' member of the io_kiocb union. Here the leading file
              * pointer comes from the embedded kiocb, as the note below says.
              */
 552         /* NOTE: kiocb has the file as the first member, so don't do it here */
 553         struct kiocb                    kiocb;
 554         u64                             addr;
 555         u64                             len;
 556 };
 557
 558 struct io_connect {
             /*
              * 'connect' member of the io_kiocb union; 'file' must stay first.
              * addr is a user-space pointer; addr_len is held by value.
              */
 559         struct file                     *file;
 560         struct sockaddr __user          *addr;
 561         int                             addr_len;
 562 };
 563
 564 struct io_sr_msg {
             /* 'sr_msg' member of the io_kiocb union; 'file' must stay first. */
 565         struct file                     *file;
             /*
              * One user-space pointer viewed three ways: compat msghdr, native
              * msghdr, or a plain buffer.
              */
 566         union {
 567                 struct compat_msghdr __user     *umsg_compat;
 568                 struct user_msghdr __user       *umsg;
 569                 void __user                     *buf;
 570         };
 571         int                             msg_flags;
             /* NOTE(review): presumably the buffer group id for buffer selection - confirm. */
 572         int                             bgid;
 573         size_t                          len;
 574 };
 575
 576 struct io_open {
             /* 'open' member of the io_kiocb union; 'file' must stay first. */
 577         struct file                     *file;
 578         int                             dfd;
 579         u32                             file_slot;
 580         struct filename                 *filename;
 581         struct open_how                 how;
 582         unsigned long                   nofile;
 583 };
 584
 585 struct io_rsrc_update {
             /*
              * 'rsrc_update' member of the io_kiocb union; 'file' must stay
              * first.
              * NOTE(review): arg is presumably a user address carried as u64 -
              * confirm.
              */
 586         struct file                     *file;
 587         u64                             arg;
 588         u32                             nr_args;
 589         u32                             offset;
 590 };
 591
 592 struct io_fadvise {
             /*
              * 'fadvise' member of the io_kiocb union; 'file' must stay first.
              * Same field shape as struct io_madvise, with 'offset' here where
              * that struct has 'addr'.
              */
 593         struct file                     *file;
 594         u64                             offset;
 595         u32                             len;
 596         u32                             advice;
 597 };
 598
 599 struct io_madvise {
             /*
              * 'madvise' member of the io_kiocb union; 'file' must stay first.
              * Same field shape as struct io_fadvise, with 'addr' here where
              * that struct has 'offset'.
              */
 600         struct file                     *file;
 601         u64                             addr;
 602         u32                             len;
 603         u32                             advice;
 604 };
 605
 606 struct io_epoll {
             /*
              * 'epoll' member of the io_kiocb union; 'file' must stay first.
              * The epoll_event is stored by value, not as a pointer.
              */
 607         struct file                     *file;
 608         int                             epfd;
 609         int                             op;
 610         int                             fd;
 611         struct epoll_event              event;
 612 };
 613
 614 struct io_splice {
             /*
              * 'splice' member of the io_kiocb union. file_out occupies the
              * leading file-pointer slot that every union member must have.
              */
 615         struct file                     *file_out;
 616         struct file                     *file_in;
 617         loff_t                          off_out;
 618         loff_t                          off_in;
 619         u64                             len;
 620         unsigned int                    flags;
 621 };
 622
 623 struct io_provide_buf {
             /*
              * 'pbuf' member of the io_kiocb union; 'file' must stay first.
              * NOTE(review): bgid/bid/nbufs are presumably buffer group id,
              * starting buffer id and buffer count - confirm.
              */
 624         struct file                     *file;
 625         __u64                           addr;
 626         __u32                           len;
 627         __u32                           bgid;
 628         __u16                           nbufs;
 629         __u16                           bid;
 630 };
 631
 632 struct io_statx {
             /*
              * 'statx' member of the io_kiocb union; 'file' must stay first.
              * filename and buffer are user-space pointers (__user).
              */
 633         struct file                     *file;
 634         int                             dfd;
 635         unsigned int                    mask;
 636         unsigned int                    flags;
 637         const char __user               *filename;
 638         struct statx __user             *buffer;
 639 };
 640
 641 struct io_shutdown {
             /* 'shutdown' member of the io_kiocb union; 'file' must stay first. */
 642         struct file                     *file;
 643         int                             how;
 644 };
 645
 646 struct io_rename {
             /*
              * 'rename' member of the io_kiocb union; 'file' must stay first.
              * Field layout is identical to struct io_hardlink.
              */
 647         struct file                     *file;
 648         int                             old_dfd;
 649         int                             new_dfd;
 650         struct filename                 *oldpath;
 651         struct filename                 *newpath;
 652         int                             flags;
 653 };
 654
 655 struct io_unlink {
             /* 'unlink' member of the io_kiocb union; 'file' must stay first. */
 656         struct file                     *file;
 657         int                             dfd;
 658         int                             flags;
 659         struct filename                 *filename;
 660 };
 661
 662 struct io_mkdir {
             /* 'mkdir' member of the io_kiocb union; 'file' must stay first. */
 663         struct file                     *file;
 664         int                             dfd;
 665         umode_t                         mode;
 666         struct filename                 *filename;
 667 };
 668
 669 struct io_symlink {
             /*
              * 'symlink' member of the io_kiocb union; 'file' must stay first.
              * Only new_dfd is stored; there is no old_dfd field here.
              */
 670         struct file                     *file;
 671         int                             new_dfd;
 672         struct filename                 *oldpath;
 673         struct filename                 *newpath;
 674 };
 675
 676 struct io_hardlink {
             /*
              * 'hardlink' member of the io_kiocb union; 'file' must stay first.
              * Field layout is identical to struct io_rename.
              */
 677         struct file                     *file;
 678         int                             old_dfd;
 679         int                             new_dfd;
 680         struct filename                 *oldpath;
 681         struct filename                 *newpath;
 682         int                             flags;
 683 };
 684
 685 struct io_async_connect {
             /*
              * Holds a socket address by value.
              * NOTE(review): presumably the async-data copy of the connect
              * address so it survives a deferred retry - confirm.
              */
 686         struct sockaddr_storage         address;
 687 };
 688
 689 struct io_async_msghdr {
             /*
              * Message header state with inline storage for up to UIO_FASTIOV
              * iovecs; per the comment below, free_iov is non-NULL only when a
              * separate iovec array was allocated.
              */
 690         struct iovec                    fast_iov[UIO_FASTIOV];
 691         /* points to an allocated iov, if NULL we use fast_iov instead */
 692         struct iovec                    *free_iov;
 693         struct sockaddr __user          *uaddr;
 694         struct msghdr                   msg;
 695         struct sockaddr_storage         addr;
 696 };
 697
 698 struct io_rw_state {
             /*
              * Iterator, a saved iterator state, and inline storage for up to
              * UIO_FASTIOV iovecs. Embedded in struct io_async_rw as 's'.
              */
 699         struct iov_iter                 iter;
 700         struct iov_iter_state           iter_state;
 701         struct iovec                    fast_iov[UIO_FASTIOV];
 702 };
 703
 704 struct io_async_rw {
             /*
              * Wraps an io_rw_state with extra bookkeeping.
              * NOTE(review): free_iovec is presumably a separately allocated
              * iovec array to release later, and bytes_done the progress of a
              * partially completed transfer - confirm.
              */
 705         struct io_rw_state              s;
 706         const struct iovec              *free_iovec;
 707         size_t                          bytes_done;
 708         struct wait_page_queue          wpq;
 709 };
 710
 711 enum {
             /*
              * Bit numbers for the REQ_F_* masks defined in the next enum. The
              * first six reuse the IOSQE_*_BIT values, the internal ones start
              * at 8 and count up from there.
              */
 712         REQ_F_FIXED_FILE_BIT    = IOSQE_FIXED_FILE_BIT,
 713         REQ_F_IO_DRAIN_BIT      = IOSQE_IO_DRAIN_BIT,
 714         REQ_F_LINK_BIT          = IOSQE_IO_LINK_BIT,
 715         REQ_F_HARDLINK_BIT      = IOSQE_IO_HARDLINK_BIT,
 716         REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
 717         REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
 718
 719         /* first byte is taken by user flags, shift it to not overlap */
 720         REQ_F_FAIL_BIT          = 8,
 721         REQ_F_INFLIGHT_BIT,
 722         REQ_F_CUR_POS_BIT,
 723         REQ_F_NOWAIT_BIT,
 724         REQ_F_LINK_TIMEOUT_BIT,
 725         REQ_F_NEED_CLEANUP_BIT,
 726         REQ_F_POLLED_BIT,
 727         REQ_F_BUFFER_SELECTED_BIT,
 728         REQ_F_COMPLETE_INLINE_BIT,
 729         REQ_F_REISSUE_BIT,
 730         REQ_F_CREDS_BIT,
 731         REQ_F_REFCOUNT_BIT,
 732         REQ_F_ARM_LTIMEOUT_BIT,
 733         REQ_F_ASYNC_DATA_BIT,
 734         /* keep async read/write and isreg together and in order */
 735         REQ_F_NOWAIT_READ_BIT,
 736         REQ_F_NOWAIT_WRITE_BIT,
 737         REQ_F_ISREG_BIT,
 738
 739         /* not a real bit, just to check we're not overflowing the space */
             /* With the list above this evaluates to 25 (REQ_F_ISREG_BIT is 24). */
 740         __REQ_F_LAST_BIT,
 741 };
 742
 743 enum {
             /*
              * Request flag masks, each built with BIT() from the matching
              * REQ_F_*_BIT enumerator. The entries here are not listed in bit
              * order (the last four come after REQ_F_ISREG); that is harmless
              * because every value is derived from its *_BIT constant rather
              * than from its position.
              */
 744         /* ctx owns file */
 745         REQ_F_FIXED_FILE        = BIT(REQ_F_FIXED_FILE_BIT),
 746         /* drain existing IO first */
 747         REQ_F_IO_DRAIN          = BIT(REQ_F_IO_DRAIN_BIT),
 748         /* linked sqes */
 749         REQ_F_LINK              = BIT(REQ_F_LINK_BIT),
 750         /* doesn't sever on completion < 0 */
 751         REQ_F_HARDLINK          = BIT(REQ_F_HARDLINK_BIT),
 752         /* IOSQE_ASYNC */
 753         REQ_F_FORCE_ASYNC       = BIT(REQ_F_FORCE_ASYNC_BIT),
 754         /* IOSQE_BUFFER_SELECT */
 755         REQ_F_BUFFER_SELECT     = BIT(REQ_F_BUFFER_SELECT_BIT),
 756
 757         /* fail rest of links */
 758         REQ_F_FAIL              = BIT(REQ_F_FAIL_BIT),
 759         /* on inflight list, should be cancelled and waited on exit reliably */
 760         REQ_F_INFLIGHT          = BIT(REQ_F_INFLIGHT_BIT),
 761         /* read/write uses file position */
 762         REQ_F_CUR_POS           = BIT(REQ_F_CUR_POS_BIT),
 763         /* must not punt to workers */
 764         REQ_F_NOWAIT            = BIT(REQ_F_NOWAIT_BIT),
 765         /* has or had linked timeout */
 766         REQ_F_LINK_TIMEOUT      = BIT(REQ_F_LINK_TIMEOUT_BIT),
 767         /* needs cleanup */
 768         REQ_F_NEED_CLEANUP      = BIT(REQ_F_NEED_CLEANUP_BIT),
 769         /* already went through poll handler */
 770         REQ_F_POLLED            = BIT(REQ_F_POLLED_BIT),
 771         /* buffer already selected */
 772         REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
 773         /* completion is deferred through io_comp_state */
 774         REQ_F_COMPLETE_INLINE   = BIT(REQ_F_COMPLETE_INLINE_BIT),
 775         /* caller should reissue async */
 776         REQ_F_REISSUE           = BIT(REQ_F_REISSUE_BIT),
 777         /* supports async reads */
 778         REQ_F_NOWAIT_READ       = BIT(REQ_F_NOWAIT_READ_BIT),
 779         /* supports async writes */
 780         REQ_F_NOWAIT_WRITE      = BIT(REQ_F_NOWAIT_WRITE_BIT),
 781         /* regular file */
 782         REQ_F_ISREG             = BIT(REQ_F_ISREG_BIT),
 783         /* has creds assigned */
 784         REQ_F_CREDS             = BIT(REQ_F_CREDS_BIT),
 785         /* skip refcounting if not set */
 786         REQ_F_REFCOUNT          = BIT(REQ_F_REFCOUNT_BIT),
 787         /* there is a linked timeout that has to be armed */
 788         REQ_F_ARM_LTIMEOUT      = BIT(REQ_F_ARM_LTIMEOUT_BIT),
 789         /* ->async_data allocated */
 790         REQ_F_ASYNC_DATA        = BIT(REQ_F_ASYNC_DATA_BIT),
 791 };
 792
 793 struct async_poll {
             /*
              * One embedded io_poll_iocb plus a pointer to a second one.
              * NOTE(review): double_poll is presumably set only when a second
              * wait queue has to be watched - confirm.
              */
 794         struct io_poll_iocb     poll;
 795         struct io_poll_iocb     *double_poll;
 796 };
 797
 798 typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
     /* ^ pointer-to-function type taking a request and a bool pointer; stored in io_task_work.func. */
 799
 800 struct io_task_work {
             /*
              * A callback plus the link used to queue it. The two link types
              * share storage, so only one of them is in use at any time.
              */
 801         union {
 802                 struct io_wq_work_node  node;
 803                 struct llist_node       fallback_node;
 804         };
 805         io_req_tw_func_t                func;
 806 };
 807
 808 enum {
             /* Resource type selectors: 0 for files, 1 for buffers. */
 809         IORING_RSRC_FILE                = 0,
 810         IORING_RSRC_BUFFER              = 1,
 811 };
 812
/*
 * NOTE! Each of the iocb union members has the file pointer
 * as the first entry in their struct definition. So you can
 * access the file pointer through any of the sub-structs,
 * or directly as just 'file' in this struct.
 */
struct io_kiocb {
	/* per-opcode state; all members overlay the same storage */
	union {
		struct file		*file;
		struct io_rw		rw;
		struct io_poll_iocb	poll;
		struct io_poll_update	poll_update;
		struct io_accept	accept;
		struct io_sync		sync;
		struct io_cancel	cancel;
		struct io_timeout	timeout;
		struct io_timeout_rem	timeout_rem;
		struct io_connect	connect;
		struct io_sr_msg	sr_msg;
		struct io_open		open;
		struct io_close		close;
		struct io_rsrc_update	rsrc_update;
		struct io_fadvise	fadvise;
		struct io_madvise	madvise;
		struct io_epoll		epoll;
		struct io_splice	splice;
		struct io_provide_buf	pbuf;
		struct io_statx		statx;
		struct io_shutdown	shutdown;
		struct io_rename	rename;
		struct io_unlink	unlink;
		struct io_mkdir		mkdir;
		struct io_symlink	symlink;
		struct io_hardlink	hardlink;
	};

	/* IORING_OP_* value; used as the index into io_op_defs[] */
	u8				opcode;
	/* polled IO has completed */
	u8				iopoll_completed;
	u16				buf_index;
	/* REQ_F_* bits */
	unsigned int			flags;

	u64				user_data;
	u32				result;
	u32				cflags;

	struct io_ring_ctx		*ctx;
	struct task_struct		*task;

	/* rsrc node reference held by this request, NULL if none */
	struct percpu_ref		*fixed_rsrc_refs;
	/* store used ubuf, so we can prevent reloading */
	struct io_mapped_ubuf		*imu;

	/* used by request caches, completion batching and iopoll */
	struct io_wq_work_node		comp_list;
	/* reference count, only maintained once REQ_F_REFCOUNT is set */
	atomic_t			refs;
	/* next request in the link chain, see io_for_each_link() */
	struct io_kiocb			*link;
	struct io_task_work		io_task_work;
	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
	struct hlist_node		hash_node;
	/* internal polling, see IORING_FEAT_FAST_POLL */
	struct async_poll		*apoll;
	/* opcode allocated if it needs to store data for async defer */
	void				*async_data;
	struct io_wq_work		work;
	/* custom credentials, valid IFF REQ_F_CREDS is set */
	const struct cred		*creds;
	/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
	struct io_buffer		*kbuf;
};
 883
/*
 * Associates a task with a ring context.
 * NOTE(review): ->ctx_node is presumably the linkage on ctx->tctx_list --
 * confirm against the code that adds these nodes.
 */
struct io_tctx_node {
	struct list_head	ctx_node;
	struct task_struct	*task;
	struct io_ring_ctx	*ctx;
};
 889
/*
 * Entry on ctx->defer_list: a request that is held back, plus the sequence
 * number that io_queue_deferred() hands to req_need_defer() to decide
 * whether the request may be queued yet.
 */
struct io_defer_entry {
	struct list_head	list;
	struct io_kiocb		*req;
	u32			seq;
};
 895
/*
 * Static per-opcode properties. One instance exists per opcode in
 * io_op_defs[], which is indexed by req->opcode.
 */
struct io_op_def {
	/* needs req->file assigned */
	unsigned		needs_file : 1;
	/* should block plug */
	unsigned		plug : 1;
	/* hash wq insertion if file is a regular file */
	unsigned		hash_reg_file : 1;
	/* unbound wq insertion if file is a non-regular file */
	unsigned		unbound_nonreg_file : 1;
	/* set if opcode supports polled "wait" */
	unsigned		pollin : 1;
	unsigned		pollout : 1;
	/* op supports buffer selection */
	unsigned		buffer_select : 1;
	/* do prep async if is going to be punted */
	unsigned		needs_async_setup : 1;
	/* opcode is not supported by this kernel */
	unsigned		not_supported : 1;
	/* size of async data needed, if any */
	unsigned short		async_size;
};
 917
/*
 * Per-opcode property table, indexed by IORING_OP_* (see the lookup
 * io_op_defs[req->opcode] in io_prep_async_work()). Fields that are not
 * named in an entry are zero; an empty initializer means the opcode has
 * none of the properties described by struct io_op_def.
 */
static const struct io_op_def io_op_defs[] = {
	[IORING_OP_NOP] = {},
	[IORING_OP_READV] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
		.needs_async_setup	= 1,
		.plug			= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_WRITEV] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.needs_async_setup	= 1,
		.plug			= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_FSYNC] = {
		.needs_file		= 1,
	},
	[IORING_OP_READ_FIXED] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.plug			= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_WRITE_FIXED] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.plug			= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_POLL_ADD] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
	},
	[IORING_OP_POLL_REMOVE] = {},
	[IORING_OP_SYNC_FILE_RANGE] = {
		.needs_file		= 1,
	},
	[IORING_OP_SENDMSG] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.needs_async_setup	= 1,
		.async_size		= sizeof(struct io_async_msghdr),
	},
	[IORING_OP_RECVMSG] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
		.needs_async_setup	= 1,
		.async_size		= sizeof(struct io_async_msghdr),
	},
	[IORING_OP_TIMEOUT] = {
		.async_size		= sizeof(struct io_timeout_data),
	},
	[IORING_OP_TIMEOUT_REMOVE] = {
		/* used by timeout updates' prep() */
	},
	[IORING_OP_ACCEPT] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
	},
	[IORING_OP_ASYNC_CANCEL] = {},
	[IORING_OP_LINK_TIMEOUT] = {
		.async_size		= sizeof(struct io_timeout_data),
	},
	[IORING_OP_CONNECT] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.needs_async_setup	= 1,
		.async_size		= sizeof(struct io_async_connect),
	},
	[IORING_OP_FALLOCATE] = {
		.needs_file		= 1,
	},
	[IORING_OP_OPENAT] = {},
	[IORING_OP_CLOSE] = {},
	[IORING_OP_FILES_UPDATE] = {},
	[IORING_OP_STATX] = {},
	[IORING_OP_READ] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
		.plug			= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_WRITE] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.plug			= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_FADVISE] = {
		.needs_file		= 1,
	},
	[IORING_OP_MADVISE] = {},
	[IORING_OP_SEND] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
	},
	[IORING_OP_RECV] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
	},
	[IORING_OP_OPENAT2] = {
	},
	[IORING_OP_EPOLL_CTL] = {
		.unbound_nonreg_file	= 1,
	},
	[IORING_OP_SPLICE] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
	},
	[IORING_OP_PROVIDE_BUFFERS] = {},
	[IORING_OP_REMOVE_BUFFERS] = {},
	[IORING_OP_TEE] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
	},
	[IORING_OP_SHUTDOWN] = {
		.needs_file		= 1,
	},
	[IORING_OP_RENAMEAT] = {},
	[IORING_OP_UNLINKAT] = {},
	[IORING_OP_MKDIRAT] = {},
	[IORING_OP_SYMLINKAT] = {},
	[IORING_OP_LINKAT] = {},
};
1065
/* requests with any of those set should undergo io_disarm_next() */
#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)

/*
 * Forward declarations for static helpers that are used before their
 * definitions appear in this file.
 */
static bool io_disarm_next(struct io_kiocb *req);
static void io_uring_del_tctx_node(unsigned long index);
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
					 struct task_struct *task,
					 bool cancel_all);
static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);

static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
				 s32 res, u32 cflags);
static void io_put_req(struct io_kiocb *req);
static void io_put_req_deferred(struct io_kiocb *req);
static void io_dismantle_req(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args);
static void io_clean_op(struct io_kiocb *req);
static struct file *io_file_get(struct io_ring_ctx *ctx,
				struct io_kiocb *req, int fd, bool fixed);
static void __io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);

static void io_req_task_queue(struct io_kiocb *req);
static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
static int io_req_prep_async(struct io_kiocb *req);

static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
				 unsigned int issue_flags, u32 slot_index);
static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);

static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);

/* NOTE(review): presumably the slab cache backing struct io_kiocb -- confirm */
static struct kmem_cache *req_cachep;

/*
 * Declared here so io_uring_get_socket() can compare file->f_op against
 * it; the initialized definition is not in this part of the file.
 */
static const struct file_operations io_uring_fops;
1104
1105 struct sock *io_uring_get_socket(struct file *file)
1106 {
1107 #if defined(CONFIG_UNIX)
1108         if (file->f_op == &io_uring_fops) {
1109                 struct io_ring_ctx *ctx = file->private_data;
1110
1111                 return ctx->ring_sock->sk;
1112         }
1113 #endif
1114         return NULL;
1115 }
1116 EXPORT_SYMBOL(io_uring_get_socket);
1117
1118 static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
1119 {
1120         if (!*locked) {
1121                 mutex_lock(&ctx->uring_lock);
1122                 *locked = true;
1123         }
1124 }
1125
/*
 * Walk a chain of linked requests starting at @head, following ->link
 * until it is NULL. @pos is the cursor and is evaluated several times, so
 * it must be a side-effect-free lvalue. Arguments are parenthesized so
 * that expression arguments (e.g. "*pp") expand correctly.
 */
#define io_for_each_link(pos, head) \
	for ((pos) = (head); (pos); (pos) = (pos)->link)
1128
/*
 * Shamelessly stolen from the mm implementation of page reference checking,
 * see commit f958d7b528b1 for details.
 *
 * True when the refcount is 0 or within 127 of wrapping: adding 127 to the
 * count (as unsigned) only stays <= 127 if the count was 0 or in the top
 * 127 values of the unsigned range. The argument is parenthesized so that
 * expression arguments such as "&obj" expand correctly.
 */
#define req_ref_zero_or_close_to_overflow(req)	\
	((unsigned int) atomic_read(&(req)->refs) + 127u <= 127u)
1135
1136 static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
1137 {
1138         WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
1139         return atomic_inc_not_zero(&req->refs);
1140 }
1141
1142 static inline bool req_ref_put_and_test(struct io_kiocb *req)
1143 {
1144         if (likely(!(req->flags & REQ_F_REFCOUNT)))
1145                 return true;
1146
1147         WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
1148         return atomic_dec_and_test(&req->refs);
1149 }
1150
1151 static inline void req_ref_put(struct io_kiocb *req)
1152 {
1153         WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
1154         WARN_ON_ONCE(req_ref_put_and_test(req));
1155 }
1156
1157 static inline void req_ref_get(struct io_kiocb *req)
1158 {
1159         WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
1160         WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
1161         atomic_inc(&req->refs);
1162 }
1163
1164 static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
1165 {
1166         if (!wq_list_empty(&ctx->submit_state.compl_reqs))
1167                 __io_submit_flush_completions(ctx);
1168 }
1169
1170 static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
1171 {
1172         if (!(req->flags & REQ_F_REFCOUNT)) {
1173                 req->flags |= REQ_F_REFCOUNT;
1174                 atomic_set(&req->refs, nr);
1175         }
1176 }
1177
/* Turn on refcounting for @req, starting from a single reference. */
static inline void io_req_set_refcount(struct io_kiocb *req)
{
	const int initial_refs = 1;

	__io_req_set_refcount(req, initial_refs);
}
1182
/* number of rsrc node references io_rsrc_refs_refill() takes in one go */
#define IO_RSRC_REF_BATCH	100
1184
1185 static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
1186                                           struct io_ring_ctx *ctx)
1187         __must_hold(&ctx->uring_lock)
1188 {
1189         struct percpu_ref *ref = req->fixed_rsrc_refs;
1190
1191         if (ref) {
1192                 if (ref == &ctx->rsrc_node->refs)
1193                         ctx->rsrc_cached_refs++;
1194                 else
1195                         percpu_ref_put(ref);
1196         }
1197 }
1198
1199 static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx)
1200 {
1201         if (req->fixed_rsrc_refs)
1202                 percpu_ref_put(req->fixed_rsrc_refs);
1203 }
1204
1205 static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
1206         __must_hold(&ctx->uring_lock)
1207 {
1208         if (ctx->rsrc_cached_refs) {
1209                 percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs);
1210                 ctx->rsrc_cached_refs = 0;
1211         }
1212 }
1213
1214 static void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
1215         __must_hold(&ctx->uring_lock)
1216 {
1217         ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
1218         percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
1219 }
1220
1221 static inline void io_req_set_rsrc_node(struct io_kiocb *req,
1222                                         struct io_ring_ctx *ctx)
1223 {
1224         if (!req->fixed_rsrc_refs) {
1225                 req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
1226                 ctx->rsrc_cached_refs--;
1227                 if (unlikely(ctx->rsrc_cached_refs < 0))
1228                         io_rsrc_refs_refill(ctx);
1229         }
1230 }
1231
/*
 * Resurrect @ref. If a temporary reference can still be taken, the ref is
 * resurrected while holding it and the temporary reference is then
 * dropped. Otherwise the count already hit zero, so wait on @compl for
 * ->release() to finish before resurrecting.
 */
static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
{
	if (percpu_ref_tryget(ref)) {
		percpu_ref_resurrect(ref);
		percpu_ref_put(ref);
		return;
	}

	/* already at zero, wait for ->release() */
	wait_for_completion(compl);
	percpu_ref_resurrect(ref);
}
1243
1244 static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
1245                           bool cancel_all)
1246 {
1247         struct io_kiocb *req;
1248
1249         if (task && head->task != task)
1250                 return false;
1251         if (cancel_all)
1252                 return true;
1253
1254         io_for_each_link(req, head) {
1255                 if (req->flags & REQ_F_INFLIGHT)
1256                         return true;
1257         }
1258         return false;
1259 }
1260
1261 static inline bool req_has_async_data(struct io_kiocb *req)
1262 {
1263         return req->flags & REQ_F_ASYNC_DATA;
1264 }
1265
1266 static inline void req_set_fail(struct io_kiocb *req)
1267 {
1268         req->flags |= REQ_F_FAIL;
1269 }
1270
1271 static inline void req_fail_link_node(struct io_kiocb *req, int res)
1272 {
1273         req_set_fail(req);
1274         req->result = res;
1275 }
1276
1277 static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
1278 {
1279         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
1280
1281         complete(&ctx->ref_comp);
1282 }
1283
1284 static inline bool io_is_timeout_noseq(struct io_kiocb *req)
1285 {
1286         return !req->timeout.off;
1287 }
1288
1289 static __cold void io_fallback_req_func(struct work_struct *work)
1290 {
1291         struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
1292                                                 fallback_work.work);
1293         struct llist_node *node = llist_del_all(&ctx->fallback_llist);
1294         struct io_kiocb *req, *tmp;
1295         bool locked = false;
1296
1297         percpu_ref_get(&ctx->refs);
1298         llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
1299                 req->io_task_work.func(req, &locked);
1300
1301         if (locked) {
1302                 io_submit_flush_completions(ctx);
1303                 mutex_unlock(&ctx->uring_lock);
1304         }
1305         percpu_ref_put(&ctx->refs);
1306 }
1307
1308 static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
1309 {
1310         struct io_ring_ctx *ctx;
1311         int hash_bits;
1312
1313         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1314         if (!ctx)
1315                 return NULL;
1316
1317         /*
1318          * Use 5 bits less than the max cq entries, that should give us around
1319          * 32 entries per hash list if totally full and uniformly spread.
1320          */
1321         hash_bits = ilog2(p->cq_entries);
1322         hash_bits -= 5;
1323         if (hash_bits <= 0)
1324                 hash_bits = 1;
1325         ctx->cancel_hash_bits = hash_bits;
1326         ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
1327                                         GFP_KERNEL);
1328         if (!ctx->cancel_hash)
1329                 goto err;
1330         __hash_init(ctx->cancel_hash, 1U << hash_bits);
1331
1332         ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
1333         if (!ctx->dummy_ubuf)
1334                 goto err;
1335         /* set invalid range, so io_import_fixed() fails meeting it */
1336         ctx->dummy_ubuf->ubuf = -1UL;
1337
1338         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
1339                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
1340                 goto err;
1341
1342         ctx->flags = p->flags;
1343         init_waitqueue_head(&ctx->sqo_sq_wait);
1344         INIT_LIST_HEAD(&ctx->sqd_list);
1345         INIT_LIST_HEAD(&ctx->cq_overflow_list);
1346         init_completion(&ctx->ref_comp);
1347         xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
1348         xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
1349         mutex_init(&ctx->uring_lock);
1350         init_waitqueue_head(&ctx->cq_wait);
1351         spin_lock_init(&ctx->completion_lock);
1352         spin_lock_init(&ctx->timeout_lock);
1353         INIT_WQ_LIST(&ctx->iopoll_list);
1354         INIT_LIST_HEAD(&ctx->defer_list);
1355         INIT_LIST_HEAD(&ctx->timeout_list);
1356         INIT_LIST_HEAD(&ctx->ltimeout_list);
1357         spin_lock_init(&ctx->rsrc_ref_lock);
1358         INIT_LIST_HEAD(&ctx->rsrc_ref_list);
1359         INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
1360         init_llist_head(&ctx->rsrc_put_llist);
1361         INIT_LIST_HEAD(&ctx->tctx_list);
1362         ctx->submit_state.free_list.next = NULL;
1363         INIT_WQ_LIST(&ctx->locked_free_list);
1364         INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
1365         INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
1366         return ctx;
1367 err:
1368         kfree(ctx->dummy_ubuf);
1369         kfree(ctx->cancel_hash);
1370         kfree(ctx);
1371         return NULL;
1372 }
1373
1374 static void io_account_cq_overflow(struct io_ring_ctx *ctx)
1375 {
1376         struct io_rings *r = ctx->rings;
1377
1378         WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
1379         ctx->cq_extra--;
1380 }
1381
1382 static bool req_need_defer(struct io_kiocb *req, u32 seq)
1383 {
1384         if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
1385                 struct io_ring_ctx *ctx = req->ctx;
1386
1387                 return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
1388         }
1389
1390         return false;
1391 }
1392
/*
 * FFS_* flag bits and the mask that clears them.
 * NOTE(review): presumably these are stashed in the low bits of a fixed
 * file pointer -- confirm against the fixed-file table code.
 * FFS_ISREG is defined as 0 on non-64-bit builds, so it carries no
 * information there (see io_req_ffs_set()).
 */
#define FFS_ASYNC_READ		0x1UL
#define FFS_ASYNC_WRITE		0x2UL
#ifdef CONFIG_64BIT
#define FFS_ISREG		0x4UL
#else
#define FFS_ISREG		0x0UL
#endif
/* every bit except the FFS_* flag bits */
#define FFS_MASK		~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG)
1401
1402 static inline bool io_req_ffs_set(struct io_kiocb *req)
1403 {
1404         return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE);
1405 }
1406
1407 static inline void io_req_track_inflight(struct io_kiocb *req)
1408 {
1409         if (!(req->flags & REQ_F_INFLIGHT)) {
1410                 req->flags |= REQ_F_INFLIGHT;
1411                 atomic_inc(&current->io_uring->inflight_tracked);
1412         }
1413 }
1414
1415 static inline void io_unprep_linked_timeout(struct io_kiocb *req)
1416 {
1417         req->flags &= ~REQ_F_LINK_TIMEOUT;
1418 }
1419
1420 static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
1421 {
1422         if (WARN_ON_ONCE(!req->link))
1423                 return NULL;
1424
1425         req->flags &= ~REQ_F_ARM_LTIMEOUT;
1426         req->flags |= REQ_F_LINK_TIMEOUT;
1427
1428         /* linked timeouts should have two refs once prep'ed */
1429         io_req_set_refcount(req);
1430         __io_req_set_refcount(req->link, 2);
1431         return req->link;
1432 }
1433
1434 static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
1435 {
1436         if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
1437                 return NULL;
1438         return __io_prep_linked_timeout(req);
1439 }
1440
1441 static void io_prep_async_work(struct io_kiocb *req)
1442 {
1443         const struct io_op_def *def = &io_op_defs[req->opcode];
1444         struct io_ring_ctx *ctx = req->ctx;
1445
1446         if (!(req->flags & REQ_F_CREDS)) {
1447                 req->flags |= REQ_F_CREDS;
1448                 req->creds = get_current_cred();
1449         }
1450
1451         req->work.list.next = NULL;
1452         req->work.flags = 0;
1453         if (req->flags & REQ_F_FORCE_ASYNC)
1454                 req->work.flags |= IO_WQ_WORK_CONCURRENT;
1455
1456         if (req->flags & REQ_F_ISREG) {
1457                 if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
1458                         io_wq_hash_work(&req->work, file_inode(req->file));
1459         } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
1460                 if (def->unbound_nonreg_file)
1461                         req->work.flags |= IO_WQ_WORK_UNBOUND;
1462         }
1463
1464         switch (req->opcode) {
1465         case IORING_OP_SPLICE:
1466         case IORING_OP_TEE:
1467                 if (!S_ISREG(file_inode(req->splice.file_in)->i_mode))
1468                         req->work.flags |= IO_WQ_WORK_UNBOUND;
1469                 break;
1470         }
1471 }
1472
1473 static void io_prep_async_link(struct io_kiocb *req)
1474 {
1475         struct io_kiocb *cur;
1476
1477         if (req->flags & REQ_F_LINK_TIMEOUT) {
1478                 struct io_ring_ctx *ctx = req->ctx;
1479
1480                 spin_lock(&ctx->completion_lock);
1481                 io_for_each_link(cur, req)
1482                         io_prep_async_work(cur);
1483                 spin_unlock(&ctx->completion_lock);
1484         } else {
1485                 io_for_each_link(cur, req)
1486                         io_prep_async_work(cur);
1487         }
1488 }
1489
1490 static inline void io_req_add_compl_list(struct io_kiocb *req)
1491 {
1492         struct io_submit_state *state = &req->ctx->submit_state;
1493
1494         wq_list_add_tail(&req->comp_list, &state->compl_reqs);
1495 }
1496
1497 static void io_queue_async_work(struct io_kiocb *req, bool *locked)
1498 {
1499         struct io_ring_ctx *ctx = req->ctx;
1500         struct io_kiocb *link = io_prep_linked_timeout(req);
1501         struct io_uring_task *tctx = req->task->io_uring;
1502
1503         /* must not take the lock, NULL it as a precaution */
1504         locked = NULL;
1505
1506         BUG_ON(!tctx);
1507         BUG_ON(!tctx->io_wq);
1508
1509         /* init ->work of the whole link before punting */
1510         io_prep_async_link(req);
1511
1512         /*
1513          * Not expected to happen, but if we do have a bug where this _can_
1514          * happen, catch it here and ensure the request is marked as
1515          * canceled. That will make io-wq go through the usual work cancel
1516          * procedure rather than attempt to run this request (or create a new
1517          * worker for it).
1518          */
1519         if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
1520                 req->work.flags |= IO_WQ_WORK_CANCEL;
1521
1522         trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
1523                                         &req->work, req->flags);
1524         io_wq_enqueue(tctx->io_wq, &req->work);
1525         if (link)
1526                 io_queue_linked_timeout(link);
1527 }
1528
1529 static void io_kill_timeout(struct io_kiocb *req, int status)
1530         __must_hold(&req->ctx->completion_lock)
1531         __must_hold(&req->ctx->timeout_lock)
1532 {
1533         struct io_timeout_data *io = req->async_data;
1534
1535         if (hrtimer_try_to_cancel(&io->timer) != -1) {
1536                 if (status)
1537                         req_set_fail(req);
1538                 atomic_set(&req->ctx->cq_timeouts,
1539                         atomic_read(&req->ctx->cq_timeouts) + 1);
1540                 list_del_init(&req->timeout.list);
1541                 io_cqring_fill_event(req->ctx, req->user_data, status, 0);
1542                 io_put_req_deferred(req);
1543         }
1544 }
1545
1546 static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
1547 {
1548         while (!list_empty(&ctx->defer_list)) {
1549                 struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
1550                                                 struct io_defer_entry, list);
1551
1552                 if (req_need_defer(de->req, de->seq))
1553                         break;
1554                 list_del_init(&de->list);
1555                 io_req_task_queue(de->req);
1556                 kfree(de);
1557         }
1558 }
1559
1560 static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
1561         __must_hold(&ctx->completion_lock)
1562 {
1563         u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
1564
1565         spin_lock_irq(&ctx->timeout_lock);
1566         while (!list_empty(&ctx->timeout_list)) {
1567                 u32 events_needed, events_got;
1568                 struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
1569                                                 struct io_kiocb, timeout.list);
1570
1571                 if (io_is_timeout_noseq(req))
1572                         break;
1573
1574                 /*
1575                  * Since seq can easily wrap around over time, subtract
1576                  * the last seq at which timeouts were flushed before comparing.
1577                  * Assuming not more than 2^31-1 events have happened since,
1578                  * these subtractions won't have wrapped, so we can check if
1579                  * target is in [last_seq, current_seq] by comparing the two.
1580                  */
1581                 events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
1582                 events_got = seq - ctx->cq_last_tm_flush;
1583                 if (events_got < events_needed)
1584                         break;
1585
1586                 list_del_init(&req->timeout.list);
1587                 io_kill_timeout(req, 0);
1588         }
1589         ctx->cq_last_tm_flush = seq;
1590         spin_unlock_irq(&ctx->timeout_lock);
1591 }
1592
1593 static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
1594 {
1595         if (ctx->off_timeout_used)
1596                 io_flush_timeouts(ctx);
1597         if (ctx->drain_active)
1598                 io_queue_deferred(ctx);
1599 }
1600
1601 static inline void io_commit_cqring(struct io_ring_ctx *ctx)
1602 {
1603         if (unlikely(ctx->off_timeout_used || ctx->drain_active))
1604                 __io_commit_cqring_flush(ctx);
1605         /* order cqe stores with ring update */
1606         smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
1607 }
1608
1609 static inline bool io_sqring_full(struct io_ring_ctx *ctx)
1610 {
1611         struct io_rings *r = ctx->rings;
1612
1613         return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
1614 }
1615
1616 static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
1617 {
1618         return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
1619 }
1620
1621 static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
1622 {
1623         struct io_rings *rings = ctx->rings;
1624         unsigned tail, mask = ctx->cq_entries - 1;
1625
1626         /*
1627          * writes to the cq entry need to come after reading head; the
1628          * control dependency is enough as we're using WRITE_ONCE to
1629          * fill the cq entry
1630          */
1631         if (__io_cqring_events(ctx) == ctx->cq_entries)
1632                 return NULL;
1633
1634         tail = ctx->cached_cq_tail++;
1635         return &rings->cqes[tail & mask];
1636 }
1637
1638 static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1639 {
1640         if (likely(!ctx->cq_ev_fd))
1641                 return false;
1642         if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
1643                 return false;
1644         return !ctx->eventfd_async || io_wq_current_is_worker();
1645 }
1646
1647 /*
1648  * This should only get called when at least one event has been posted.
1649  * Some applications rely on the eventfd notification count only changing
1650  * IFF a new CQE has been added to the CQ ring. There's no depedency on
1651  * 1:1 relationship between how many times this function is called (and
1652  * hence the eventfd count) and number of CQEs posted to the CQ ring.
1653  */
1654 static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1655 {
1656         /*
1657          * wake_up_all() may seem excessive, but io_wake_function() and
1658          * io_should_wake() handle the termination of the loop and only
1659          * wake as many waiters as we need to.
1660          */
1661         if (wq_has_sleeper(&ctx->cq_wait))
1662                 wake_up_all(&ctx->cq_wait);
1663         if (io_should_trigger_evfd(ctx))
1664                 eventfd_signal(ctx->cq_ev_fd, 1);
1665 }
1666
1667 static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
1668 {
1669         /* see waitqueue_active() comment */
1670         smp_mb();
1671
1672         if (ctx->flags & IORING_SETUP_SQPOLL) {
1673                 if (waitqueue_active(&ctx->cq_wait))
1674                         wake_up_all(&ctx->cq_wait);
1675         }
1676         if (io_should_trigger_evfd(ctx))
1677                 eventfd_signal(ctx->cq_ev_fd, 1);
1678 }
1679
1680 /* Returns true if there are no backlogged entries after the flush */
1681 static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
1682 {
1683         bool all_flushed, posted;
1684
1685         if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
1686                 return false;
1687
1688         posted = false;
1689         spin_lock(&ctx->completion_lock);
1690         while (!list_empty(&ctx->cq_overflow_list)) {
1691                 struct io_uring_cqe *cqe = io_get_cqe(ctx);
1692                 struct io_overflow_cqe *ocqe;
1693
1694                 if (!cqe && !force)
1695                         break;
1696                 ocqe = list_first_entry(&ctx->cq_overflow_list,
1697                                         struct io_overflow_cqe, list);
1698                 if (cqe)
1699                         memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
1700                 else
1701                         io_account_cq_overflow(ctx);
1702
1703                 posted = true;
1704                 list_del(&ocqe->list);
1705                 kfree(ocqe);
1706         }
1707
1708         all_flushed = list_empty(&ctx->cq_overflow_list);
1709         if (all_flushed) {
1710                 clear_bit(0, &ctx->check_cq_overflow);
1711                 WRITE_ONCE(ctx->rings->sq_flags,
1712                            ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
1713         }
1714
1715         if (posted)
1716                 io_commit_cqring(ctx);
1717         spin_unlock(&ctx->completion_lock);
1718         if (posted)
1719                 io_cqring_ev_posted(ctx);
1720         return all_flushed;
1721 }
1722
1723 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
1724 {
1725         bool ret = true;
1726
1727         if (test_bit(0, &ctx->check_cq_overflow)) {
1728                 /* iopoll syncs against uring_lock, not completion_lock */
1729                 if (ctx->flags & IORING_SETUP_IOPOLL)
1730                         mutex_lock(&ctx->uring_lock);
1731                 ret = __io_cqring_overflow_flush(ctx, false);
1732                 if (ctx->flags & IORING_SETUP_IOPOLL)
1733                         mutex_unlock(&ctx->uring_lock);
1734         }
1735
1736         return ret;
1737 }
1738
1739 /* must to be called somewhat shortly after putting a request */
1740 static inline void io_put_task(struct task_struct *task, int nr)
1741 {
1742         struct io_uring_task *tctx = task->io_uring;
1743
1744         if (likely(task == current)) {
1745                 tctx->cached_refs += nr;
1746         } else {
1747                 percpu_counter_sub(&tctx->inflight, nr);
1748                 if (unlikely(atomic_read(&tctx->in_idle)))
1749                         wake_up(&tctx->wait);
1750                 put_task_struct_many(task, nr);
1751         }
1752 }
1753
1754 static void io_task_refs_refill(struct io_uring_task *tctx)
1755 {
1756         unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
1757
1758         percpu_counter_add(&tctx->inflight, refill);
1759         refcount_add(refill, &current->usage);
1760         tctx->cached_refs += refill;
1761 }
1762
1763 static inline void io_get_task_refs(int nr)
1764 {
1765         struct io_uring_task *tctx = current->io_uring;
1766
1767         tctx->cached_refs -= nr;
1768         if (unlikely(tctx->cached_refs < 0))
1769                 io_task_refs_refill(tctx);
1770 }
1771
1772 static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
1773                                      s32 res, u32 cflags)
1774 {
1775         struct io_overflow_cqe *ocqe;
1776
1777         ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
1778         if (!ocqe) {
1779                 /*
1780                  * If we're in ring overflow flush mode, or in task cancel mode,
1781                  * or cannot allocate an overflow entry, then we need to drop it
1782                  * on the floor.
1783                  */
1784                 io_account_cq_overflow(ctx);
1785                 return false;
1786         }
1787         if (list_empty(&ctx->cq_overflow_list)) {
1788                 set_bit(0, &ctx->check_cq_overflow);
1789                 WRITE_ONCE(ctx->rings->sq_flags,
1790                            ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
1791
1792         }
1793         ocqe->cqe.user_data = user_data;
1794         ocqe->cqe.res = res;
1795         ocqe->cqe.flags = cflags;
1796         list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
1797         return true;
1798 }
1799
1800 static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
1801                                           s32 res, u32 cflags)
1802 {
1803         struct io_uring_cqe *cqe;
1804
1805         trace_io_uring_complete(ctx, user_data, res, cflags);
1806
1807         /*
1808          * If we can't get a cq entry, userspace overflowed the
1809          * submission (by quite a lot). Increment the overflow count in
1810          * the ring.
1811          */
1812         cqe = io_get_cqe(ctx);
1813         if (likely(cqe)) {
1814                 WRITE_ONCE(cqe->user_data, user_data);
1815                 WRITE_ONCE(cqe->res, res);
1816                 WRITE_ONCE(cqe->flags, cflags);
1817                 return true;
1818         }
1819         return io_cqring_event_overflow(ctx, user_data, res, cflags);
1820 }
1821
1822 /* not as hot to bloat with inlining */
1823 static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
1824                                           s32 res, u32 cflags)
1825 {
1826         return __io_cqring_fill_event(ctx, user_data, res, cflags);
1827 }
1828
1829 static void io_req_complete_post(struct io_kiocb *req, s32 res,
1830                                  u32 cflags)
1831 {
1832         struct io_ring_ctx *ctx = req->ctx;
1833
1834         spin_lock(&ctx->completion_lock);
1835         __io_cqring_fill_event(ctx, req->user_data, res, cflags);
1836         /*
1837          * If we're the last reference to this request, add to our locked
1838          * free_list cache.
1839          */
1840         if (req_ref_put_and_test(req)) {
1841                 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
1842                         if (req->flags & IO_DISARM_MASK)
1843                                 io_disarm_next(req);
1844                         if (req->link) {
1845                                 io_req_task_queue(req->link);
1846                                 req->link = NULL;
1847                         }
1848                 }
1849                 io_req_put_rsrc(req, ctx);
1850                 io_dismantle_req(req);
1851                 io_put_task(req->task, 1);
1852                 wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
1853                 ctx->locked_free_nr++;
1854         }
1855         io_commit_cqring(ctx);
1856         spin_unlock(&ctx->completion_lock);
1857         io_cqring_ev_posted(ctx);
1858 }
1859
1860 static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
1861                                          u32 cflags)
1862 {
1863         req->result = res;
1864         req->cflags = cflags;
1865         req->flags |= REQ_F_COMPLETE_INLINE;
1866 }
1867
1868 static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
1869                                      s32 res, u32 cflags)
1870 {
1871         if (issue_flags & IO_URING_F_COMPLETE_DEFER)
1872                 io_req_complete_state(req, res, cflags);
1873         else
1874                 io_req_complete_post(req, res, cflags);
1875 }
1876
1877 static inline void io_req_complete(struct io_kiocb *req, s32 res)
1878 {
1879         __io_req_complete(req, 0, res, 0);
1880 }
1881
1882 static void io_req_complete_failed(struct io_kiocb *req, s32 res)
1883 {
1884         req_set_fail(req);
1885         io_req_complete_post(req, res, 0);
1886 }
1887
1888 static void io_req_complete_fail_submit(struct io_kiocb *req)
1889 {
1890         /*
1891          * We don't submit, fail them all, for that replace hardlinks with
1892          * normal links. Extra REQ_F_LINK is tolerated.
1893          */
1894         req->flags &= ~REQ_F_HARDLINK;
1895         req->flags |= REQ_F_LINK;
1896         io_req_complete_failed(req, req->result);
1897 }
1898
1899 /*
1900  * Don't initialise the fields below on every allocation, but do that in
1901  * advance and keep them valid across allocations.
1902  */
1903 static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
1904 {
1905         req->ctx = ctx;
1906         req->link = NULL;
1907         req->async_data = NULL;
1908         /* not necessary, but safer to zero */
1909         req->result = 0;
1910 }
1911
1912 static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
1913                                         struct io_submit_state *state)
1914 {
1915         spin_lock(&ctx->completion_lock);
1916         wq_list_splice(&ctx->locked_free_list, &state->free_list);
1917         ctx->locked_free_nr = 0;
1918         spin_unlock(&ctx->completion_lock);
1919 }
1920
1921 /* Returns true IFF there are requests in the cache */
1922 static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
1923 {
1924         struct io_submit_state *state = &ctx->submit_state;
1925
1926         /*
1927          * If we have more than a batch's worth of requests in our IRQ side
1928          * locked cache, grab the lock and move them over to our submission
1929          * side cache.
1930          */
1931         if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
1932                 io_flush_cached_locked_reqs(ctx, state);
1933         return !!state->free_list.next;
1934 }
1935
1936 /*
1937  * A request might get retired back into the request caches even before opcode
1938  * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
1939  * Because of that, io_alloc_req() should be called only under ->uring_lock
1940  * and with extra caution to not get a request that is still worked on.
1941  */
1942 static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
1943         __must_hold(&ctx->uring_lock)
1944 {
1945         struct io_submit_state *state = &ctx->submit_state;
1946         gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
1947         void *reqs[IO_REQ_ALLOC_BATCH];
1948         struct io_kiocb *req;
1949         int ret, i;
1950
1951         if (likely(state->free_list.next || io_flush_cached_reqs(ctx)))
1952                 return true;
1953
1954         ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
1955
1956         /*
1957          * Bulk alloc is all-or-nothing. If we fail to get a batch,
1958          * retry single alloc to be on the safe side.
1959          */
1960         if (unlikely(ret <= 0)) {
1961                 reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1962                 if (!reqs[0])
1963                         return false;
1964                 ret = 1;
1965         }
1966
1967         percpu_ref_get_many(&ctx->refs, ret);
1968         for (i = 0; i < ret; i++) {
1969                 req = reqs[i];
1970
1971                 io_preinit_req(req, ctx);
1972                 wq_stack_add_head(&req->comp_list, &state->free_list);
1973         }
1974         return true;
1975 }
1976
1977 static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
1978 {
1979         if (unlikely(!ctx->submit_state.free_list.next))
1980                 return __io_alloc_req_refill(ctx);
1981         return true;
1982 }
1983
1984 static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
1985 {
1986         struct io_wq_work_node *node;
1987
1988         node = wq_stack_extract(&ctx->submit_state.free_list);
1989         return container_of(node, struct io_kiocb, comp_list);
1990 }
1991
/* fput() that tolerates a NULL @file. */
static inline void io_put_file(struct file *file)
{
	if (!file)
		return;

	fput(file);
}
1997
1998 static inline void io_dismantle_req(struct io_kiocb *req)
1999 {
2000         unsigned int flags = req->flags;
2001
2002         if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
2003                 io_clean_op(req);
2004         if (!(flags & REQ_F_FIXED_FILE))
2005                 io_put_file(req->file);
2006 }
2007
2008 static __cold void __io_free_req(struct io_kiocb *req)
2009 {
2010         struct io_ring_ctx *ctx = req->ctx;
2011
2012         io_req_put_rsrc(req, ctx);
2013         io_dismantle_req(req);
2014         io_put_task(req->task, 1);
2015
2016         spin_lock(&ctx->completion_lock);
2017         wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
2018         ctx->locked_free_nr++;
2019         spin_unlock(&ctx->completion_lock);
2020 }
2021
2022 static inline void io_remove_next_linked(struct io_kiocb *req)
2023 {
2024         struct io_kiocb *nxt = req->link;
2025
2026         req->link = nxt->link;
2027         nxt->link = NULL;
2028 }
2029
2030 static bool io_kill_linked_timeout(struct io_kiocb *req)
2031         __must_hold(&req->ctx->completion_lock)
2032         __must_hold(&req->ctx->timeout_lock)
2033 {
2034         struct io_kiocb *link = req->link;
2035
2036         if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
2037                 struct io_timeout_data *io = link->async_data;
2038
2039                 io_remove_next_linked(req);
2040                 link->timeout.head = NULL;
2041                 if (hrtimer_try_to_cancel(&io->timer) != -1) {
2042                         list_del(&link->timeout.list);
2043                         io_cqring_fill_event(link->ctx, link->user_data,
2044                                              -ECANCELED, 0);
2045                         io_put_req_deferred(link);
2046                         return true;
2047                 }
2048         }
2049         return false;
2050 }
2051
2052 static void io_fail_links(struct io_kiocb *req)
2053         __must_hold(&req->ctx->completion_lock)
2054 {
2055         struct io_kiocb *nxt, *link = req->link;
2056
2057         req->link = NULL;
2058         while (link) {
2059                 long res = -ECANCELED;
2060
2061                 if (link->flags & REQ_F_FAIL)
2062                         res = link->result;
2063
2064                 nxt = link->link;
2065                 link->link = NULL;
2066
2067                 trace_io_uring_fail_link(req, link);
2068                 io_cqring_fill_event(link->ctx, link->user_data, res, 0);
2069                 io_put_req_deferred(link);
2070                 link = nxt;
2071         }
2072 }
2073
2074 static bool io_disarm_next(struct io_kiocb *req)
2075         __must_hold(&req->ctx->completion_lock)
2076 {
2077         bool posted = false;
2078
2079         if (req->flags & REQ_F_ARM_LTIMEOUT) {
2080                 struct io_kiocb *link = req->link;
2081
2082                 req->flags &= ~REQ_F_ARM_LTIMEOUT;
2083                 if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
2084                         io_remove_next_linked(req);
2085                         io_cqring_fill_event(link->ctx, link->user_data,
2086                                              -ECANCELED, 0);
2087                         io_put_req_deferred(link);
2088                         posted = true;
2089                 }
2090         } else if (req->flags & REQ_F_LINK_TIMEOUT) {
2091                 struct io_ring_ctx *ctx = req->ctx;
2092
2093                 spin_lock_irq(&ctx->timeout_lock);
2094                 posted = io_kill_linked_timeout(req);
2095                 spin_unlock_irq(&ctx->timeout_lock);
2096         }
2097         if (unlikely((req->flags & REQ_F_FAIL) &&
2098                      !(req->flags & REQ_F_HARDLINK))) {
2099                 posted |= (req->link != NULL);
2100                 io_fail_links(req);
2101         }
2102         return posted;
2103 }
2104
2105 static void __io_req_find_next_prep(struct io_kiocb *req)
2106 {
2107         struct io_ring_ctx *ctx = req->ctx;
2108         bool posted;
2109
2110         spin_lock(&ctx->completion_lock);
2111         posted = io_disarm_next(req);
2112         if (posted)
2113                 io_commit_cqring(req->ctx);
2114         spin_unlock(&ctx->completion_lock);
2115         if (posted)
2116                 io_cqring_ev_posted(ctx);
2117 }
2118
2119 static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
2120 {
2121         struct io_kiocb *nxt;
2122
2123         if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
2124                 return NULL;
2125         /*
2126          * If LINK is set, we have dependent requests in this chain. If we
2127          * didn't fail this request, queue the first one up, moving any other
2128          * dependencies to the next request. In case of failure, fail the rest
2129          * of the chain.
2130          */
2131         if (unlikely(req->flags & IO_DISARM_MASK))
2132                 __io_req_find_next_prep(req);
2133         nxt = req->link;
2134         req->link = NULL;
2135         return nxt;
2136 }
2137
2138 static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
2139 {
2140         if (!ctx)
2141                 return;
2142         if (*locked) {
2143                 io_submit_flush_completions(ctx);
2144                 mutex_unlock(&ctx->uring_lock);
2145                 *locked = false;
2146         }
2147         percpu_ref_put(&ctx->refs);
2148 }
2149
2150 static void tctx_task_work(struct callback_head *cb)
2151 {
2152         bool locked = false;
2153         struct io_ring_ctx *ctx = NULL;
2154         struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
2155                                                   task_work);
2156
2157         while (1) {
2158                 struct io_wq_work_node *node;
2159
2160                 if (!tctx->task_list.first && locked)
2161                         io_submit_flush_completions(ctx);
2162
2163                 spin_lock_irq(&tctx->task_lock);
2164                 node = tctx->task_list.first;
2165                 INIT_WQ_LIST(&tctx->task_list);
2166                 if (!node)
2167                         tctx->task_running = false;
2168                 spin_unlock_irq(&tctx->task_lock);
2169                 if (!node)
2170                         break;
2171
2172                 do {
2173                         struct io_wq_work_node *next = node->next;
2174                         struct io_kiocb *req = container_of(node, struct io_kiocb,
2175                                                             io_task_work.node);
2176
2177                         if (req->ctx != ctx) {
2178                                 ctx_flush_and_put(ctx, &locked);
2179                                 ctx = req->ctx;
2180                                 /* if not contended, grab and improve batching */
2181                                 locked = mutex_trylock(&ctx->uring_lock);
2182                                 percpu_ref_get(&ctx->refs);
2183                         }
2184                         req->io_task_work.func(req, &locked);
2185                         node = next;
2186                 } while (node);
2187
2188                 cond_resched();
2189         }
2190
2191         ctx_flush_and_put(ctx, &locked);
2192 }
2193
2194 static void io_req_task_work_add(struct io_kiocb *req)
2195 {
2196         struct task_struct *tsk = req->task;
2197         struct io_uring_task *tctx = tsk->io_uring;
2198         enum task_work_notify_mode notify;
2199         struct io_wq_work_node *node;
2200         unsigned long flags;
2201         bool running;
2202
2203         WARN_ON_ONCE(!tctx);
2204
2205         spin_lock_irqsave(&tctx->task_lock, flags);
2206         wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
2207         running = tctx->task_running;
2208         if (!running)
2209                 tctx->task_running = true;
2210         spin_unlock_irqrestore(&tctx->task_lock, flags);
2211
2212         /* task_work already pending, we're done */
2213         if (running)
2214                 return;
2215
2216         /*
2217          * SQPOLL kernel thread doesn't need notification, just a wakeup. For
2218          * all other cases, use TWA_SIGNAL unconditionally to ensure we're
2219          * processing task_work. There's no reliable way to tell if TWA_RESUME
2220          * will do the job.
2221          */
2222         notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
2223         if (likely(!task_work_add(tsk, &tctx->task_work, notify))) {
2224                 if (notify == TWA_NONE)
2225                         wake_up_process(tsk);
2226                 return;
2227         }
2228
2229         spin_lock_irqsave(&tctx->task_lock, flags);
2230         tctx->task_running = false;
2231         node = tctx->task_list.first;
2232         INIT_WQ_LIST(&tctx->task_list);
2233         spin_unlock_irqrestore(&tctx->task_lock, flags);
2234
2235         while (node) {
2236                 req = container_of(node, struct io_kiocb, io_task_work.node);
2237                 node = node->next;
2238                 if (llist_add(&req->io_task_work.fallback_node,
2239                               &req->ctx->fallback_llist))
2240                         schedule_delayed_work(&req->ctx->fallback_work, 1);
2241         }
2242 }
2243
2244 static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
2245 {
2246         struct io_ring_ctx *ctx = req->ctx;
2247
2248         /* not needed for normal modes, but SQPOLL depends on it */
2249         io_tw_lock(ctx, locked);
2250         io_req_complete_failed(req, req->result);
2251 }
2252
2253 static void io_req_task_submit(struct io_kiocb *req, bool *locked)
2254 {
2255         struct io_ring_ctx *ctx = req->ctx;
2256
2257         io_tw_lock(ctx, locked);
2258         /* req->task == current here, checking PF_EXITING is safe */
2259         if (likely(!(req->task->flags & PF_EXITING)))
2260                 __io_queue_sqe(req);
2261         else
2262                 io_req_complete_failed(req, -EFAULT);
2263 }
2264
2265 static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
2266 {
2267         req->result = ret;
2268         req->io_task_work.func = io_req_task_cancel;
2269         io_req_task_work_add(req);
2270 }
2271
2272 static void io_req_task_queue(struct io_kiocb *req)
2273 {
2274         req->io_task_work.func = io_req_task_submit;
2275         io_req_task_work_add(req);
2276 }
2277
2278 static void io_req_task_queue_reissue(struct io_kiocb *req)
2279 {
2280         req->io_task_work.func = io_queue_async_work;
2281         io_req_task_work_add(req);
2282 }
2283
/* If @req has a next linked request, queue that one through task work. */
static inline void io_queue_next(struct io_kiocb *req)
{
	struct io_kiocb *next_req = io_req_find_next(req);

	if (!next_req)
		return;

	io_req_task_queue(next_req);
}
2291
/* Queue the next linked request of @req (if any), then free @req itself. */
static void io_free_req(struct io_kiocb *req)
{
	/* the link must be picked up before the request is recycled */
	io_queue_next(req);

	__io_free_req(req);
}
2297
2298 static void io_free_req_work(struct io_kiocb *req, bool *locked)
2299 {
2300         io_free_req(req);
2301 }
2302
2303 static void io_free_batch_list(struct io_ring_ctx *ctx,
2304                                 struct io_wq_work_node *node)
2305         __must_hold(&ctx->uring_lock)
2306 {
2307         struct task_struct *task = NULL;
2308         int task_refs = 0;
2309
2310         do {
2311                 struct io_kiocb *req = container_of(node, struct io_kiocb,
2312                                                     comp_list);
2313
2314                 if (unlikely(req->flags & REQ_F_REFCOUNT)) {
2315                         node = req->comp_list.next;
2316                         if (!req_ref_put_and_test(req))
2317                                 continue;
2318                 }
2319
2320                 io_req_put_rsrc_locked(req, ctx);
2321                 io_queue_next(req);
2322                 io_dismantle_req(req);
2323
2324                 if (req->task != task) {
2325                         if (task)
2326                                 io_put_task(task, task_refs);
2327                         task = req->task;
2328                         task_refs = 0;
2329                 }
2330                 task_refs++;
2331                 node = req->comp_list.next;
2332                 wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
2333         } while (node);
2334
2335         if (task)
2336                 io_put_task(task, task_refs);
2337 }
2338
2339 static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
2340         __must_hold(&ctx->uring_lock)
2341 {
2342         struct io_wq_work_node *node, *prev;
2343         struct io_submit_state *state = &ctx->submit_state;
2344
2345         spin_lock(&ctx->completion_lock);
2346         wq_list_for_each(node, prev, &state->compl_reqs) {
2347                 struct io_kiocb *req = container_of(node, struct io_kiocb,
2348                                                     comp_list);
2349
2350                 __io_cqring_fill_event(ctx, req->user_data, req->result,
2351                                         req->cflags);
2352         }
2353         io_commit_cqring(ctx);
2354         spin_unlock(&ctx->completion_lock);
2355         io_cqring_ev_posted(ctx);
2356
2357         io_free_batch_list(ctx, state->compl_reqs.first);
2358         INIT_WQ_LIST(&state->compl_reqs);
2359 }
2360
2361 /*
2362  * Drop reference to request, return next in chain (if there is one) if this
2363  * was the last reference to this request.
2364  */
2365 static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
2366 {
2367         struct io_kiocb *nxt = NULL;
2368
2369         if (req_ref_put_and_test(req)) {
2370                 nxt = io_req_find_next(req);
2371                 __io_free_req(req);
2372         }
2373         return nxt;
2374 }
2375
/* Drop one reference to @req and free it if that was the last one. */
static inline void io_put_req(struct io_kiocb *req)
{
	if (!req_ref_put_and_test(req))
		return;

	io_free_req(req);
}
2381
2382 static inline void io_put_req_deferred(struct io_kiocb *req)
2383 {
2384         if (req_ref_put_and_test(req)) {
2385                 req->io_task_work.func = io_free_req_work;
2386                 io_req_task_work_add(req);
2387         }
2388 }
2389
/*
 * Same count as __io_cqring_events(), but preceded by a read barrier.
 */
static unsigned io_cqring_events(struct io_ring_ctx *ctx)
{
	unsigned int nr_events;

	/* See comment at the top of this file */
	smp_rmb();
	nr_events = __io_cqring_events(ctx);
	return nr_events;
}
2396
2397 static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
2398 {
2399         struct io_rings *rings = ctx->rings;
2400
2401         /* make sure SQ entry isn't read before tail */
2402         return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
2403 }
2404
2405 static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
2406 {
2407         unsigned int cflags;
2408
2409         cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
2410         cflags |= IORING_CQE_F_BUFFER;
2411         req->flags &= ~REQ_F_BUFFER_SELECTED;
2412         kfree(kbuf);
2413         return cflags;
2414 }
2415
2416 static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
2417 {
2418         if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
2419                 return 0;
2420         return io_put_kbuf(req, req->kbuf);
2421 }
2422
2423 static inline bool io_run_task_work(void)
2424 {
2425         if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) {
2426                 __set_current_state(TASK_RUNNING);
2427                 tracehook_notify_signal();
2428                 return true;
2429         }
2430
2431         return false;
2432 }
2433
/*
 * Poll the requests queued on ctx->iopoll_list and post a CQE for each
 * request in the run of completed entries that is found.
 *
 * @ctx:           ring whose iopoll_list is walked
 * @force_nonspin: when true, BLK_POLL_ONESHOT is always added to the flags
 *                 passed to the file's ->iopoll() handler
 *
 * Returns the number of CQEs filled in, 0 if nothing was reaped, or the
 * negative value returned by ->iopoll().
 */
static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
{
	struct io_wq_work_node *pos, *start, *prev;
	unsigned int poll_flags = BLK_POLL_NOSLEEP;
	DEFINE_IO_COMP_BATCH(iob);
	int nr_events = 0;

	/*
	 * Only spin for completions if we don't have multiple devices hanging
	 * off our complete list.
	 */
	if (ctx->poll_multi_queue || force_nonspin)
		poll_flags |= BLK_POLL_ONESHOT;

	wq_list_for_each(pos, start, &ctx->iopoll_list) {
		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
		struct kiocb *kiocb = &req->rw.kiocb;
		int ret;

		/*
		 * Move completed and retryable entries to our local lists.
		 * If we find a request that requires polling, break out
		 * and complete those lists first, if we have entries there.
		 */
		if (READ_ONCE(req->iopoll_completed))
			break;

		ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
		if (unlikely(ret < 0))
			return ret;
		else if (ret)
			/* ->iopoll() reported progress: stop spinning on later entries */
			poll_flags |= BLK_POLL_ONESHOT;

		/* iopoll may have completed current req */
		if (!rq_list_empty(iob.req_list) ||
		    READ_ONCE(req->iopoll_completed))
			break;
	}

	/*
	 * Run the batched completions collected by ->iopoll(), if any. With
	 * no batch and the walk having run off the end of the list (!pos),
	 * no request was seen completed, so there is nothing to reap.
	 */
	if (!rq_list_empty(iob.req_list))
		iob.complete(&iob);
	else if (!pos)
		return 0;

	/* continue from where the first walk stopped, filling CQEs */
	prev = start;
	wq_list_for_each_resume(pos, prev) {
		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);

		/* order with io_complete_rw_iopoll(), e.g. ->result updates */
		if (!smp_load_acquire(&req->iopoll_completed))
			break;
		__io_cqring_fill_event(ctx, req->user_data, req->result,
					io_put_rw_kbuf(req));
		nr_events++;
	}

	if (unlikely(!nr_events))
		return 0;

	io_commit_cqring(ctx);
	io_cqring_ev_posted_iopoll(ctx);
	/*
	 * Cut the reaped entries out of iopoll_list and free them as a batch.
	 * A NULL start means the run began at the list head.
	 */
	pos = start ? start->next : ctx->iopoll_list.first;
	wq_list_cut(&ctx->iopoll_list, prev, start);
	io_free_batch_list(ctx, pos);
	return nr_events;
}
2500
2501 /*
2502  * We can't just wait for polled events to come to us, we have to actively
2503  * find and complete them.
2504  */
2505 static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
2506 {
2507         if (!(ctx->flags & IORING_SETUP_IOPOLL))
2508                 return;
2509
2510         mutex_lock(&ctx->uring_lock);
2511         while (!wq_list_empty(&ctx->iopoll_list)) {
2512                 /* let it sleep and repeat later if can't complete a request */
2513                 if (io_do_iopoll(ctx, true) == 0)
2514                         break;
2515                 /*
2516                  * Ensure we allow local-to-the-cpu processing to take place,
2517                  * in this case we need to ensure that we reap all events.
2518                  * Also let task_work, etc. to progress by releasing the mutex
2519                  */
2520                 if (need_resched()) {
2521                         mutex_unlock(&ctx->uring_lock);
2522                         cond_resched();
2523                         mutex_lock(&ctx->uring_lock);
2524                 }
2525         }
2526         mutex_unlock(&ctx->uring_lock);
2527 }
2528
/*
 * Reap polled completions for @ctx until io_do_iopoll() has reported at
 * least @min events, a reschedule is due, or polling fails.
 *
 * @ctx: ring to poll
 * @min: number of events to wait for; when 0, io_do_iopoll() is called
 *       with force_nonspin set
 *
 * Returns 0, or the negative error returned by io_do_iopoll().
 */
static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
{
	unsigned int nr_events = 0;
	int ret = 0;

	/*
	 * We disallow the app entering submit/complete with polling, but we
	 * still need to lock the ring to prevent racing with polled issue
	 * that got punted to a workqueue.
	 */
	mutex_lock(&ctx->uring_lock);
	/*
	 * Don't enter poll loop if we already have events pending.
	 * If we do, we can potentially be spinning for commands that
	 * already triggered a CQE (eg in error).
	 */
	if (test_bit(0, &ctx->check_cq_overflow))
		__io_cqring_overflow_flush(ctx, false);
	if (io_cqring_events(ctx))
		goto out;
	do {
		/*
		 * If a submit got punted to a workqueue, we can have the
		 * application entering polling for a command before it gets
		 * issued. That app will hold the uring_lock for the duration
		 * of the poll right here, so we need to take a breather every
		 * now and then to ensure that the issue has a chance to add
		 * the poll to the issued list. Otherwise we can spin here
		 * forever, while the workqueue is stuck trying to acquire the
		 * very same mutex.
		 */
		if (wq_list_empty(&ctx->iopoll_list)) {
			/* snapshot the CQ tail to detect CQEs posted while unlocked */
			u32 tail = ctx->cached_cq_tail;

			mutex_unlock(&ctx->uring_lock);
			io_run_task_work();
			mutex_lock(&ctx->uring_lock);

			/* some requests don't go through iopoll_list */
			if (tail != ctx->cached_cq_tail ||
			    wq_list_empty(&ctx->iopoll_list))
				break;
		}
		ret = io_do_iopoll(ctx, !min);
		if (ret < 0)
			break;
		nr_events += ret;
		/* a positive count is not an error; don't leak it to the caller */
		ret = 0;
	} while (nr_events < min && !need_resched());
out:
	mutex_unlock(&ctx->uring_lock);
	return ret;
}
2582
2583 static void kiocb_end_write(struct io_kiocb *req)
2584 {
2585         /*
2586          * Tell lockdep we inherited freeze protection from submission
2587          * thread.
2588          */
2589         if (req->flags & REQ_F_ISREG) {
2590                 struct super_block *sb = file_inode(req->file)->i_sb;
2591
2592                 __sb_writers_acquired(sb, SB_FREEZE_WRITE);
2593                 sb_end_write(sb);
2594         }
2595 }
2596
2597 #ifdef CONFIG_BLOCK
2598 static bool io_resubmit_prep(struct io_kiocb *req)
2599 {
2600         struct io_async_rw *rw = req->async_data;
2601
2602         if (!req_has_async_data(req))
2603                 return !io_req_prep_async(req);
2604         iov_iter_restore(&rw->s.iter, &rw->s.iter_state);
2605         return true;
2606 }
2607
2608 static bool io_rw_should_reissue(struct io_kiocb *req)
2609 {
2610         umode_t mode = file_inode(req->file)->i_mode;
2611         struct io_ring_ctx *ctx = req->ctx;
2612
2613         if (!S_ISBLK(mode) && !S_ISREG(mode))
2614                 return false;
2615         if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
2616             !(ctx->flags & IORING_SETUP_IOPOLL)))
2617                 return false;
2618         /*
2619          * If ref is dying, we might be running poll reap from the exit work.
2620          * Don't attempt to reissue from that path, just let it fail with
2621          * -EAGAIN.
2622          */
2623         if (percpu_ref_is_dying(&ctx->refs))
2624                 return false;
2625         /*
2626          * Play it safe and assume not safe to re-import and reissue if we're
2627          * not in the original thread group (or in task context).
2628          */
2629         if (!same_thread_group(req->task, current) || !in_task())
2630                 return false;
2631         return true;
2632 }
2633 #else
/* !CONFIG_BLOCK stub: resubmission can never be prepared, always false. */
static bool io_resubmit_prep(struct io_kiocb *req)
{
	return false;
}
/* !CONFIG_BLOCK stub: read/write requests are never reissued. */
static bool io_rw_should_reissue(struct io_kiocb *req)
{
	return false;
}
2642 #endif
2643
2644 static bool __io_complete_rw_common(struct io_kiocb *req, long res)
2645 {
2646         if (req->rw.kiocb.ki_flags & IOCB_WRITE)
2647                 kiocb_end_write(req);
2648         if (unlikely(res != req->result)) {
2649                 if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
2650                     io_rw_should_reissue(req)) {
2651                         req->flags |= REQ_F_REISSUE;
2652                         return true;
2653                 }
2654                 req_set_fail(req);
2655                 req->result = res;
2656         }
2657         return false;
2658 }
2659
2660 static void io_req_task_complete(struct io_kiocb *req, bool *locked)
2661 {
2662         unsigned int cflags = io_put_rw_kbuf(req);
2663         int res = req->result;
2664
2665         if (*locked) {
2666                 io_req_complete_state(req, res, cflags);
2667                 io_req_add_compl_list(req);
2668         } else {
2669                 io_req_complete_post(req, res, cflags);
2670         }
2671 }
2672
2673 static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
2674                              unsigned int issue_flags)
2675 {
2676         if (__io_complete_rw_common(req, res))
2677                 return;
2678         __io_req_complete(req, issue_flags, req->result, io_put_rw_kbuf(req));
2679 }
2680
2681 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
2682 {
2683         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2684
2685         if (__io_complete_rw_common(req, res))
2686                 return;
2687         req->result = res;
2688         req->io_task_work.func = io_req_task_complete;
2689         io_req_task_work_add(req);
2690 }
2691
/*
 * ->ki_complete() handler for polled (IORING_SETUP_IOPOLL) read/write.
 *
 * Records @res in req->result when it differs from the stored value, then
 * marks the request as completed so that io_do_iopoll() can reap it. A
 * request that hit -EAGAIN and may be reissued is flagged REQ_F_REISSUE
 * instead and is NOT marked completed. @res2 is unused.
 */
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
{
	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);

	if (kiocb->ki_flags & IOCB_WRITE)
		kiocb_end_write(req);
	if (unlikely(res != req->result)) {
		if (res == -EAGAIN && io_rw_should_reissue(req)) {
			req->flags |= REQ_F_REISSUE;
			return;
		}
		req->result = res;
	}

	/*
	 * Pairs with the smp_load_acquire() of ->iopoll_completed in
	 * io_do_iopoll(): the ->result store above must be visible before
	 * the request is seen as completed.
	 */
	smp_store_release(&req->iopoll_completed, 1);
}
2709
2710 /*
2711  * After the iocb has been issued, it's safe to be found on the poll list.
2712  * Adding the kiocb to the list AFTER submission ensures that we don't
2713  * find it from a io_do_iopoll() thread before the issuer is done
2714  * accessing the kiocb cookie.
2715  */
2716 static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
2717 {
2718         struct io_ring_ctx *ctx = req->ctx;
2719         const bool need_lock = !(issue_flags & IO_URING_F_NONBLOCK);
2720
2721         /* workqueue context doesn't hold uring_lock, grab it now */
2722         if (unlikely(need_lock))
2723                 mutex_lock(&ctx->uring_lock);
2724
2725         /*
2726          * Track whether we have multiple files in our lists. This will impact
2727          * how we do polling eventually, not spinning if we're on potentially
2728          * different devices.
2729          */
2730         if (wq_list_empty(&ctx->iopoll_list)) {
2731                 ctx->poll_multi_queue = false;
2732         } else if (!ctx->poll_multi_queue) {
2733                 struct io_kiocb *list_req;
2734
2735                 list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
2736                                         comp_list);
2737                 if (list_req->file != req->file)
2738                         ctx->poll_multi_queue = true;
2739         }
2740
2741         /*
2742          * For fast devices, IO may have already completed. If it has, add
2743          * it to the front so we find it first.
2744          */
2745         if (READ_ONCE(req->iopoll_completed))
2746                 wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
2747         else
2748                 wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
2749
2750         if (unlikely(need_lock)) {
2751                 /*
2752                  * If IORING_SETUP_SQPOLL is enabled, sqes are either handle
2753                  * in sq thread task context or in io worker task context. If
2754                  * current task context is sq thread, we don't need to check
2755                  * whether should wake up sq thread.
2756                  */
2757                 if ((ctx->flags & IORING_SETUP_SQPOLL) &&
2758                     wq_has_sleeper(&ctx->sq_data->wait))
2759                         wake_up(&ctx->sq_data->wait);
2760
2761                 mutex_unlock(&ctx->uring_lock);
2762         }
2763 }
2764
2765 static bool io_bdev_nowait(struct block_device *bdev)
2766 {
2767         return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
2768 }
2769
2770 /*
2771  * If we tracked the file through the SCM inflight mechanism, we could support
2772  * any file. For now, just ensure that anything potentially problematic is done
2773  * inline.
2774  */
2775 static bool __io_file_supports_nowait(struct file *file, int rw)
2776 {
2777         umode_t mode = file_inode(file)->i_mode;
2778
2779         if (S_ISBLK(mode)) {
2780                 if (IS_ENABLED(CONFIG_BLOCK) &&
2781                     io_bdev_nowait(I_BDEV(file->f_mapping->host)))
2782                         return true;
2783                 return false;
2784         }
2785         if (S_ISSOCK(mode))
2786                 return true;
2787         if (S_ISREG(mode)) {
2788                 if (IS_ENABLED(CONFIG_BLOCK) &&
2789                     io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
2790                     file->f_op != &io_uring_fops)
2791                         return true;
2792                 return false;
2793         }
2794
2795         /* any ->read/write should understand O_NONBLOCK */
2796         if (file->f_flags & O_NONBLOCK)
2797                 return true;
2798
2799         if (!(file->f_mode & FMODE_NOWAIT))
2800                 return false;
2801
2802         if (rw == READ)
2803                 return file->f_op->read_iter != NULL;
2804
2805         return file->f_op->write_iter != NULL;
2806 }
2807
2808 static bool io_file_supports_nowait(struct io_kiocb *req, int rw)
2809 {
2810         if (rw == READ && (req->flags & REQ_F_NOWAIT_READ))
2811                 return true;
2812         else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE))
2813                 return true;
2814
2815         return __io_file_supports_nowait(req->file, rw);
2816 }
2817
/*
 * Common prep for read/write requests: initialise req->rw.kiocb and the
 * buffer description (addr, len, buf_index) from @sqe.
 *
 * @req: request being prepared
 * @sqe: submission queue entry; fields are read with READ_ONCE()
 * @rw:  READ or WRITE, passed on to io_file_supports_nowait()
 *
 * Returns 0 on success, or a negative error: whatever
 * kiocb_set_rw_flags() or ioprio_check_cap() report, -EOPNOTSUPP on an
 * IOPOLL ring if the kiocb isn't IOCB_DIRECT or the file has no ->iopoll,
 * or -EINVAL if IOCB_HIPRI is requested on a non-IOPOLL ring.
 */
static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
		      int rw)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct kiocb *kiocb = &req->rw.kiocb;
	struct file *file = req->file;
	unsigned ioprio;
	int ret;

	if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode))
		req->flags |= REQ_F_ISREG;

	kiocb->ki_pos = READ_ONCE(sqe->off);
	/* offset -1 on a non-stream file: use (and later update) file->f_pos */
	if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
		req->flags |= REQ_F_CUR_POS;
		kiocb->ki_pos = file->f_pos;
	}
	kiocb->ki_hint = ki_hint_validate(file_write_hint(file));
	kiocb->ki_flags = iocb_flags(file);
	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
	if (unlikely(ret))
		return ret;

	/*
	 * If the file is marked O_NONBLOCK, still allow retry for it if it
	 * supports async. Otherwise it's impossible to use O_NONBLOCK files
	 * reliably. If not, or if IOCB_NOWAIT is set, don't retry.
	 */
	if ((kiocb->ki_flags & IOCB_NOWAIT) ||
	    ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req, rw)))
		req->flags |= REQ_F_NOWAIT;

	/* an ioprio of 0 in the sqe means "inherit from the current task" */
	ioprio = READ_ONCE(sqe->ioprio);
	if (ioprio) {
		ret = ioprio_check_cap(ioprio);
		if (ret)
			return ret;

		kiocb->ki_ioprio = ioprio;
	} else
		kiocb->ki_ioprio = get_current_ioprio();

	if (ctx->flags & IORING_SETUP_IOPOLL) {
		/* polled IO needs O_DIRECT and a file that implements ->iopoll */
		if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
			return -EOPNOTSUPP;

		kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
		kiocb->ki_complete = io_complete_rw_iopoll;
		req->iopoll_completed = 0;
	} else {
		if (kiocb->ki_flags & IOCB_HIPRI)
			return -EINVAL;
		kiocb->ki_complete = io_complete_rw;
	}

	/* the fixed buffer, if any, is looked up at import time */
	req->imu = NULL;
	req->rw.addr = READ_ONCE(sqe->addr);
	req->rw.len = READ_ONCE(sqe->len);
	req->buf_index = READ_ONCE(sqe->buf_index);
	return 0;
}
2879
2880 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
2881 {
2882         switch (ret) {
2883         case -EIOCBQUEUED:
2884                 break;
2885         case -ERESTARTSYS:
2886         case -ERESTARTNOINTR:
2887         case -ERESTARTNOHAND:
2888         case -ERESTART_RESTARTBLOCK:
2889                 /*
2890                  * We can't just restart the syscall, since previously
2891                  * submitted sqes may already be in progress. Just fail this
2892                  * IO with EINTR.
2893                  */
2894                 ret = -EINTR;
2895                 fallthrough;
2896         default:
2897                 kiocb->ki_complete(kiocb, ret, 0);
2898         }
2899 }
2900
/*
 * Finish the read/write request embedding @kiocb with result @ret.
 *
 * @kiocb:       kiocb embedded in the io_kiocb (req->rw.kiocb)
 * @ret:         result of the IO attempt
 * @issue_flags: IO_URING_F_* flags of the issue; without
 *               IO_URING_F_NONBLOCK, uring_lock is taken around the
 *               failure completion below
 *
 * Bytes transferred by earlier attempts (io->bytes_done) are folded into
 * the result. If completion flagged the request REQ_F_REISSUE, it is
 * either queued for reissue or failed with @ret.
 */
static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
		       unsigned int issue_flags)
{
	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
	struct io_async_rw *io = req->async_data;

	/* add previously done IO, if any */
	if (req_has_async_data(req) && io->bytes_done > 0) {
		/* an error after partial progress reports the partial count */
		if (ret < 0)
			ret = io->bytes_done;
		else
			ret += io->bytes_done;
	}

	/* request used the file position: write the new position back */
	if (req->flags & REQ_F_CUR_POS)
		req->file->f_pos = kiocb->ki_pos;
	/* non-polled success completes inline; all else goes via ->ki_complete() */
	if (ret >= 0 && (kiocb->ki_complete == io_complete_rw))
		__io_complete_rw(req, ret, 0, issue_flags);
	else
		io_rw_done(kiocb, ret);

	if (req->flags & REQ_F_REISSUE) {
		req->flags &= ~REQ_F_REISSUE;
		if (io_resubmit_prep(req)) {
			io_req_task_queue_reissue(req);
		} else {
			/* could not prepare a reissue: fail the request with ret */
			unsigned int cflags = io_put_rw_kbuf(req);
			struct io_ring_ctx *ctx = req->ctx;

			req_set_fail(req);
			if (!(issue_flags & IO_URING_F_NONBLOCK)) {
				mutex_lock(&ctx->uring_lock);
				__io_req_complete(req, issue_flags, ret, cflags);
				mutex_unlock(&ctx->uring_lock);
			} else {
				__io_req_complete(req, issue_flags, ret, cflags);
			}
		}
	}
}
2941
/*
 * Set up @iter as a bvec iterator covering the range
 * [req->rw.addr, req->rw.addr + req->rw.len) of the registered buffer @imu.
 *
 * @req:  request supplying rw.addr and rw.len
 * @rw:   direction passed to iov_iter_bvec()
 * @iter: iterator to initialise
 * @imu:  mapped user buffer the range must lie within
 *
 * Returns 0 on success, -EFAULT if addr + len overflows or the range is
 * not fully inside [imu->ubuf, imu->ubuf_end).
 */
static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
			     struct io_mapped_ubuf *imu)
{
	size_t len = req->rw.len;
	u64 buf_end, buf_addr = req->rw.addr;
	size_t offset;

	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
		return -EFAULT;

	/*
	 * May not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or the whole first
		 * bvec), just use iov_iter_advance(). This makes it easier
		 * since we can just skip the first segment, which may not
		 * be PAGE_SIZE aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		if (offset <= bvec->bv_len) {
			iov_iter_advance(iter, offset);
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			/* first bvec plus the whole pages covered by the rest */
			seg_skip = 1 + (offset >> PAGE_SHIFT);

			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
			/* drop everything before buf_addr from the byte count */
			iter->count -= bvec->bv_len + offset;
			/* remaining offset within the bvec we landed on */
			iter->iov_offset = offset & ~PAGE_MASK;
		}
	}

	return 0;
}
2999
3000 static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
3001 {
3002         struct io_mapped_ubuf *imu = req->imu;
3003         u16 index, buf_index = req->buf_index;
3004
3005         if (likely(!imu)) {
3006                 struct io_ring_ctx *ctx = req->ctx;
3007
3008                 if (unlikely(buf_index >= ctx->nr_user_bufs))
3009                         return -EFAULT;
3010                 io_req_set_rsrc_node(req, ctx);
3011                 index = array_index_nospec(buf_index, ctx->nr_user_bufs);
3012                 imu = READ_ONCE(ctx->user_bufs[index]);
3013                 req->imu = imu;
3014         }
3015         return __io_import_fixed(req, rw, iter, imu);
3016 }
3017
3018 static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
3019 {
3020         if (needs_lock)
3021                 mutex_unlock(&ctx->uring_lock);
3022 }
3023
3024 static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
3025 {
3026         /*
3027          * "Normal" inline submissions always hold the uring_lock, since we
3028          * grab it from the system call. Same is true for the SQPOLL offload.
3029          * The only exception is when we've detached the request and issue it
3030          * from an async worker thread, grab the lock for that case.
3031          */
3032         if (needs_lock)
3033                 mutex_lock(&ctx->uring_lock);
3034 }
3035
3036 static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
3037                                           int bgid, unsigned int issue_flags)
3038 {
3039         struct io_buffer *kbuf = req->kbuf;
3040         struct io_buffer *head;
3041         bool needs_lock = !(issue_flags & IO_URING_F_NONBLOCK);
3042
3043         if (req->flags & REQ_F_BUFFER_SELECTED)
3044                 return kbuf;
3045
3046         io_ring_submit_lock(req->ctx, needs_lock);
3047
3048         lockdep_assert_held(&req->ctx->uring_lock);
3049
3050         head = xa_load(&req->ctx->io_buffers, bgid);
3051         if (head) {
3052                 if (!list_empty(&head->list)) {
3053                         kbuf = list_last_entry(&head->list, struct io_buffer,
3054                                                         list);
3055                         list_del(&kbuf->list);
3056                 } else {
3057                         kbuf = head;
3058                         xa_erase(&req->ctx->io_buffers, bgid);
3059                 }
3060                 if (*len > kbuf->len)
3061                         *len = kbuf->len;
3062                 req->flags |= REQ_F_BUFFER_SELECTED;
3063                 req->kbuf = kbuf;
3064         } else {
3065                 kbuf = ERR_PTR(-ENOBUFS);
3066         }
3067
3068         io_ring_submit_unlock(req->ctx, needs_lock);
3069         return kbuf;
3070 }
3071
3072 static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
3073                                         unsigned int issue_flags)
3074 {
3075         struct io_buffer *kbuf;
3076         u16 bgid;
3077
3078         bgid = req->buf_index;
3079         kbuf = io_buffer_select(req, len, bgid, issue_flags);
3080         if (IS_ERR(kbuf))
3081                 return kbuf;
3082         return u64_to_user_ptr(kbuf->addr);
3083 }
3084
3085 #ifdef CONFIG_COMPAT
3086 static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
3087                                 unsigned int issue_flags)
3088 {
3089         struct compat_iovec __user *uiov;
3090         compat_ssize_t clen;
3091         void __user *buf;
3092         ssize_t len;
3093
3094         uiov = u64_to_user_ptr(req->rw.addr);
3095         if (!access_ok(uiov, sizeof(*uiov)))
3096                 return -EFAULT;
3097         if (__get_user(clen, &uiov->iov_len))
3098                 return -EFAULT;
3099         if (clen < 0)
3100                 return -EINVAL;
3101
3102         len = clen;
3103         buf = io_rw_buffer_select(req, &len, issue_flags);
3104         if (IS_ERR(buf))
3105                 return PTR_ERR(buf);
3106         iov[0].iov_base = buf;
3107         iov[0].iov_len = (compat_size_t) len;
3108         return 0;
3109 }
3110 #endif
3111
3112 static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
3113                                       unsigned int issue_flags)
3114 {
3115         struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
3116         void __user *buf;
3117         ssize_t len;
3118
3119         if (copy_from_user(iov, uiov, sizeof(*uiov)))
3120                 return -EFAULT;
3121
3122         len = iov[0].iov_len;
3123         if (len < 0)
3124                 return -EINVAL;
3125         buf = io_rw_buffer_select(req, &len, issue_flags);
3126         if (IS_ERR(buf))
3127                 return PTR_ERR(buf);
3128         iov[0].iov_base = buf;
3129         iov[0].iov_len = len;
3130         return 0;
3131 }
3132
3133 static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
3134                                     unsigned int issue_flags)
3135 {
3136         if (req->flags & REQ_F_BUFFER_SELECTED) {
3137                 struct io_buffer *kbuf = req->kbuf;
3138
3139                 iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
3140                 iov[0].iov_len = kbuf->len;
3141                 return 0;
3142         }
3143         if (req->rw.len != 1)
3144                 return -EINVAL;
3145
3146 #ifdef CONFIG_COMPAT
3147         if (req->ctx->compat)
3148                 return io_compat_import(req, iov, issue_flags);
3149 #endif
3150
3151         return __io_iov_buffer_select(req, iov, issue_flags);
3152 }
3153
3154 static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
3155                                        struct io_rw_state *s,
3156                                        unsigned int issue_flags)
3157 {
3158         struct iov_iter *iter = &s->iter;
3159         u8 opcode = req->opcode;
3160         struct iovec *iovec;
3161         void __user *buf;
3162         size_t sqe_len;
3163         ssize_t ret;
3164
3165         BUILD_BUG_ON(ERR_PTR(0) != NULL);
3166
3167         if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED)
3168                 return ERR_PTR(io_import_fixed(req, rw, iter));
3169
3170         /* buffer index only valid with fixed read/write, or buffer select  */
3171         if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)))
3172                 return ERR_PTR(-EINVAL);
3173
3174         buf = u64_to_user_ptr(req->rw.addr);
3175         sqe_len = req->rw.len;
3176
3177         if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
3178                 if (req->flags & REQ_F_BUFFER_SELECT) {
3179                         buf = io_rw_buffer_select(req, &sqe_len, issue_flags);
3180                         if (IS_ERR(buf))
3181                                 return ERR_PTR(PTR_ERR(buf));
3182                         req->rw.len = sqe_len;
3183                 }
3184
3185                 ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter);
3186                 return ERR_PTR(ret);
3187         }
3188
3189         iovec = s->fast_iov;
3190         if (req->flags & REQ_F_BUFFER_SELECT) {
3191                 ret = io_iov_buffer_select(req, iovec, issue_flags);
3192                 if (!ret)
3193                         iov_iter_init(iter, rw, iovec, 1, iovec->iov_len);
3194                 return ERR_PTR(ret);
3195         }
3196
3197         ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
3198                               req->ctx->compat);
3199         if (unlikely(ret < 0))
3200                 return ERR_PTR(ret);
3201         return iovec;
3202 }
3203
3204 static inline int io_import_iovec(int rw, struct io_kiocb *req,
3205                                   struct iovec **iovec, struct io_rw_state *s,
3206                                   unsigned int issue_flags)
3207 {
3208         *iovec = __io_import_iovec(rw, req, s, issue_flags);
3209         if (unlikely(IS_ERR(*iovec)))
3210                 return PTR_ERR(*iovec);
3211
3212         iov_iter_save_state(&s->iter, &s->iter_state);
3213         return 0;
3214 }
3215
3216 static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
3217 {
3218         return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
3219 }
3220
/*
 * For files that don't have ->read_iter() and ->write_iter(), handle them
 * by looping over ->read() or ->write() manually.
 *
 * @rw:   READ selects ->read(), anything else ->write()
 * @req:  request being serviced
 * @iter: iterator describing the data still to transfer
 *
 * Returns the number of bytes transferred. If the very first call fails,
 * its negative error is returned instead. Returns -EOPNOTSUPP for
 * IOCB_HIPRI and -EAGAIN for IOCB_NOWAIT kiocbs without doing any IO.
 */
static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
{
	struct kiocb *kiocb = &req->rw.kiocb;
	struct file *file = req->file;
	ssize_t ret = 0;

	/*
	 * Don't support polled IO through this interface, and we can't
	 * support non-blocking either. For the latter, this just causes
	 * the kiocb to be handled from an async context.
	 */
	if (kiocb->ki_flags & IOCB_HIPRI)
		return -EOPNOTSUPP;
	if (kiocb->ki_flags & IOCB_NOWAIT)
		return -EAGAIN;

	while (iov_iter_count(iter)) {
		struct iovec iovec;
		ssize_t nr;

		/*
		 * For a bvec iterator the user range is taken straight from
		 * req->rw.addr/len rather than from the iterator.
		 */
		if (!iov_iter_is_bvec(iter)) {
			iovec = iov_iter_iovec(iter);
		} else {
			iovec.iov_base = u64_to_user_ptr(req->rw.addr);
			iovec.iov_len = req->rw.len;
		}

		if (rw == READ) {
			nr = file->f_op->read(file, iovec.iov_base,
					      iovec.iov_len, io_kiocb_ppos(kiocb));
		} else {
			nr = file->f_op->write(file, iovec.iov_base,
					       iovec.iov_len, io_kiocb_ppos(kiocb));
		}

		if (nr < 0) {
			/* only report the error if nothing was transferred yet */
			if (!ret)
				ret = nr;
			break;
		}
		/* account for progress; the bvec case tracks it in req->rw */
		if (!iov_iter_is_bvec(iter)) {
			iov_iter_advance(iter, nr);
		} else {
			req->rw.len -= nr;
			req->rw.addr += nr;
		}
		ret += nr;
		/* short transfer: stop instead of moving on to the next segment */
		if (nr != iovec.iov_len)
			break;
	}

	return ret;
}
3278
/*
 * Copy the iterator state of an in-progress read/write into the request's
 * async data (struct io_async_rw), so it no longer refers to the caller's
 * fast_iov array.
 *
 * @req:      request; req->async_data must already point to an io_async_rw
 * @iovec:    iovec array to record in rw->free_iovec, or NULL when the
 *            iovecs live in @fast_iov
 * @fast_iov: the inline iovec array @iter may point into
 * @iter:     iterator to copy
 */
static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
			  const struct iovec *fast_iov, struct iov_iter *iter)
{
	struct io_async_rw *rw = req->async_data;

	memcpy(&rw->s.iter, iter, sizeof(*iter));
	rw->free_iovec = iovec;
	rw->bytes_done = 0;
	/* can only be fixed buffers, no need to do anything */
	if (iov_iter_is_bvec(iter))
		return;
	if (!iovec) {
		unsigned iov_off = 0;

		/*
		 * Re-point the copied iterator at the async data's own
		 * fast_iov, keeping the same index into the array that
		 * @iter had into @fast_iov.
		 */
		rw->s.iter.iov = rw->s.fast_iov;
		if (iter->iov != fast_iov) {
			iov_off = iter->iov - fast_iov;
			rw->s.iter.iov += iov_off;
		}
		/* copy the remaining segments unless both arrays are the same */
		if (rw->s.fast_iov != fast_iov)
			memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off,
			       sizeof(struct iovec) * iter->nr_segs);
	} else {
		/* NOTE(review): presumably so cleanup frees rw->free_iovec - confirm */
		req->flags |= REQ_F_NEED_CLEANUP;
	}
}
3305
3306 static inline bool io_alloc_async_data(struct io_kiocb *req)
3307 {
3308         WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
3309         req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
3310         if (req->async_data) {
3311                 req->flags |= REQ_F_ASYNC_DATA;
3312                 return false;
3313         }
3314         return true;
3315 }
3316
3317 static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
3318                              struct io_rw_state *s, bool force)
3319 {
3320         if (!force && !io_op_defs[req->opcode].needs_async_setup)
3321                 return 0;
3322         if (!req_has_async_data(req)) {
3323                 struct io_async_rw *iorw;
3324
3325                 if (io_alloc_async_data(req)) {
3326                         kfree(iovec);
3327                         return -ENOMEM;
3328                 }
3329
3330                 io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
3331                 iorw = req->async_data;
3332                 /* we've copied and mapped the iter, ensure state is saved */
3333                 iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
3334         }
3335         return 0;
3336 }
3337
3338 static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
3339 {
3340         struct io_async_rw *iorw = req->async_data;
3341         struct iovec *iov;
3342         int ret;
3343
3344         /* submission path, ->uring_lock should already be taken */
3345         ret = io_import_iovec(rw, req, &iov, &iorw->s, IO_URING_F_NONBLOCK);
3346         if (unlikely(ret < 0))
3347                 return ret;
3348
3349         iorw->bytes_done = 0;
3350         iorw->free_iovec = iov;
3351         if (iov)
3352                 req->flags |= REQ_F_NEED_CLEANUP;
3353         return 0;
3354 }
3355
3356 static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3357 {
3358         if (unlikely(!(req->file->f_mode & FMODE_READ)))
3359                 return -EBADF;
3360         return io_prep_rw(req, sqe, READ);
3361 }
3362
3363 /*
3364  * This is our waitqueue callback handler, registered through lock_page_async()
3365  * when we initially tried to do the IO with the iocb armed our waitqueue.
3366  * This gets called when the page is unlocked, and we generally expect that to
3367  * happen when the page IO is completed and the page is now uptodate. This will
3368  * queue a task_work based retry of the operation, attempting to copy the data
3369  * again. If the latter fails because the page was NOT uptodate, then we will
3370  * do a thread based blocking retry of the operation. That's the unexpected
3371  * slow path.
3372  */
3373 static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
3374                              int sync, void *arg)
3375 {
3376         struct wait_page_queue *wpq;
3377         struct io_kiocb *req = wait->private;
3378         struct wait_page_key *key = arg;
3379
3380         wpq = container_of(wait, struct wait_page_queue, wait);
3381
3382         if (!wake_page_match(wpq, key))
3383                 return 0;
3384
3385         req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
3386         list_del_init(&wait->entry);
3387         io_req_task_queue(req);
3388         return 1;
3389 }
3390
3391 /*
3392  * This controls whether a given IO request should be armed for async page
3393  * based retry. If we return false here, the request is handed to the async
3394  * worker threads for retry. If we're doing buffered reads on a regular file,
3395  * we prepare a private wait_page_queue entry and retry the operation. This
3396  * will either succeed because the page is now uptodate and unlocked, or it
3397  * will register a callback when the page is unlocked at IO completion. Through
3398  * that callback, io_uring uses task_work to setup a retry of the operation.
3399  * That retry will attempt the buffered read again. The retry will generally
3400  * succeed, or in rare cases where it fails, we then fall back to using the
3401  * async worker threads for a blocking retry.
3402  */
3403 static bool io_rw_should_retry(struct io_kiocb *req)
3404 {
3405         struct io_async_rw *rw = req->async_data;
3406         struct wait_page_queue *wait = &rw->wpq;
3407         struct kiocb *kiocb = &req->rw.kiocb;
3408
3409         /* never retry for NOWAIT, we just complete with -EAGAIN */
3410         if (req->flags & REQ_F_NOWAIT)
3411                 return false;
3412
3413         /* Only for buffered IO */
3414         if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
3415                 return false;
3416
3417         /*
3418          * just use poll if we can, and don't attempt if the fs doesn't
3419          * support callback based unlocks
3420          */
3421         if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
3422                 return false;
3423
3424         wait->wait.func = io_async_buf_func;
3425         wait->wait.private = req;
3426         wait->wait.flags = 0;
3427         INIT_LIST_HEAD(&wait->wait.entry);
3428         kiocb->ki_flags |= IOCB_WAITQ;
3429         kiocb->ki_flags &= ~IOCB_NOWAIT;
3430         kiocb->ki_waitq = wait;
3431         return true;
3432 }
3433
3434 static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
3435 {
3436         if (likely(req->file->f_op->read_iter))
3437                 return call_read_iter(req->file, &req->rw.kiocb, iter);
3438         else if (req->file->f_op->read)
3439                 return loop_rw_iter(READ, req, iter);
3440         else
3441                 return -EINVAL;
3442 }
3443
3444 static bool need_read_all(struct io_kiocb *req)
3445 {
3446         return req->flags & REQ_F_ISREG ||
3447                 S_ISBLK(file_inode(req->file)->i_mode);
3448 }
3449
3450 static int io_read(struct io_kiocb *req, unsigned int issue_flags)
3451 {
3452         struct io_rw_state __s, *s = &__s;
3453         struct iovec *iovec;
3454         struct kiocb *kiocb = &req->rw.kiocb;
3455         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3456         struct io_async_rw *rw;
3457         ssize_t ret, ret2;
3458
3459         if (!req_has_async_data(req)) {
3460                 ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
3461                 if (unlikely(ret < 0))
3462                         return ret;
3463         } else {
3464                 rw = req->async_data;
3465                 s = &rw->s;
3466                 /*
3467                  * We come here from an earlier attempt, restore our state to
3468                  * match in case it doesn't. It's cheap enough that we don't
3469                  * need to make this conditional.
3470                  */
3471                 iov_iter_restore(&s->iter, &s->iter_state);
3472                 iovec = NULL;
3473         }
3474         req->result = iov_iter_count(&s->iter);
3475
3476         if (force_nonblock) {
3477                 /* If the file doesn't support async, just async punt */
3478                 if (unlikely(!io_file_supports_nowait(req, READ))) {
3479                         ret = io_setup_async_rw(req, iovec, s, true);
3480                         return ret ?: -EAGAIN;
3481                 }
3482                 kiocb->ki_flags |= IOCB_NOWAIT;
3483         } else {
3484                 /* Ensure we clear previously set non-block flag */
3485                 kiocb->ki_flags &= ~IOCB_NOWAIT;
3486         }
3487
3488         ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result);
3489         if (unlikely(ret)) {
3490                 kfree(iovec);
3491                 return ret;
3492         }
3493
3494         ret = io_iter_do_read(req, &s->iter);
3495
3496         if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
3497                 req->flags &= ~REQ_F_REISSUE;
3498                 /* IOPOLL retry should happen for io-wq threads */
3499                 if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
3500                         goto done;
3501                 /* no retry on NONBLOCK nor RWF_NOWAIT */
3502                 if (req->flags & REQ_F_NOWAIT)
3503                         goto done;
3504                 ret = 0;
3505         } else if (ret == -EIOCBQUEUED) {
3506                 goto out_free;
3507         } else if (ret == req->result || ret <= 0 || !force_nonblock ||
3508                    (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
3509                 /* read all, failed, already did sync or don't want to retry */
3510                 goto done;
3511         }
3512
3513         /*
3514          * Don't depend on the iter state matching what was consumed, or being
3515          * untouched in case of error. Restore it and we'll advance it
3516          * manually if we need to.
3517          */
3518         iov_iter_restore(&s->iter, &s->iter_state);
3519
3520         ret2 = io_setup_async_rw(req, iovec, s, true);
3521         if (ret2)
3522                 return ret2;
3523
3524         iovec = NULL;
3525         rw = req->async_data;
3526         s = &rw->s;
3527         /*
3528          * Now use our persistent iterator and state, if we aren't already.
3529          * We've restored and mapped the iter to match.
3530          */
3531
3532         do {
3533                 /*
3534                  * We end up here because of a partial read, either from
3535                  * above or inside this loop. Advance the iter by the bytes
3536                  * that were consumed.
3537                  */
3538                 iov_iter_advance(&s->iter, ret);
3539                 if (!iov_iter_count(&s->iter))
3540                         break;
3541                 rw->bytes_done += ret;
3542                 iov_iter_save_state(&s->iter, &s->iter_state);
3543
3544                 /* if we can retry, do so with the callbacks armed */
3545                 if (!io_rw_should_retry(req)) {
3546                         kiocb->ki_flags &= ~IOCB_WAITQ;
3547                         return -EAGAIN;
3548                 }
3549
3550                 /*
3551                  * Now retry read with the IOCB_WAITQ parts set in the iocb. If
3552                  * we get -EIOCBQUEUED, then we'll get a notification when the
3553                  * desired page gets unlocked. We can also get a partial read
3554                  * here, and if we do, then just retry at the new offset.
3555                  */
3556                 ret = io_iter_do_read(req, &s->iter);
3557                 if (ret == -EIOCBQUEUED)
3558                         return 0;
3559                 /* we got some bytes, but not all. retry. */
3560                 kiocb->ki_flags &= ~IOCB_WAITQ;
3561                 iov_iter_restore(&s->iter, &s->iter_state);
3562         } while (ret > 0);
3563 done:
3564         kiocb_done(kiocb, ret, issue_flags);
3565 out_free:
3566         /* it's faster to check here then delegate to kfree */
3567         if (iovec)
3568                 kfree(iovec);
3569         return 0;
3570 }
3571
3572 static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3573 {
3574         if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
3575                 return -EBADF;
3576         return io_prep_rw(req, sqe, WRITE);
3577 }
3578
3579 static int io_write(struct io_kiocb *req, unsigned int issue_flags)
3580 {
3581         struct io_rw_state __s, *s = &__s;
3582         struct iovec *iovec;
3583         struct kiocb *kiocb = &req->rw.kiocb;
3584         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3585         ssize_t ret, ret2;
3586
3587         if (!req_has_async_data(req)) {
3588                 ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags);
3589                 if (unlikely(ret < 0))
3590                         return ret;
3591         } else {
3592                 struct io_async_rw *rw = req->async_data;
3593
3594                 s = &rw->s;
3595                 iov_iter_restore(&s->iter, &s->iter_state);
3596                 iovec = NULL;
3597         }
3598         req->result = iov_iter_count(&s->iter);
3599
3600         if (force_nonblock) {
3601                 /* If the file doesn't support async, just async punt */
3602                 if (unlikely(!io_file_supports_nowait(req, WRITE)))
3603                         goto copy_iov;
3604
3605                 /* file path doesn't support NOWAIT for non-direct_IO */
3606                 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
3607                     (req->flags & REQ_F_ISREG))
3608                         goto copy_iov;
3609
3610                 kiocb->ki_flags |= IOCB_NOWAIT;
3611         } else {
3612                 /* Ensure we clear previously set non-block flag */
3613                 kiocb->ki_flags &= ~IOCB_NOWAIT;
3614         }
3615
3616         ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result);
3617         if (unlikely(ret))
3618                 goto out_free;
3619
3620         /*
3621          * Open-code file_start_write here to grab freeze protection,
3622          * which will be released by another thread in
3623          * io_complete_rw().  Fool lockdep by telling it the lock got
3624          * released so that it doesn't complain about the held lock when
3625          * we return to userspace.
3626          */
3627         if (req->flags & REQ_F_ISREG) {
3628                 sb_start_write(file_inode(req->file)->i_sb);
3629                 __sb_writers_release(file_inode(req->file)->i_sb,
3630                                         SB_FREEZE_WRITE);
3631         }
3632         kiocb->ki_flags |= IOCB_WRITE;
3633
3634         if (req->file->f_op->write_iter)
3635                 ret2 = call_write_iter(req->file, kiocb, &s->iter);
3636         else if (req->file->f_op->write)
3637                 ret2 = loop_rw_iter(WRITE, req, &s->iter);
3638         else
3639                 ret2 = -EINVAL;
3640
3641         if (req->flags & REQ_F_REISSUE) {
3642                 req->flags &= ~REQ_F_REISSUE;
3643                 ret2 = -EAGAIN;
3644         }
3645
3646         /*
3647          * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
3648          * retry them without IOCB_NOWAIT.
3649          */
3650         if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
3651                 ret2 = -EAGAIN;
3652         /* no retry on NONBLOCK nor RWF_NOWAIT */
3653         if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
3654                 goto done;
3655         if (!force_nonblock || ret2 != -EAGAIN) {
3656                 /* IOPOLL retry should happen for io-wq threads */
3657                 if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
3658                         goto copy_iov;
3659 done:
3660                 kiocb_done(kiocb, ret2, issue_flags);
3661         } else {
3662 copy_iov:
3663                 iov_iter_restore(&s->iter, &s->iter_state);
3664                 ret = io_setup_async_rw(req, iovec, s, false);
3665                 return ret ?: -EAGAIN;
3666         }
3667 out_free:
3668         /* it's reportedly faster than delegating the null check to kfree() */
3669         if (iovec)
3670                 kfree(iovec);
3671         return ret;
3672 }
3673
3674 static int io_renameat_prep(struct io_kiocb *req,
3675                             const struct io_uring_sqe *sqe)
3676 {
3677         struct io_rename *ren = &req->rename;
3678         const char __user *oldf, *newf;
3679
3680         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3681                 return -EINVAL;
3682         if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
3683                 return -EINVAL;
3684         if (unlikely(req->flags & REQ_F_FIXED_FILE))
3685                 return -EBADF;
3686
3687         ren->old_dfd = READ_ONCE(sqe->fd);
3688         oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
3689         newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3690         ren->new_dfd = READ_ONCE(sqe->len);
3691         ren->flags = READ_ONCE(sqe->rename_flags);
3692
3693         ren->oldpath = getname(oldf);
3694         if (IS_ERR(ren->oldpath))
3695                 return PTR_ERR(ren->oldpath);
3696
3697         ren->newpath = getname(newf);
3698         if (IS_ERR(ren->newpath)) {
3699                 putname(ren->oldpath);
3700                 return PTR_ERR(ren->newpath);
3701         }
3702
3703         req->flags |= REQ_F_NEED_CLEANUP;
3704         return 0;
3705 }
3706
3707 static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
3708 {
3709         struct io_rename *ren = &req->rename;
3710         int ret;
3711
3712         if (issue_flags & IO_URING_F_NONBLOCK)
3713                 return -EAGAIN;
3714
3715         ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
3716                                 ren->newpath, ren->flags);
3717
3718         req->flags &= ~REQ_F_NEED_CLEANUP;
3719         if (ret < 0)
3720                 req_set_fail(req);
3721         io_req_complete(req, ret);
3722         return 0;
3723 }
3724
3725 static int io_unlinkat_prep(struct io_kiocb *req,
3726                             const struct io_uring_sqe *sqe)
3727 {
3728         struct io_unlink *un = &req->unlink;
3729         const char __user *fname;
3730
3731         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3732                 return -EINVAL;
3733         if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
3734             sqe->splice_fd_in)
3735                 return -EINVAL;
3736         if (unlikely(req->flags & REQ_F_FIXED_FILE))
3737                 return -EBADF;
3738
3739         un->dfd = READ_ONCE(sqe->fd);
3740
3741         un->flags = READ_ONCE(sqe->unlink_flags);
3742         if (un->flags & ~AT_REMOVEDIR)
3743                 return -EINVAL;
3744
3745         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3746         un->filename = getname(fname);
3747         if (IS_ERR(un->filename))
3748                 return PTR_ERR(un->filename);
3749
3750         req->flags |= REQ_F_NEED_CLEANUP;
3751         return 0;
3752 }
3753
3754 static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
3755 {
3756         struct io_unlink *un = &req->unlink;
3757         int ret;
3758
3759         if (issue_flags & IO_URING_F_NONBLOCK)
3760                 return -EAGAIN;
3761
3762         if (un->flags & AT_REMOVEDIR)
3763                 ret = do_rmdir(un->dfd, un->filename);
3764         else
3765                 ret = do_unlinkat(un->dfd, un->filename);
3766
3767         req->flags &= ~REQ_F_NEED_CLEANUP;
3768         if (ret < 0)
3769                 req_set_fail(req);
3770         io_req_complete(req, ret);
3771         return 0;
3772 }
3773
3774 static int io_mkdirat_prep(struct io_kiocb *req,
3775                             const struct io_uring_sqe *sqe)
3776 {
3777         struct io_mkdir *mkd = &req->mkdir;
3778         const char __user *fname;
3779
3780         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3781                 return -EINVAL;
3782         if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index ||
3783             sqe->splice_fd_in)
3784                 return -EINVAL;
3785         if (unlikely(req->flags & REQ_F_FIXED_FILE))
3786                 return -EBADF;
3787
3788         mkd->dfd = READ_ONCE(sqe->fd);
3789         mkd->mode = READ_ONCE(sqe->len);
3790
3791         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3792         mkd->filename = getname(fname);
3793         if (IS_ERR(mkd->filename))
3794                 return PTR_ERR(mkd->filename);
3795
3796         req->flags |= REQ_F_NEED_CLEANUP;
3797         return 0;
3798 }
3799
3800 static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
3801 {
3802         struct io_mkdir *mkd = &req->mkdir;
3803         int ret;
3804
3805         if (issue_flags & IO_URING_F_NONBLOCK)
3806                 return -EAGAIN;
3807
3808         ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);
3809
3810         req->flags &= ~REQ_F_NEED_CLEANUP;
3811         if (ret < 0)
3812                 req_set_fail(req);
3813         io_req_complete(req, ret);
3814         return 0;
3815 }
3816
3817 static int io_symlinkat_prep(struct io_kiocb *req,
3818                             const struct io_uring_sqe *sqe)
3819 {
3820         struct io_symlink *sl = &req->symlink;
3821         const char __user *oldpath, *newpath;
3822
3823         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3824                 return -EINVAL;
3825         if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index ||
3826             sqe->splice_fd_in)
3827                 return -EINVAL;
3828         if (unlikely(req->flags & REQ_F_FIXED_FILE))
3829                 return -EBADF;
3830
3831         sl->new_dfd = READ_ONCE(sqe->fd);
3832         oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr));
3833         newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3834
3835         sl->oldpath = getname(oldpath);
3836         if (IS_ERR(sl->oldpath))
3837                 return PTR_ERR(sl->oldpath);
3838
3839         sl->newpath = getname(newpath);
3840         if (IS_ERR(sl->newpath)) {
3841                 putname(sl->oldpath);
3842                 return PTR_ERR(sl->newpath);
3843         }
3844
3845         req->flags |= REQ_F_NEED_CLEANUP;
3846         return 0;
3847 }
3848
3849 static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
3850 {
3851         struct io_symlink *sl = &req->symlink;
3852         int ret;
3853
3854         if (issue_flags & IO_URING_F_NONBLOCK)
3855                 return -EAGAIN;
3856
3857         ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath);
3858
3859         req->flags &= ~REQ_F_NEED_CLEANUP;
3860         if (ret < 0)
3861                 req_set_fail(req);
3862         io_req_complete(req, ret);
3863         return 0;
3864 }
3865
3866 static int io_linkat_prep(struct io_kiocb *req,
3867                             const struct io_uring_sqe *sqe)
3868 {
3869         struct io_hardlink *lnk = &req->hardlink;
3870         const char __user *oldf, *newf;
3871
3872         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3873                 return -EINVAL;
3874         if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
3875                 return -EINVAL;
3876         if (unlikely(req->flags & REQ_F_FIXED_FILE))
3877                 return -EBADF;
3878
3879         lnk->old_dfd = READ_ONCE(sqe->fd);
3880         lnk->new_dfd = READ_ONCE(sqe->len);
3881         oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
3882         newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3883         lnk->flags = READ_ONCE(sqe->hardlink_flags);
3884
3885         lnk->oldpath = getname(oldf);
3886         if (IS_ERR(lnk->oldpath))
3887                 return PTR_ERR(lnk->oldpath);
3888
3889         lnk->newpath = getname(newf);
3890         if (IS_ERR(lnk->newpath)) {
3891                 putname(lnk->oldpath);
3892                 return PTR_ERR(lnk->newpath);
3893         }
3894
3895         req->flags |= REQ_F_NEED_CLEANUP;
3896         return 0;
3897 }
3898
3899 static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
3900 {
3901         struct io_hardlink *lnk = &req->hardlink;
3902         int ret;
3903
3904         if (issue_flags & IO_URING_F_NONBLOCK)
3905                 return -EAGAIN;
3906
3907         ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd,
3908                                 lnk->newpath, lnk->flags);
3909
3910         req->flags &= ~REQ_F_NEED_CLEANUP;
3911         if (ret < 0)
3912                 req_set_fail(req);
3913         io_req_complete(req, ret);
3914         return 0;
3915 }
3916
3917 static int io_shutdown_prep(struct io_kiocb *req,
3918                             const struct io_uring_sqe *sqe)
3919 {
3920 #if defined(CONFIG_NET)
3921         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3922                 return -EINVAL;
3923         if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
3924                      sqe->buf_index || sqe->splice_fd_in))
3925                 return -EINVAL;
3926
3927         req->shutdown.how = READ_ONCE(sqe->len);
3928         return 0;
3929 #else
3930         return -EOPNOTSUPP;
3931 #endif
3932 }
3933
3934 static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
3935 {
3936 #if defined(CONFIG_NET)
3937         struct socket *sock;
3938         int ret;
3939
3940         if (issue_flags & IO_URING_F_NONBLOCK)
3941                 return -EAGAIN;
3942
3943         sock = sock_from_file(req->file);
3944         if (unlikely(!sock))
3945                 return -ENOTSOCK;
3946
3947         ret = __sys_shutdown_sock(sock, req->shutdown.how);
3948         if (ret < 0)
3949                 req_set_fail(req);
3950         io_req_complete(req, ret);
3951         return 0;
3952 #else
3953         return -EOPNOTSUPP;
3954 #endif
3955 }
3956
3957 static int __io_splice_prep(struct io_kiocb *req,
3958                             const struct io_uring_sqe *sqe)
3959 {
3960         struct io_splice *sp = &req->splice;
3961         unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
3962
3963         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3964                 return -EINVAL;
3965
3966         sp->file_in = NULL;
3967         sp->len = READ_ONCE(sqe->len);
3968         sp->flags = READ_ONCE(sqe->splice_flags);
3969
3970         if (unlikely(sp->flags & ~valid_flags))
3971                 return -EINVAL;
3972
3973         sp->file_in = io_file_get(req->ctx, req, READ_ONCE(sqe->splice_fd_in),
3974                                   (sp->flags & SPLICE_F_FD_IN_FIXED));
3975         if (!sp->file_in)
3976                 return -EBADF;
3977         req->flags |= REQ_F_NEED_CLEANUP;
3978         return 0;
3979 }
3980
3981 static int io_tee_prep(struct io_kiocb *req,
3982                        const struct io_uring_sqe *sqe)
3983 {
3984         if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
3985                 return -EINVAL;
3986         return __io_splice_prep(req, sqe);
3987 }
3988
3989 static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
3990 {
3991         struct io_splice *sp = &req->splice;
3992         struct file *in = sp->file_in;
3993         struct file *out = sp->file_out;
3994         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3995         long ret = 0;
3996
3997         if (issue_flags & IO_URING_F_NONBLOCK)
3998                 return -EAGAIN;
3999         if (sp->len)
4000                 ret = do_tee(in, out, sp->len, flags);
4001
4002         if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
4003                 io_put_file(in);
4004         req->flags &= ~REQ_F_NEED_CLEANUP;
4005
4006         if (ret != sp->len)
4007                 req_set_fail(req);
4008         io_req_complete(req, ret);
4009         return 0;
4010 }
4011
4012 static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4013 {
4014         struct io_splice *sp = &req->splice;
4015
4016         sp->off_in = READ_ONCE(sqe->splice_off_in);
4017         sp->off_out = READ_ONCE(sqe->off);
4018         return __io_splice_prep(req, sqe);
4019 }
4020
4021 static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
4022 {
4023         struct io_splice *sp = &req->splice;
4024         struct file *in = sp->file_in;
4025         struct file *out = sp->file_out;
4026         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
4027         loff_t *poff_in, *poff_out;
4028         long ret = 0;
4029
4030         if (issue_flags & IO_URING_F_NONBLOCK)
4031                 return -EAGAIN;
4032
4033         poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
4034         poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
4035
4036         if (sp->len)
4037                 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
4038
4039         if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
4040                 io_put_file(in);
4041         req->flags &= ~REQ_F_NEED_CLEANUP;
4042
4043         if (ret != sp->len)
4044                 req_set_fail(req);
4045         io_req_complete(req, ret);
4046         return 0;
4047 }
4048
4049 /*
4050  * IORING_OP_NOP just posts a completion event, nothing else.
4051  */
4052 static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
4053 {
4054         struct io_ring_ctx *ctx = req->ctx;
4055
4056         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4057                 return -EINVAL;
4058
4059         __io_req_complete(req, issue_flags, 0, 0);
4060         return 0;
4061 }
4062
4063 static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4064 {
4065         struct io_ring_ctx *ctx = req->ctx;
4066
4067         if (!req->file)
4068                 return -EBADF;
4069
4070         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4071                 return -EINVAL;
4072         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
4073                      sqe->splice_fd_in))
4074                 return -EINVAL;
4075
4076         req->sync.flags = READ_ONCE(sqe->fsync_flags);
4077         if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
4078                 return -EINVAL;
4079
4080         req->sync.off = READ_ONCE(sqe->off);
4081         req->sync.len = READ_ONCE(sqe->len);
4082         return 0;
4083 }
4084
4085 static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
4086 {
4087         loff_t end = req->sync.off + req->sync.len;
4088         int ret;
4089
4090         /* fsync always requires a blocking context */
4091         if (issue_flags & IO_URING_F_NONBLOCK)
4092                 return -EAGAIN;
4093
4094         ret = vfs_fsync_range(req->file, req->sync.off,
4095                                 end > 0 ? end : LLONG_MAX,
4096                                 req->sync.flags & IORING_FSYNC_DATASYNC);
4097         if (ret < 0)
4098                 req_set_fail(req);
4099         io_req_complete(req, ret);
4100         return 0;
4101 }
4102
4103 static int io_fallocate_prep(struct io_kiocb *req,
4104                              const struct io_uring_sqe *sqe)
4105 {
4106         if (sqe->ioprio || sqe->buf_index || sqe->rw_flags ||
4107             sqe->splice_fd_in)
4108                 return -EINVAL;
4109         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4110                 return -EINVAL;
4111
4112         req->sync.off = READ_ONCE(sqe->off);
4113         req->sync.len = READ_ONCE(sqe->addr);
4114         req->sync.mode = READ_ONCE(sqe->len);
4115         return 0;
4116 }
4117
4118 static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
4119 {
4120         int ret;
4121
4122         /* fallocate always requiring blocking context */
4123         if (issue_flags & IO_URING_F_NONBLOCK)
4124                 return -EAGAIN;
4125         ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
4126                                 req->sync.len);
4127         if (ret < 0)
4128                 req_set_fail(req);
4129         io_req_complete(req, ret);
4130         return 0;
4131 }
4132
4133 static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4134 {
4135         const char __user *fname;
4136         int ret;
4137
4138         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4139                 return -EINVAL;
4140         if (unlikely(sqe->ioprio || sqe->buf_index))
4141                 return -EINVAL;
4142         if (unlikely(req->flags & REQ_F_FIXED_FILE))
4143                 return -EBADF;
4144
4145         /* open.how should be already initialised */
4146         if (!(req->open.how.flags & O_PATH) && force_o_largefile())
4147                 req->open.how.flags |= O_LARGEFILE;
4148
4149         req->open.dfd = READ_ONCE(sqe->fd);
4150         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
4151         req->open.filename = getname(fname);
4152         if (IS_ERR(req->open.filename)) {
4153                 ret = PTR_ERR(req->open.filename);
4154                 req->open.filename = NULL;
4155                 return ret;
4156         }
4157
4158         req->open.file_slot = READ_ONCE(sqe->file_index);
4159         if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC))
4160                 return -EINVAL;
4161
4162         req->open.nofile = rlimit(RLIMIT_NOFILE);
4163         req->flags |= REQ_F_NEED_CLEANUP;
4164         return 0;
4165 }
4166
4167 static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4168 {
4169         u64 mode = READ_ONCE(sqe->len);
4170         u64 flags = READ_ONCE(sqe->open_flags);
4171
4172         req->open.how = build_open_how(flags, mode);
4173         return __io_openat_prep(req, sqe);
4174 }
4175
4176 static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4177 {
4178         struct open_how __user *how;
4179         size_t len;
4180         int ret;
4181
4182         how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4183         len = READ_ONCE(sqe->len);
4184         if (len < OPEN_HOW_SIZE_VER0)
4185                 return -EINVAL;
4186
4187         ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
4188                                         len);
4189         if (ret)
4190                 return ret;
4191
4192         return __io_openat_prep(req, sqe);
4193 }
4194
4195 static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
4196 {
4197         struct open_flags op;
4198         struct file *file;
4199         bool resolve_nonblock, nonblock_set;
4200         bool fixed = !!req->open.file_slot;
4201         int ret;
4202
4203         ret = build_open_flags(&req->open.how, &op);
4204         if (ret)
4205                 goto err;
4206         nonblock_set = op.open_flag & O_NONBLOCK;
4207         resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
4208         if (issue_flags & IO_URING_F_NONBLOCK) {
4209                 /*
4210                  * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
4211                  * it'll always -EAGAIN
4212                  */
4213                 if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
4214                         return -EAGAIN;
4215                 op.lookup_flags |= LOOKUP_CACHED;
4216                 op.open_flag |= O_NONBLOCK;
4217         }
4218
4219         if (!fixed) {
4220                 ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
4221                 if (ret < 0)
4222                         goto err;
4223         }
4224
4225         file = do_filp_open(req->open.dfd, req->open.filename, &op);
4226         if (IS_ERR(file)) {
4227                 /*
4228                  * We could hang on to this 'fd' on retrying, but seems like
4229                  * marginal gain for something that is now known to be a slower
4230                  * path. So just put it, and we'll get a new one when we retry.
4231                  */
4232                 if (!fixed)
4233                         put_unused_fd(ret);
4234
4235                 ret = PTR_ERR(file);
4236                 /* only retry if RESOLVE_CACHED wasn't already set by application */
4237                 if (ret == -EAGAIN &&
4238                     (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)))
4239                         return -EAGAIN;
4240                 goto err;
4241         }
4242
4243         if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
4244                 file->f_flags &= ~O_NONBLOCK;
4245         fsnotify_open(file);
4246
4247         if (!fixed)
4248                 fd_install(ret, file);
4249         else
4250                 ret = io_install_fixed_file(req, file, issue_flags,
4251                                             req->open.file_slot - 1);
4252 err:
4253         putname(req->open.filename);
4254         req->flags &= ~REQ_F_NEED_CLEANUP;
4255         if (ret < 0)
4256                 req_set_fail(req);
4257         __io_req_complete(req, issue_flags, ret, 0);
4258         return 0;
4259 }
4260
/* Issue handler for openat: identical to openat2 once prep has built open.how. */
static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
{
	int ret = io_openat2(req, issue_flags);

	return ret;
}
4265
4266 static int io_remove_buffers_prep(struct io_kiocb *req,
4267                                   const struct io_uring_sqe *sqe)
4268 {
4269         struct io_provide_buf *p = &req->pbuf;
4270         u64 tmp;
4271
4272         if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
4273             sqe->splice_fd_in)
4274                 return -EINVAL;
4275
4276         tmp = READ_ONCE(sqe->fd);
4277         if (!tmp || tmp > USHRT_MAX)
4278                 return -EINVAL;
4279
4280         memset(p, 0, sizeof(*p));
4281         p->nbufs = tmp;
4282         p->bgid = READ_ONCE(sqe->buf_group);
4283         return 0;
4284 }
4285
4286 static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
4287                                int bgid, unsigned nbufs)
4288 {
4289         unsigned i = 0;
4290
4291         /* shouldn't happen */
4292         if (!nbufs)
4293                 return 0;
4294
4295         /* the head kbuf is the list itself */
4296         while (!list_empty(&buf->list)) {
4297                 struct io_buffer *nxt;
4298
4299                 nxt = list_first_entry(&buf->list, struct io_buffer, list);
4300                 list_del(&nxt->list);
4301                 kfree(nxt);
4302                 if (++i == nbufs)
4303                         return i;
4304         }
4305         i++;
4306         kfree(buf);
4307         xa_erase(&ctx->io_buffers, bgid);
4308
4309         return i;
4310 }
4311
4312 static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
4313 {
4314         struct io_provide_buf *p = &req->pbuf;
4315         struct io_ring_ctx *ctx = req->ctx;
4316         struct io_buffer *head;
4317         int ret = 0;
4318         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4319
4320         io_ring_submit_lock(ctx, !force_nonblock);
4321
4322         lockdep_assert_held(&ctx->uring_lock);
4323
4324         ret = -ENOENT;
4325         head = xa_load(&ctx->io_buffers, p->bgid);
4326         if (head)
4327                 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
4328         if (ret < 0)
4329                 req_set_fail(req);
4330
4331         /* complete before unlock, IOPOLL may need the lock */
4332         __io_req_complete(req, issue_flags, ret, 0);
4333         io_ring_submit_unlock(ctx, !force_nonblock);
4334         return 0;
4335 }
4336
4337 static int io_provide_buffers_prep(struct io_kiocb *req,
4338                                    const struct io_uring_sqe *sqe)
4339 {
4340         unsigned long size, tmp_check;
4341         struct io_provide_buf *p = &req->pbuf;
4342         u64 tmp;
4343
4344         if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
4345                 return -EINVAL;
4346
4347         tmp = READ_ONCE(sqe->fd);
4348         if (!tmp || tmp > USHRT_MAX)
4349                 return -E2BIG;
4350         p->nbufs = tmp;
4351         p->addr = READ_ONCE(sqe->addr);
4352         p->len = READ_ONCE(sqe->len);
4353
4354         if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
4355                                 &size))
4356                 return -EOVERFLOW;
4357         if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
4358                 return -EOVERFLOW;
4359
4360         size = (unsigned long)p->len * p->nbufs;
4361         if (!access_ok(u64_to_user_ptr(p->addr), size))
4362                 return -EFAULT;
4363
4364         p->bgid = READ_ONCE(sqe->buf_group);
4365         tmp = READ_ONCE(sqe->off);
4366         if (tmp > USHRT_MAX)
4367                 return -E2BIG;
4368         p->bid = tmp;
4369         return 0;
4370 }
4371
4372 static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
4373 {
4374         struct io_buffer *buf;
4375         u64 addr = pbuf->addr;
4376         int i, bid = pbuf->bid;
4377
4378         for (i = 0; i < pbuf->nbufs; i++) {
4379                 buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
4380                 if (!buf)
4381                         break;
4382
4383                 buf->addr = addr;
4384                 buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
4385                 buf->bid = bid;
4386                 addr += pbuf->len;
4387                 bid++;
4388                 if (!*head) {
4389                         INIT_LIST_HEAD(&buf->list);
4390                         *head = buf;
4391                 } else {
4392                         list_add_tail(&buf->list, &(*head)->list);
4393                 }
4394         }
4395
4396         return i ? i : -ENOMEM;
4397 }
4398
4399 static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
4400 {
4401         struct io_provide_buf *p = &req->pbuf;
4402         struct io_ring_ctx *ctx = req->ctx;
4403         struct io_buffer *head, *list;
4404         int ret = 0;
4405         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4406
4407         io_ring_submit_lock(ctx, !force_nonblock);
4408
4409         lockdep_assert_held(&ctx->uring_lock);
4410
4411         list = head = xa_load(&ctx->io_buffers, p->bgid);
4412
4413         ret = io_add_buffers(p, &head);
4414         if (ret >= 0 && !list) {
4415                 ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL);
4416                 if (ret < 0)
4417                         __io_remove_buffers(ctx, head, p->bgid, -1U);
4418         }
4419         if (ret < 0)
4420                 req_set_fail(req);
4421         /* complete before unlock, IOPOLL may need the lock */
4422         __io_req_complete(req, issue_flags, ret, 0);
4423         io_ring_submit_unlock(ctx, !force_nonblock);
4424         return 0;
4425 }
4426
4427 static int io_epoll_ctl_prep(struct io_kiocb *req,
4428                              const struct io_uring_sqe *sqe)
4429 {
4430 #if defined(CONFIG_EPOLL)
4431         if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
4432                 return -EINVAL;
4433         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4434                 return -EINVAL;
4435
4436         req->epoll.epfd = READ_ONCE(sqe->fd);
4437         req->epoll.op = READ_ONCE(sqe->len);
4438         req->epoll.fd = READ_ONCE(sqe->off);
4439
4440         if (ep_op_has_event(req->epoll.op)) {
4441                 struct epoll_event __user *ev;
4442
4443                 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
4444                 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
4445                         return -EFAULT;
4446         }
4447
4448         return 0;
4449 #else
4450         return -EOPNOTSUPP;
4451 #endif
4452 }
4453
4454 static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
4455 {
4456 #if defined(CONFIG_EPOLL)
4457         struct io_epoll *ie = &req->epoll;
4458         int ret;
4459         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4460
4461         ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
4462         if (force_nonblock && ret == -EAGAIN)
4463                 return -EAGAIN;
4464
4465         if (ret < 0)
4466                 req_set_fail(req);
4467         __io_req_complete(req, issue_flags, ret, 0);
4468         return 0;
4469 #else
4470         return -EOPNOTSUPP;
4471 #endif
4472 }
4473
4474 static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4475 {
4476 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4477         if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in)
4478                 return -EINVAL;
4479         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4480                 return -EINVAL;
4481
4482         req->madvise.addr = READ_ONCE(sqe->addr);
4483         req->madvise.len = READ_ONCE(sqe->len);
4484         req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
4485         return 0;
4486 #else
4487         return -EOPNOTSUPP;
4488 #endif
4489 }
4490
4491 static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
4492 {
4493 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4494         struct io_madvise *ma = &req->madvise;
4495         int ret;
4496
4497         if (issue_flags & IO_URING_F_NONBLOCK)
4498                 return -EAGAIN;
4499
4500         ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
4501         if (ret < 0)
4502                 req_set_fail(req);
4503         io_req_complete(req, ret);
4504         return 0;
4505 #else
4506         return -EOPNOTSUPP;
4507 #endif
4508 }
4509
4510 static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4511 {
4512         if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in)
4513                 return -EINVAL;
4514         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4515                 return -EINVAL;
4516
4517         req->fadvise.offset = READ_ONCE(sqe->off);
4518         req->fadvise.len = READ_ONCE(sqe->len);
4519         req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
4520         return 0;
4521 }
4522
4523 static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
4524 {
4525         struct io_fadvise *fa = &req->fadvise;
4526         int ret;
4527
4528         if (issue_flags & IO_URING_F_NONBLOCK) {
4529                 switch (fa->advice) {
4530                 case POSIX_FADV_NORMAL:
4531                 case POSIX_FADV_RANDOM:
4532                 case POSIX_FADV_SEQUENTIAL:
4533                         break;
4534                 default:
4535                         return -EAGAIN;
4536                 }
4537         }
4538
4539         ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
4540         if (ret < 0)
4541                 req_set_fail(req);
4542         __io_req_complete(req, issue_flags, ret, 0);
4543         return 0;
4544 }
4545
4546 static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4547 {
4548         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4549                 return -EINVAL;
4550         if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
4551                 return -EINVAL;
4552         if (req->flags & REQ_F_FIXED_FILE)
4553                 return -EBADF;
4554
4555         req->statx.dfd = READ_ONCE(sqe->fd);
4556         req->statx.mask = READ_ONCE(sqe->len);
4557         req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
4558         req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4559         req->statx.flags = READ_ONCE(sqe->statx_flags);
4560
4561         return 0;
4562 }
4563
4564 static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
4565 {
4566         struct io_statx *ctx = &req->statx;
4567         int ret;
4568
4569         if (issue_flags & IO_URING_F_NONBLOCK)
4570                 return -EAGAIN;
4571
4572         ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
4573                        ctx->buffer);
4574
4575         if (ret < 0)
4576                 req_set_fail(req);
4577         io_req_complete(req, ret);
4578         return 0;
4579 }
4580
4581 static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4582 {
4583         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4584                 return -EINVAL;
4585         if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
4586             sqe->rw_flags || sqe->buf_index)
4587                 return -EINVAL;
4588         if (req->flags & REQ_F_FIXED_FILE)
4589                 return -EBADF;
4590
4591         req->close.fd = READ_ONCE(sqe->fd);
4592         req->close.file_slot = READ_ONCE(sqe->file_index);
4593         if (req->close.file_slot && req->close.fd)
4594                 return -EINVAL;
4595
4596         return 0;
4597 }
4598
4599 static int io_close(struct io_kiocb *req, unsigned int issue_flags)
4600 {
4601         struct files_struct *files = current->files;
4602         struct io_close *close = &req->close;
4603         struct fdtable *fdt;
4604         struct file *file = NULL;
4605         int ret = -EBADF;
4606
4607         if (req->close.file_slot) {
4608                 ret = io_close_fixed(req, issue_flags);
4609                 goto err;
4610         }
4611
4612         spin_lock(&files->file_lock);
4613         fdt = files_fdtable(files);
4614         if (close->fd >= fdt->max_fds) {
4615                 spin_unlock(&files->file_lock);
4616                 goto err;
4617         }
4618         file = fdt->fd[close->fd];
4619         if (!file || file->f_op == &io_uring_fops) {
4620                 spin_unlock(&files->file_lock);
4621                 file = NULL;
4622                 goto err;
4623         }
4624
4625         /* if the file has a flush method, be safe and punt to async */
4626         if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
4627                 spin_unlock(&files->file_lock);
4628                 return -EAGAIN;
4629         }
4630
4631         ret = __close_fd_get_file(close->fd, &file);
4632         spin_unlock(&files->file_lock);
4633         if (ret < 0) {
4634                 if (ret == -ENOENT)
4635                         ret = -EBADF;
4636                 goto err;
4637         }
4638
4639         /* No ->flush() or already async, safely close from here */
4640         ret = filp_close(file, current->files);
4641 err:
4642         if (ret < 0)
4643                 req_set_fail(req);
4644         if (file)
4645                 fput(file);
4646         __io_req_complete(req, issue_flags, ret, 0);
4647         return 0;
4648 }
4649
4650 static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4651 {
4652         struct io_ring_ctx *ctx = req->ctx;
4653
4654         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4655                 return -EINVAL;
4656         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
4657                      sqe->splice_fd_in))
4658                 return -EINVAL;
4659
4660         req->sync.off = READ_ONCE(sqe->off);
4661         req->sync.len = READ_ONCE(sqe->len);
4662         req->sync.flags = READ_ONCE(sqe->sync_range_flags);
4663         return 0;
4664 }
4665
4666 static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
4667 {
4668         int ret;
4669
4670         /* sync_file_range always requires a blocking context */
4671         if (issue_flags & IO_URING_F_NONBLOCK)
4672                 return -EAGAIN;
4673
4674         ret = sync_file_range(req->file, req->sync.off, req->sync.len,
4675                                 req->sync.flags);
4676         if (ret < 0)
4677                 req_set_fail(req);
4678         io_req_complete(req, ret);
4679         return 0;
4680 }
4681
4682 #if defined(CONFIG_NET)
4683 static int io_setup_async_msg(struct io_kiocb *req,
4684                               struct io_async_msghdr *kmsg)
4685 {
4686         struct io_async_msghdr *async_msg = req->async_data;
4687
4688         if (async_msg)
4689                 return -EAGAIN;
4690         if (io_alloc_async_data(req)) {
4691                 kfree(kmsg->free_iov);
4692                 return -ENOMEM;
4693         }
4694         async_msg = req->async_data;
4695         req->flags |= REQ_F_NEED_CLEANUP;
4696         memcpy(async_msg, kmsg, sizeof(*kmsg));
4697         async_msg->msg.msg_name = &async_msg->addr;
4698         /* if were using fast_iov, set it to the new one */
4699         if (!async_msg->free_iov)
4700                 async_msg->msg.msg_iter.iov = async_msg->fast_iov;
4701
4702         return -EAGAIN;
4703 }
4704
4705 static int io_sendmsg_copy_hdr(struct io_kiocb *req,
4706                                struct io_async_msghdr *iomsg)
4707 {
4708         iomsg->msg.msg_name = &iomsg->addr;
4709         iomsg->free_iov = iomsg->fast_iov;
4710         return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
4711                                    req->sr_msg.msg_flags, &iomsg->free_iov);
4712 }
4713
4714 static int io_sendmsg_prep_async(struct io_kiocb *req)
4715 {
4716         int ret;
4717
4718         ret = io_sendmsg_copy_hdr(req, req->async_data);
4719         if (!ret)
4720                 req->flags |= REQ_F_NEED_CLEANUP;
4721         return ret;
4722 }
4723
4724 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4725 {
4726         struct io_sr_msg *sr = &req->sr_msg;
4727
4728         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4729                 return -EINVAL;
4730
4731         sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4732         sr->len = READ_ONCE(sqe->len);
4733         sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
4734         if (sr->msg_flags & MSG_DONTWAIT)
4735                 req->flags |= REQ_F_NOWAIT;
4736
4737 #ifdef CONFIG_COMPAT
4738         if (req->ctx->compat)
4739                 sr->msg_flags |= MSG_CMSG_COMPAT;
4740 #endif
4741         return 0;
4742 }
4743
4744 static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
4745 {
4746         struct io_async_msghdr iomsg, *kmsg;
4747         struct socket *sock;
4748         unsigned flags;
4749         int min_ret = 0;
4750         int ret;
4751
4752         sock = sock_from_file(req->file);
4753         if (unlikely(!sock))
4754                 return -ENOTSOCK;
4755
4756         if (req_has_async_data(req)) {
4757                 kmsg = req->async_data;
4758         } else {
4759                 ret = io_sendmsg_copy_hdr(req, &iomsg);
4760                 if (ret)
4761                         return ret;
4762                 kmsg = &iomsg;
4763         }
4764
4765         flags = req->sr_msg.msg_flags;
4766         if (issue_flags & IO_URING_F_NONBLOCK)
4767                 flags |= MSG_DONTWAIT;
4768         if (flags & MSG_WAITALL)
4769                 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
4770
4771         ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
4772         if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
4773                 return io_setup_async_msg(req, kmsg);
4774         if (ret == -ERESTARTSYS)
4775                 ret = -EINTR;
4776
4777         /* fast path, check for non-NULL to avoid function call */
4778         if (kmsg->free_iov)
4779                 kfree(kmsg->free_iov);
4780         req->flags &= ~REQ_F_NEED_CLEANUP;
4781         if (ret < min_ret)
4782                 req_set_fail(req);
4783         __io_req_complete(req, issue_flags, ret, 0);
4784         return 0;
4785 }
4786
4787 static int io_send(struct io_kiocb *req, unsigned int issue_flags)
4788 {
4789         struct io_sr_msg *sr = &req->sr_msg;
4790         struct msghdr msg;
4791         struct iovec iov;
4792         struct socket *sock;
4793         unsigned flags;
4794         int min_ret = 0;
4795         int ret;
4796
4797         sock = sock_from_file(req->file);
4798         if (unlikely(!sock))
4799                 return -ENOTSOCK;
4800
4801         ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
4802         if (unlikely(ret))
4803                 return ret;
4804
4805         msg.msg_name = NULL;
4806         msg.msg_control = NULL;
4807         msg.msg_controllen = 0;
4808         msg.msg_namelen = 0;
4809
4810         flags = req->sr_msg.msg_flags;
4811         if (issue_flags & IO_URING_F_NONBLOCK)
4812                 flags |= MSG_DONTWAIT;
4813         if (flags & MSG_WAITALL)
4814                 min_ret = iov_iter_count(&msg.msg_iter);
4815
4816         msg.msg_flags = flags;
4817         ret = sock_sendmsg(sock, &msg);
4818         if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
4819                 return -EAGAIN;
4820         if (ret == -ERESTARTSYS)
4821                 ret = -EINTR;
4822
4823         if (ret < min_ret)
4824                 req_set_fail(req);
4825         __io_req_complete(req, issue_flags, ret, 0);
4826         return 0;
4827 }
4828
4829 static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
4830                                  struct io_async_msghdr *iomsg)
4831 {
4832         struct io_sr_msg *sr = &req->sr_msg;
4833         struct iovec __user *uiov;
4834         size_t iov_len;
4835         int ret;
4836
4837         ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
4838                                         &iomsg->uaddr, &uiov, &iov_len);
4839         if (ret)
4840                 return ret;
4841
4842         if (req->flags & REQ_F_BUFFER_SELECT) {
4843                 if (iov_len > 1)
4844                         return -EINVAL;
4845                 if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
4846                         return -EFAULT;
4847                 sr->len = iomsg->fast_iov[0].iov_len;
4848                 iomsg->free_iov = NULL;
4849         } else {
4850                 iomsg->free_iov = iomsg->fast_iov;
4851                 ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
4852                                      &iomsg->free_iov, &iomsg->msg.msg_iter,
4853                                      false);
4854                 if (ret > 0)
4855                         ret = 0;
4856         }
4857
4858         return ret;
4859 }
4860
4861 #ifdef CONFIG_COMPAT
4862 static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
4863                                         struct io_async_msghdr *iomsg)
4864 {
4865         struct io_sr_msg *sr = &req->sr_msg;
4866         struct compat_iovec __user *uiov;
4867         compat_uptr_t ptr;
4868         compat_size_t len;
4869         int ret;
4870
4871         ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
4872                                   &ptr, &len);
4873         if (ret)
4874                 return ret;
4875
4876         uiov = compat_ptr(ptr);
4877         if (req->flags & REQ_F_BUFFER_SELECT) {
4878                 compat_ssize_t clen;
4879
4880                 if (len > 1)
4881                         return -EINVAL;
4882                 if (!access_ok(uiov, sizeof(*uiov)))
4883                         return -EFAULT;
4884                 if (__get_user(clen, &uiov->iov_len))
4885                         return -EFAULT;
4886                 if (clen < 0)
4887                         return -EINVAL;
4888                 sr->len = clen;
4889                 iomsg->free_iov = NULL;
4890         } else {
4891                 iomsg->free_iov = iomsg->fast_iov;
4892                 ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
4893                                    UIO_FASTIOV, &iomsg->free_iov,
4894                                    &iomsg->msg.msg_iter, true);
4895                 if (ret < 0)
4896                         return ret;
4897         }
4898
4899         return 0;
4900 }
4901 #endif
4902
4903 static int io_recvmsg_copy_hdr(struct io_kiocb *req,
4904                                struct io_async_msghdr *iomsg)
4905 {
4906         iomsg->msg.msg_name = &iomsg->addr;
4907
4908 #ifdef CONFIG_COMPAT
4909         if (req->ctx->compat)
4910                 return __io_compat_recvmsg_copy_hdr(req, iomsg);
4911 #endif
4912
4913         return __io_recvmsg_copy_hdr(req, iomsg);
4914 }
4915
4916 static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
4917                                                unsigned int issue_flags)
4918 {
4919         struct io_sr_msg *sr = &req->sr_msg;
4920
4921         return io_buffer_select(req, &sr->len, sr->bgid, issue_flags);
4922 }
4923
4924 static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
4925 {
4926         return io_put_kbuf(req, req->kbuf);
4927 }
4928
4929 static int io_recvmsg_prep_async(struct io_kiocb *req)
4930 {
4931         int ret;
4932
4933         ret = io_recvmsg_copy_hdr(req, req->async_data);
4934         if (!ret)
4935                 req->flags |= REQ_F_NEED_CLEANUP;
4936         return ret;
4937 }
4938
4939 static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4940 {
4941         struct io_sr_msg *sr = &req->sr_msg;
4942
4943         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4944                 return -EINVAL;
4945
4946         sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4947         sr->len = READ_ONCE(sqe->len);
4948         sr->bgid = READ_ONCE(sqe->buf_group);
4949         sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
4950         if (sr->msg_flags & MSG_DONTWAIT)
4951                 req->flags |= REQ_F_NOWAIT;
4952
4953 #ifdef CONFIG_COMPAT
4954         if (req->ctx->compat)
4955                 sr->msg_flags |= MSG_CMSG_COMPAT;
4956 #endif
4957         return 0;
4958 }
4959
/*
 * Issue handler for a recvmsg-style receive on the socket behind req->file.
 *
 * The message header comes from req->async_data when the request already
 * carries async data, otherwise it is copied into the on-stack @iomsg with
 * io_recvmsg_copy_hdr(). With REQ_F_BUFFER_SELECT a buffer is obtained from
 * io_recv_buffer_select() and installed as the single destination iovec.
 *
 * Returns -ENOTSOCK if the file is not a socket, the error from the header
 * copy or the buffer selection, or whatever io_setup_async_msg() returns when
 * a nonblocking attempt hits -EAGAIN. In every other case the request is
 * completed with the receive result and 0 is returned.
 */
4960 static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
4961 {
4962         struct io_async_msghdr iomsg, *kmsg;
4963         struct socket *sock;
4964         struct io_buffer *kbuf;
4965         unsigned flags;
4966         int min_ret = 0;
4967         int ret, cflags = 0;
4968         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4969
4970         sock = sock_from_file(req->file);
4971         if (unlikely(!sock))
4972                 return -ENOTSOCK;
4973
4974         if (req_has_async_data(req)) {
4975                 kmsg = req->async_data;
4976         } else {
4977                 ret = io_recvmsg_copy_hdr(req, &iomsg);
4978                 if (ret)
4979                         return ret;
4980                 kmsg = &iomsg;
4981         }
4982
             /* a selected buffer replaces the iovec: one segment of sr_msg.len bytes */
4983         if (req->flags & REQ_F_BUFFER_SELECT) {
4984                 kbuf = io_recv_buffer_select(req, issue_flags);
4985                 if (IS_ERR(kbuf))
4986                         return PTR_ERR(kbuf);
4987                 kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
4988                 kmsg->fast_iov[0].iov_len = req->sr_msg.len;
4989                 iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
4990                                 1, req->sr_msg.len);
4991         }
4992
4993         flags = req->sr_msg.msg_flags;
4994         if (force_nonblock)
4995                 flags |= MSG_DONTWAIT;
             /* with MSG_WAITALL anything short of the full iterator is a failure */
4996         if (flags & MSG_WAITALL)
4997                 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
4998
4999         ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
5000                                         kmsg->uaddr, flags);
5001         if (force_nonblock && ret == -EAGAIN)
5002                 return io_setup_async_msg(req, kmsg);
             /* -ERESTARTSYS is reported to the submitter as -EINTR */
5003         if (ret == -ERESTARTSYS)
5004                 ret = -EINTR;
5005
5006         if (req->flags & REQ_F_BUFFER_SELECTED)
5007                 cflags = io_put_recv_kbuf(req);
5008         /* fast path, check for non-NULL to avoid function call */
5009         if (kmsg->free_iov)
5010                 kfree(kmsg->free_iov);
5011         req->flags &= ~REQ_F_NEED_CLEANUP;
             /* short transfer, or a truncated message under MSG_WAITALL, fails the request */
5012         if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
5013                 req_set_fail(req);
5014         __io_req_complete(req, issue_flags, ret, cflags);
5015         return 0;
5016 }
5017
/*
 * Issue handler for a plain receive into a single buffer.
 *
 * The destination is sr->buf, or the address of a buffer picked by
 * io_recv_buffer_select() when REQ_F_BUFFER_SELECT is set; sr->len bytes of it
 * are imported as a one-segment iterator. The message header is built on the
 * stack with no name, no control data and no iocb.
 *
 * Returns -ENOTSOCK if the file is not a socket, the error from the buffer
 * selection, or -EAGAIN when a nonblocking attempt would block. Otherwise the
 * request is completed with the receive (or import) result and 0 is returned.
 */
5018 static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
5019 {
5020         struct io_buffer *kbuf;
5021         struct io_sr_msg *sr = &req->sr_msg;
5022         struct msghdr msg;
5023         void __user *buf = sr->buf;
5024         struct socket *sock;
5025         struct iovec iov;
5026         unsigned flags;
5027         int min_ret = 0;
5028         int ret, cflags = 0;
5029         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
5030
5031         sock = sock_from_file(req->file);
5032         if (unlikely(!sock))
5033                 return -ENOTSOCK;
5034
5035         if (req->flags & REQ_F_BUFFER_SELECT) {
5036                 kbuf = io_recv_buffer_select(req, issue_flags);
5037                 if (IS_ERR(kbuf))
5038                         return PTR_ERR(kbuf);
5039                 buf = u64_to_user_ptr(kbuf->addr);
5040         }
5041
5042         ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
5043         if (unlikely(ret))
5044                 goto out_free;
5045
5046         msg.msg_name = NULL;
5047         msg.msg_control = NULL;
5048         msg.msg_controllen = 0;
5049         msg.msg_namelen = 0;
5050         msg.msg_iocb = NULL;
5051         msg.msg_flags = 0;
5052
5053         flags = req->sr_msg.msg_flags;
5054         if (force_nonblock)
5055                 flags |= MSG_DONTWAIT;
             /* with MSG_WAITALL anything short of the full iterator is a failure */
5056         if (flags & MSG_WAITALL)
5057                 min_ret = iov_iter_count(&msg.msg_iter);
5058
5059         ret = sock_recvmsg(sock, &msg, flags);
5060         if (force_nonblock && ret == -EAGAIN)
5061                 return -EAGAIN;
             /* -ERESTARTSYS is reported to the submitter as -EINTR */
5062         if (ret == -ERESTARTSYS)
5063                 ret = -EINTR;
5064 out_free:
5065         if (req->flags & REQ_F_BUFFER_SELECTED)
5066                 cflags = io_put_recv_kbuf(req);
             /*
              * NOTE(review): the import-failure path jumps here before @flags
              * and msg.msg_flags are assigned. That is only safe if
              * import_single_range() fails with a negative value, so that
              * "ret < min_ret" is true and the rest is never evaluated.
              */
5067         if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
5068                 req_set_fail(req);
5069         __io_req_complete(req, issue_flags, ret, cflags);
5070         return 0;
5071 }
5072
/*
 * Prepare an accept request from its SQE.
 *
 * Rejects IOPOLL rings and SQEs that set ioprio, len or buf_index. Reads the
 * user address, the address-length pointer, the accept flags and the target
 * fixed-file slot, and snapshots RLIMIT_NOFILE for the later fd allocation.
 *
 * A non-zero file_slot requests installation into the fixed file table; that
 * cannot be combined with SOCK_CLOEXEC. Only SOCK_CLOEXEC and SOCK_NONBLOCK
 * are accepted as flags, and SOCK_NONBLOCK is rewritten to O_NONBLOCK on
 * configurations where the two values differ.
 *
 * Returns 0 on success, -EINVAL on any rejected input.
 */
5073 static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5074 {
5075         struct io_accept *accept = &req->accept;
5076
5077         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5078                 return -EINVAL;
5079         if (sqe->ioprio || sqe->len || sqe->buf_index)
5080                 return -EINVAL;
5081
5082         accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
5083         accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
5084         accept->flags = READ_ONCE(sqe->accept_flags);
5085         accept->nofile = rlimit(RLIMIT_NOFILE);
5086
5087         accept->file_slot = READ_ONCE(sqe->file_index);
             /*
              * Only the accept flags are meaningful here. req->open belongs to
              * open requests and is never set up for an accept, so testing
              * req->open.how.flags would act on unrelated state and could
              * reject a valid fixed-slot accept.
              */
5088         if (accept->file_slot && (accept->flags & SOCK_CLOEXEC))
5090                 return -EINVAL;
5091         if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
5092                 return -EINVAL;
5093         if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
5094                 accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
5095         return 0;
5096 }
5097
/*
 * Issue handler for accept.
 *
 * For a regular (non fixed-slot) accept an fd number is reserved before
 * do_accept() is called and released again if the accept fails. On success
 * the new file is either installed at that fd, or placed into the fixed file
 * table at index file_slot - 1.
 *
 * Returns the fd allocation error, or -EAGAIN when a nonblocking attempt
 * would block. Otherwise the request is completed (with the fd, the result
 * of io_install_fixed_file(), or the error) and 0 is returned.
 */
5098 static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
5099 {
5100         struct io_accept *accept = &req->accept;
5101         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
5102         unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
5103         bool fixed = !!accept->file_slot;
5104         struct file *file;
5105         int ret, fd;
5106
5107         if (req->file->f_flags & O_NONBLOCK)
5108                 req->flags |= REQ_F_NOWAIT;
5109
             /* @fd is only assigned and used on the !fixed paths */
5110         if (!fixed) {
5111                 fd = __get_unused_fd_flags(accept->flags, accept->nofile);
5112                 if (unlikely(fd < 0))
5113                         return fd;
5114         }
5115         file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
5116                          accept->flags);
5117         if (IS_ERR(file)) {
5118                 if (!fixed)
5119                         put_unused_fd(fd);
5120                 ret = PTR_ERR(file);
5121                 if (ret == -EAGAIN && force_nonblock)
5122                         return -EAGAIN;
                     /* -ERESTARTSYS is reported to the submitter as -EINTR */
5123                 if (ret == -ERESTARTSYS)
5124                         ret = -EINTR;
5125                 req_set_fail(req);
5126         } else if (!fixed) {
5127                 fd_install(fd, file);
5128                 ret = fd;
5129         } else {
                     /* file_slot is stored off by one so that 0 can mean "not fixed" */
5130                 ret = io_install_fixed_file(req, file, issue_flags,
5131                                             accept->file_slot - 1);
5132         }
5133         __io_req_complete(req, issue_flags, ret, 0);
5134         return 0;
5135 }
5136
/*
 * Async preparation for connect: copy the user supplied socket address into
 * the io_async_connect stored in req->async_data. Returns the result of
 * move_addr_to_kernel().
 */
5137 static int io_connect_prep_async(struct io_kiocb *req)
5138 {
5139         struct io_async_connect *async_conn = req->async_data;
5140
5141         return move_addr_to_kernel(req->connect.addr, req->connect.addr_len,
5142                                    &async_conn->address);
5143 }
5144
/*
 * Prepare a connect request from its SQE: remember the user address pointer
 * and its length. Returns -EINVAL on IOPOLL rings or when any SQE field that
 * connect does not use (ioprio, len, buf_index, rw_flags, splice_fd_in) is
 * set; 0 otherwise.
 */
5145 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5146 {
5147         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5148                 return -EINVAL;
5149         if (!(sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags ||
5150               sqe->splice_fd_in)) {
5151                 req->connect.addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
5152                 req->connect.addr_len = READ_ONCE(sqe->addr2);
5153                 return 0;
5154         }
5155         return -EINVAL;
5156 }
5159
/*
 * Issue handler for connect.
 *
 * The kernel copy of the destination address is taken from req->async_data
 * if the request already has async data, otherwise it is copied from user
 * space into the on-stack @__io.
 *
 * When a nonblocking attempt ends in -EAGAIN or -EINPROGRESS the function
 * returns -EAGAIN, first saving the on-stack address into freshly allocated
 * async data if the request had none. In all other cases, including a failed
 * address copy or a failed async data allocation (-ENOMEM), the request is
 * completed with the result and 0 is returned.
 */
5160 static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
5161 {
5162         struct io_async_connect __io, *io;
5163         unsigned file_flags;
5164         int ret;
5165         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
5166
5167         if (req_has_async_data(req)) {
5168                 io = req->async_data;
5169         } else {
5170                 ret = move_addr_to_kernel(req->connect.addr,
5171                                                 req->connect.addr_len,
5172                                                 &__io.address);
5173                 if (ret)
5174                         goto out;
5175                 io = &__io;
5176         }
5177
5178         file_flags = force_nonblock ? O_NONBLOCK : 0;
5179
5180         ret = __sys_connect_file(req->file, &io->address,
5181                                         req->connect.addr_len, file_flags);
5182         if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
                     /* address already lives in async data, nothing to save */
5183                 if (req_has_async_data(req))
5184                         return -EAGAIN;
5185                 if (io_alloc_async_data(req)) {
5186                         ret = -ENOMEM;
5187                         goto out;
5188                 }
                     /* keep the address around for the retry, @__io goes out of scope */
5189                 memcpy(req->async_data, &__io, sizeof(__io));
5190                 return -EAGAIN;
5191         }
             /* -ERESTARTSYS is reported to the submitter as -EINTR */
5192         if (ret == -ERESTARTSYS)
5193                 ret = -EINTR;
5194 out:
5195         if (ret < 0)
5196                 req_set_fail(req);
5197         __io_req_complete(req, issue_flags, ret, 0);
5198         return 0;
5199 }
5200 #else /* !CONFIG_NET */
/*
 * Stubs for the !CONFIG_NET build: every networking handler is generated
 * from the macros below and simply returns -EOPNOTSUPP.
 *
 * IO_NETOP_FN(op) defines the issue handler io_<op>().
 */
5201 #define IO_NETOP_FN(op)                                                 \
5202 static int io_##op(struct io_kiocb *req, unsigned int issue_flags)      \
5203 {                                                                       \
5204         return -EOPNOTSUPP;                                             \
5205 }
5206
/* IO_NETOP_PREP(op) defines io_<op>() plus the prep handler io_<op>_prep(). */
5207 #define IO_NETOP_PREP(op)                                               \
5208 IO_NETOP_FN(op)                                                         \
5209 static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
5210 {                                                                       \
5211         return -EOPNOTSUPP;                                             \
5212 }                                                                       \
5213
/*
 * IO_NETOP_PREP_ASYNC(op) defines everything IO_NETOP_PREP(op) does, plus
 * the async prep handler io_<op>_prep_async().
 */
5214 #define IO_NETOP_PREP_ASYNC(op)                                         \
5215 IO_NETOP_PREP(op)                                                       \
5216 static int io_##op##_prep_async(struct io_kiocb *req)                   \
5217 {                                                                       \
5218         return -EOPNOTSUPP;                                             \
5219 }
5220
/* instantiate the stubs; each opcode gets only the handlers listed for it */
5221 IO_NETOP_PREP_ASYNC(sendmsg);
5222 IO_NETOP_PREP_ASYNC(recvmsg);
5223 IO_NETOP_PREP_ASYNC(connect);
5224 IO_NETOP_PREP(accept);
5225 IO_NETOP_FN(send);
5226 IO_NETOP_FN(recv);
5227 #endif /* CONFIG_NET */
5228
/*
 * Per-arm state handed to vfs_poll() while a poll request is being set up.
 *
 * @pt:         embedded poll table; the queueing callbacks recover this
 *              structure from it with container_of()
 * @req:        request the wait queue entries are being added for
 * @nr_entries: number of times __io_queue_proc() queued an entry
 * @error:      error recorded by __io_queue_proc(), 0 if none
 */
5229 struct io_poll_table {
5230         struct poll_table_struct pt;
5231         struct io_kiocb *req;
5232         int nr_entries;
5233         int error;
5234 };
5235
/*
 * Common wakeup path for io_poll_wake() and io_async_wake().
 *
 * @req:  request that owns @poll
 * @poll: poll entry that was woken
 * @mask: events passed with the wakeup, 0 if the waker supplied none
 * @func: task_work handler to run for the request
 *
 * Returns 0 without touching anything if @mask is non-zero but contains none
 * of the events @poll waits for. Otherwise removes the wait queue entry,
 * stores @mask in req->result, schedules @func and returns 1.
 */
5236 static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
5237                            __poll_t mask, io_req_tw_func_t func)
5238 {
5239         /* for instances that support it check for an event match first: */
5240         if (mask && !(mask & poll->events))
5241                 return 0;
5242
5243         trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
5244
5245         list_del_init(&poll->wait.entry);
5246
5247         req->result = mask;
5248         req->io_task_work.func = func;
5249
5250         /*
5251          * Hand the request to io_req_task_work_add(), which is expected to
5252          * run @func later. No result is checked here, so an exiting task is
5253          * not handled at this point; io_poll_rewait(), called first by both
5254          * handlers, marks the poll as canceled when it sees PF_EXITING.
5255          */
5256         io_req_task_work_add(req);
5257         return 1;
5258 }
5259
/*
 * Decide, from task context, whether a woken poll request has to go back to
 * sleep.
 *
 * If the wakeup carried no events (req->result == 0) and the poll was not
 * canceled, the file is polled again to pick up the current state. The
 * completion lock is then taken, and if there is still nothing to report the
 * entry is put back on its wait queue.
 *
 * Returns true if the request was re-queued, false if the caller should go
 * on and finish it. ctx->completion_lock is held on return in both cases;
 * the caller must release it.
 */
5260 static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
5261         __acquires(&req->ctx->completion_lock)
5262 {
5263         struct io_ring_ctx *ctx = req->ctx;
5264
5265         /* req->task == current here, checking PF_EXITING is safe */
5266         if (unlikely(req->task->flags & PF_EXITING))
5267                 WRITE_ONCE(poll->canceled, true);
5268
5269         if (!req->result && !READ_ONCE(poll->canceled)) {
                     /* only ._key is set, so this poll registers no new wait queue entry */
5270                 struct poll_table_struct pt = { ._key = poll->events };
5271
5272                 req->result = vfs_poll(req->file, &pt) & poll->events;
5273         }
5274
5275         spin_lock(&ctx->completion_lock);
             /* re-check under the lock, cancelation may have happened meanwhile */
5276         if (!req->result && !READ_ONCE(poll->canceled)) {
5277                 add_wait_queue(poll->head, &poll->wait);
5278                 return true;
5279         }
5280
5281         return false;
5282 }
5283
/*
 * Return the secondary poll entry of @req, as stored by the queueing
 * callbacks: req->async_data for IORING_OP_POLL_ADD, apoll->double_poll for
 * every other opcode.
 */
5284 static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
5285 {
5286         if (req->opcode != IORING_OP_POLL_ADD)
5287                 return req->apoll->double_poll;
5288         /* a pure poll request keeps its second entry in ->async_data */
5289         return req->async_data;
5290 }
5291
/*
 * Return the primary poll entry of @req: the one embedded in the request for
 * IORING_OP_POLL_ADD, the one inside req->apoll for every other opcode.
 */
5292 static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
5293 {
5294         bool pure_poll = req->opcode == IORING_OP_POLL_ADD;
5295
5296         return pure_poll ? &req->poll : &req->apoll->poll;
5297 }
5298
/*
 * Detach the secondary poll entry of @req, if there is one and it is still
 * attached to a wait queue head.
 *
 * Under the wait queue head lock the entry is unlinked and its head pointer
 * cleared. The request reference is dropped only while wait.private is still
 * set; io_poll_double_wake() clears that pointer and drops the reference
 * itself.
 *
 * Must be called with ctx->completion_lock held.
 */
5299 static void io_poll_remove_double(struct io_kiocb *req)
5300         __must_hold(&req->ctx->completion_lock)
5301 {
5302         struct io_poll_iocb *poll = io_poll_get_double(req);
5303
5304         lockdep_assert_held(&req->ctx->completion_lock);
5305
5306         if (poll && poll->head) {
                     /* keep a local copy, poll->head is cleared inside the locked region */
5307                 struct wait_queue_head *head = poll->head;
5308
5309                 spin_lock_irq(&head->lock);
5310                 list_del_init(&poll->wait.entry);
5311                 if (poll->wait.private)
5312                         req_ref_put(req);
5313                 poll->head = NULL;
5314                 spin_unlock_irq(&head->lock);
5315         }
5316 }
5317
/*
 * Post the completion event for a poll request.
 *
 * The CQE result is -ECANCELED for a canceled poll, otherwise @mask run
 * through mangle_poll(). IORING_CQE_F_MORE is set on the CQE only while the
 * request stays armed. It is dropped, and the request is forced into
 * one-shot mode, when the poll was canceled or when io_cqring_fill_event()
 * reports failure.
 *
 * Returns true when the request is finished (no IORING_CQE_F_MORE), false
 * when it remains armed; in the latter case ctx->cq_extra is incremented.
 *
 * Must be called with ctx->completion_lock held.
 */
5318 static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask)
5319         __must_hold(&req->ctx->completion_lock)
5320 {
5321         struct io_ring_ctx *ctx = req->ctx;
5322         unsigned flags = IORING_CQE_F_MORE;
5323         int error;
5324
5325         if (READ_ONCE(req->poll.canceled)) {
5326                 error = -ECANCELED;
5327                 req->poll.events |= EPOLLONESHOT;
5328         } else {
5329                 error = mangle_poll(mask);
5330         }
5331         if (req->poll.events & EPOLLONESHOT)
5332                 flags = 0;
5333         if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) {
5334                 req->poll.events |= EPOLLONESHOT;
5335                 flags = 0;
5336         }
5337         if (flags & IORING_CQE_F_MORE)
5338                 ctx->cq_extra++;
5339
5340         return !(flags & IORING_CQE_F_MORE);
5341 }
5342
/*
 * task_work handler for IORING_OP_POLL_ADD requests, scheduled by
 * io_poll_wake().
 *
 * If io_poll_rewait() put the request back to sleep there is nothing more to
 * do. Otherwise, unless the poll is already marked done, a completion event
 * is posted. A finished request is removed from the cancel hash, loses its
 * secondary poll entry and has a reference dropped; a request that stays
 * armed has its result reset and is put back on its wait queue.
 *
 * @locked is only passed on to io_req_task_submit() for the next linked
 * request, if dropping the reference produced one.
 */
5343 static void io_poll_task_func(struct io_kiocb *req, bool *locked)
5344 {
5345         struct io_ring_ctx *ctx = req->ctx;
5346         struct io_kiocb *nxt;
5347
             /* io_poll_rewait() returns with completion_lock held either way */
5348         if (io_poll_rewait(req, &req->poll)) {
5349                 spin_unlock(&ctx->completion_lock);
5350         } else {
5351                 bool done;
5352
5353                 if (req->poll.done) {
5354                         spin_unlock(&ctx->completion_lock);
5355                         return;
5356                 }
5357                 done = __io_poll_complete(req, req->result);
5358                 if (done) {
5359                         io_poll_remove_double(req);
5360                         hash_del(&req->hash_node);
5361                         req->poll.done = true;
5362                 } else {
                             /* still armed: wait for the next event */
5363                         req->result = 0;
5364                         add_wait_queue(req->poll.head, &req->poll.wait);
5365                 }
5366                 io_commit_cqring(ctx);
5367                 spin_unlock(&ctx->completion_lock);
5368                 io_cqring_ev_posted(ctx);
5369
5370                 if (done) {
5371                         nxt = io_put_req_find_next(req);
5372                         if (nxt)
5373                                 io_req_task_submit(nxt, locked);
5374                 }
5375         }
5376 }
5377
/*
 * Wakeup callback for the secondary poll entry that __io_queue_proc() sets
 * up when a file uses a second wait queue.
 *
 * Returns 0 if the wakeup carries events the request is not waiting for. If
 * the primary entry is not one-shot, the wakeup is simply forwarded to the
 * primary entry's wake function and its result returned.
 *
 * Otherwise this entry is unlinked. If the primary entry is attached to a
 * wait queue head, it is checked under that head's lock: when it is still
 * queued it is unlinked and its wake function is invoked after the lock is
 * released. wait->private is cleared under the same lock. Finally the request
 * reference held for this entry is dropped and 1 is returned.
 */
5378 static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
5379                                int sync, void *key)
5380 {
5381         struct io_kiocb *req = wait->private;
5382         struct io_poll_iocb *poll = io_poll_get_single(req);
5383         __poll_t mask = key_to_poll(key);
5384         unsigned long flags;
5385
5386         /* for instances that support it check for an event match first: */
5387         if (mask && !(mask & poll->events))
5388                 return 0;
5389         if (!(poll->events & EPOLLONESHOT))
5390                 return poll->wait.func(&poll->wait, mode, sync, key);
5391
5392         list_del_init(&wait->entry);
5393
5394         if (poll->head) {
5395                 bool done;
5396
5397                 spin_lock_irqsave(&poll->head->lock, flags);
                     /* "done" means the primary entry was already off its wait queue */
5398                 done = list_empty(&poll->wait.entry);
5399                 if (!done)
5400                         list_del_init(&poll->wait.entry);
5401                 /* make sure double remove sees this as being gone */
5402                 wait->private = NULL;
5403                 spin_unlock_irqrestore(&poll->head->lock, flags);
5404                 if (!done) {
5405                         /* use wait func handler, so it matches the rq type */
5406                         poll->wait.func(&poll->wait, mode, sync, key);
5407                 }
5408         }
5409         req_ref_put(req);
5410         return 1;
5411 }
5412
/*
 * Reset @poll to its unarmed state and set up its wait queue entry.
 *
 * @poll:      poll entry to initialise
 * @events:    events requested by the caller; the IO_POLL_UNMASK set
 *             (EPOLLERR, EPOLLHUP, EPOLLNVAL, EPOLLRDHUP) is always added
 * @wake_func: callback installed on the wait queue entry
 *
 * poll->file and poll->wait.private are not touched here.
 */
5413 static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
5414                               wait_queue_func_t wake_func)
5415 {
5416         poll->head = NULL;
5417         poll->done = false;
5418         poll->canceled = false;
5419 #define IO_POLL_UNMASK  (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
5420         /* mask in events that we always want/need */
5421         poll->events = events | IO_POLL_UNMASK;
5422         INIT_LIST_HEAD(&poll->wait.entry);
5423         init_waitqueue_func_entry(&poll->wait, wake_func);
5424 }
5425
/*
 * Add a wait queue entry for pt->req to @head. Shared by the queueing
 * callbacks io_poll_queue_proc() and io_async_queue_proc().
 *
 * @poll:     primary poll entry of the request
 * @pt:       arm state; nr_entries counts entries queued so far, error
 *            receives -EINVAL or -ENOMEM on failure
 * @head:     wait queue head supplied by the file being polled
 * @poll_ptr: where the secondary poll entry is stored once allocated
 *
 * The first call queues @poll itself. A later call for a different head
 * allocates a secondary entry that uses io_poll_double_wake() and holds its
 * own request reference; the primary entry is forced into one-shot mode at
 * that point. A repeated head is ignored, and a third distinct head fails
 * with -EINVAL.
 */
5426 static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
5427                             struct wait_queue_head *head,
5428                             struct io_poll_iocb **poll_ptr)
5429 {
5430         struct io_kiocb *req = pt->req;
5431
5432         /*
5433          * The file being polled uses multiple waitqueues for poll handling
5434          * (e.g. one for read, one for write). Setup a separate io_poll_iocb
5435          * if this happens.
5436          */
5437         if (unlikely(pt->nr_entries)) {
5438                 struct io_poll_iocb *poll_one = poll;
5439
5440                 /* double add on the same waitqueue head, ignore */
5441                 if (poll_one->head == head)
5442                         return;
5443                 /* already have a 2nd entry, fail a third attempt */
5444                 if (*poll_ptr) {
5445                         if ((*poll_ptr)->head == head)
5446                                 return;
5447                         pt->error = -EINVAL;
5448                         return;
5449                 }
5450                 /*
5451                  * Can't handle multishot for double wait for now, turn it
5452                  * into one-shot mode.
5453                  */
5454                 if (!(poll_one->events & EPOLLONESHOT))
5455                         poll_one->events |= EPOLLONESHOT;
5456                 poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
5457                 if (!poll) {
5458                         pt->error = -ENOMEM;
5459                         return;
5460                 }
5461                 io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
                     /* reference dropped by io_poll_double_wake() or io_poll_remove_double() */
5462                 req_ref_get(req);
5463                 poll->wait.private = req;
5464
5465                 *poll_ptr = poll;
                     /* for POLL_ADD, *poll_ptr is req->async_data (see io_poll_queue_proc()) */
5466                 if (req->opcode == IORING_OP_POLL_ADD)
5467                         req->flags |= REQ_F_ASYNC_DATA;
5468         }
5469
5470         pt->nr_entries++;
5471         poll->head = head;
5472
5473         if (poll->events & EPOLLEXCLUSIVE)
5474                 add_wait_queue_exclusive(head, &poll->wait);
5475         else
5476                 add_wait_queue(head, &poll->wait);
5477 }
5478
/*
 * Queueing callback used while arming an internal (async) poll: queue the
 * entries that live in req->apoll. @file is unused.
 */
5479 static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
5480                                struct poll_table_struct *p)
5481 {
5482         struct io_poll_table *ipt = container_of(p, struct io_poll_table, pt);
5483         struct io_kiocb *req = ipt->req;
5484
5485         __io_queue_proc(&req->apoll->poll, ipt, head, &req->apoll->double_poll);
5486 }
5487
/*
 * task_work handler for internally armed (async) polls, scheduled by
 * io_async_wake().
 *
 * If io_poll_rewait() put the request back to sleep nothing else happens.
 * Otherwise the request is taken out of the cancel hash, its secondary poll
 * entry is removed and the poll is marked done. The request is then
 * submitted through io_req_task_submit(), or failed with -ECANCELED if the
 * poll was canceled.
 */
5488 static void io_async_task_func(struct io_kiocb *req, bool *locked)
5489 {
5490         struct async_poll *apoll = req->apoll;
5491         struct io_ring_ctx *ctx = req->ctx;
5492
5493         trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data);
5494
             /* io_poll_rewait() returns with completion_lock held either way */
5495         if (io_poll_rewait(req, &apoll->poll)) {
5496                 spin_unlock(&ctx->completion_lock);
5497                 return;
5498         }
5499
5500         hash_del(&req->hash_node);
5501         io_poll_remove_double(req);
5502         apoll->poll.done = true;
5503         spin_unlock(&ctx->completion_lock);
5504
5505         if (!READ_ONCE(apoll->poll.canceled))
5506                 io_req_task_submit(req, locked);
5507         else
5508                 io_req_complete_failed(req, -ECANCELED);
5509 }
5510
/*
 * Wait queue callback for the primary entry of an internally armed (async)
 * poll: trace the wakeup and pass it to __io_async_wake() with
 * io_async_task_func() as the handler. @mode and @sync are unused.
 */
5511 static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5512                         void *key)
5513 {
5514         struct io_kiocb *req = wait->private;
5515         __poll_t mask = key_to_poll(key);
5516
5517         trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data, mask);
5518
5519         return __io_async_wake(req, &req->apoll->poll, mask,
5520                                io_async_task_func);
5521 }
5522
/*
 * Add @req to the ring's cancel hash, in the bucket selected by hashing
 * req->user_data, so that io_poll_find() can look it up again.
 */
5523 static void io_poll_req_insert(struct io_kiocb *req)
5524 {
5525         struct io_ring_ctx *ctx = req->ctx;
5526         unsigned int bucket = hash_long(req->user_data, ctx->cancel_hash_bits);
5527
5528         hlist_add_head(&req->hash_node, &ctx->cancel_hash[bucket]);
5529 }
5531
/*
 * Arm @poll for @req. Shared by io_poll_add() and io_arm_poll_handler().
 *
 * @req:       request to arm
 * @poll:      primary poll entry to use
 * @ipt:       arm state; the caller sets ipt->pt._qproc, everything else is
 *             initialised here. ipt->error carries the arm error on return.
 * @mask:      events to wait for
 * @wake_func: wait queue callback for the primary entry
 *
 * vfs_poll() is called with the poll table, which queues the wait entries
 * through the caller's _qproc. If nothing was queued and no error was
 * recorded, the arm fails with -EINVAL.
 *
 * Returns the events that are already pending (masked by poll->events), or 0
 * if the request is now waiting or the arm failed. ctx->completion_lock is
 * held on return; the caller must release it.
 */
5532 static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
5533                                       struct io_poll_iocb *poll,
5534                                       struct io_poll_table *ipt, __poll_t mask,
5535                                       wait_queue_func_t wake_func)
5536         __acquires(&ctx->completion_lock)
5537 {
5538         struct io_ring_ctx *ctx = req->ctx;
5539         bool cancel = false;
5540
5541         INIT_HLIST_NODE(&req->hash_node);
5542         io_init_poll_iocb(poll, mask, wake_func);
5543         poll->file = req->file;
5544         poll->wait.private = req;
5545
5546         ipt->pt._key = mask;
5547         ipt->req = req;
5548         ipt->error = 0;
5549         ipt->nr_entries = 0;
5550
5551         mask = vfs_poll(req->file, &ipt->pt) & poll->events;
5552         if (unlikely(!ipt->nr_entries) && !ipt->error)
5553                 ipt->error = -EINVAL;
5554
5555         spin_lock(&ctx->completion_lock);
             /* failed, or a one-shot request is already satisfied: drop the second entry */
5556         if (ipt->error || (mask && (poll->events & EPOLLONESHOT)))
5557                 io_poll_remove_double(req);
5558         if (likely(poll->head)) {
5559                 spin_lock_irq(&poll->head->lock);
                     /*
                      * An empty entry here means it was already taken off the
                      * wait queue (the wake path does list_del_init()), so the
                      * request is left to that path: clear mask and error, and
                      * only remember a pending error as a cancelation.
                      */
5560                 if (unlikely(list_empty(&poll->wait.entry))) {
5561                         if (ipt->error)
5562                                 cancel = true;
5563                         ipt->error = 0;
5564                         mask = 0;
5565                 }
5566                 if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error)
5567                         list_del_init(&poll->wait.entry);
5568                 else if (cancel)
5569                         WRITE_ONCE(poll->canceled, true);
5570                 else if (!poll->done) /* actually waiting for an event */
5571                         io_poll_req_insert(req);
5572                 spin_unlock_irq(&poll->head->lock);
5573         }
5574
5575         return mask;
5576 }
5577
/*
 * Return values of io_arm_poll_handler():
 *
 * IO_APOLL_OK:      the poll was armed and the request is now waiting
 * IO_APOLL_ABORTED: the poll could not be armed
 * IO_APOLL_READY:   requested events were already pending while arming
 */
5578 enum {
5579         IO_APOLL_OK,
5580         IO_APOLL_ABORTED,
5581         IO_APOLL_READY
5582 };
5583
/*
 * Arm an internal poll for @req so that it can be retried once its file
 * signals readiness.
 *
 * Returns IO_APOLL_ABORTED if the request has no pollable file, was already
 * polled once (REQ_F_POLLED), belongs to an opcode with neither pollin nor
 * pollout set, if the async_poll allocation fails, or if arming reports an
 * error. Returns IO_APOLL_READY if events were already pending, and
 * IO_APOLL_OK once the request is waiting.
 *
 * The poll is always one-shot and always includes POLLERR and POLLPRI. Input
 * opcodes add POLLIN | POLLRDNORM, all others POLLOUT | POLLWRNORM.
 */
5584 static int io_arm_poll_handler(struct io_kiocb *req)
5585 {
5586         const struct io_op_def *def = &io_op_defs[req->opcode];
5587         struct io_ring_ctx *ctx = req->ctx;
5588         struct async_poll *apoll;
5589         struct io_poll_table ipt;
5590         __poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI;
5591
5592         if (!req->file || !file_can_poll(req->file))
5593                 return IO_APOLL_ABORTED;
5594         if (req->flags & REQ_F_POLLED)
5595                 return IO_APOLL_ABORTED;
5596         if (!def->pollin && !def->pollout)
5597                 return IO_APOLL_ABORTED;
5598
5599         if (def->pollin) {
5600                 mask |= POLLIN | POLLRDNORM;
5601
5602                 /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
5603                 if ((req->opcode == IORING_OP_RECVMSG) &&
5604                     (req->sr_msg.msg_flags & MSG_ERRQUEUE))
5605                         mask &= ~POLLIN;
5606         } else {
5607                 mask |= POLLOUT | POLLWRNORM;
5608         }
5609
5610         apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
5611         if (unlikely(!apoll))
5612                 return IO_APOLL_ABORTED;
5613         apoll->double_poll = NULL;
5614         req->apoll = apoll;
5615         req->flags |= REQ_F_POLLED;
5616         ipt.pt._qproc = io_async_queue_proc;
5617         io_req_set_refcount(req);
5618
             /* returns with completion_lock held, released right below */
5619         ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
5620                                         io_async_wake);
5621         spin_unlock(&ctx->completion_lock);
5622         if (ret || ipt.error)
5623                 return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
5624
5625         trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data,
5626                                 mask, apoll->poll.events);
5627         return IO_APOLL_OK;
5628 }
5629
/*
 * Take the primary poll entry @poll of @req off its wait queue.
 *
 * @do_cancel: also mark the poll as canceled
 *
 * Returns false straight away if @poll is not attached to a wait queue head
 * (the cancel hash is left alone in that case). Otherwise the request is
 * removed from the cancel hash, and the return value says whether the entry
 * was still queued, i.e. whether this caller is the one that detached it.
 *
 * Must be called with ctx->completion_lock held.
 */
5630 static bool __io_poll_remove_one(struct io_kiocb *req,
5631                                  struct io_poll_iocb *poll, bool do_cancel)
5632         __must_hold(&req->ctx->completion_lock)
5633 {
5634         bool do_complete = false;
5635
5636         if (!poll->head)
5637                 return false;
5638         spin_lock_irq(&poll->head->lock);
5639         if (do_cancel)
5640                 WRITE_ONCE(poll->canceled, true);
5641         if (!list_empty(&poll->wait.entry)) {
5642                 list_del_init(&poll->wait.entry);
5643                 do_complete = true;
5644         }
5645         spin_unlock_irq(&poll->head->lock);
5646         hash_del(&req->hash_node);
5647         return do_complete;
5648 }
5649
/*
 * Cancel one poll request: detach both of its poll entries and, if the
 * primary entry was still queued, post an -ECANCELED completion, mark the
 * request failed and drop a reference through io_put_req_deferred().
 *
 * Returns true if a completion was posted. Must be called with
 * ctx->completion_lock held.
 */
5650 static bool io_poll_remove_one(struct io_kiocb *req)
5651         __must_hold(&req->ctx->completion_lock)
5652 {
5653         struct io_ring_ctx *ctx = req->ctx;
5654
5655         io_poll_remove_double(req);
5656         if (!__io_poll_remove_one(req, io_poll_get_single(req), true))
5657                 return false;
5658
5659         io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0);
5660         io_commit_cqring(ctx);
5661         req_set_fail(req);
5662         io_put_req_deferred(req);
5663         return true;
5664 }
5666
5667 /*
5668  * Walk the whole cancel hash under ctx->completion_lock and cancel every
5669  * poll request accepted by io_match_task() for @tsk / @cancel_all.
5670  * Returns true if we found and killed one or more poll requests.
5671  */
5672 static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,
5673                                       struct task_struct *tsk, bool cancel_all)
5674 {
5675         struct hlist_node *next;
5676         struct io_kiocb *req;
5677         bool killed = false;
5678         unsigned int bucket;
5679
5680         spin_lock(&ctx->completion_lock);
5681         for (bucket = 0; bucket < (1U << ctx->cancel_hash_bits); bucket++) {
5682                 hlist_for_each_entry_safe(req, next, &ctx->cancel_hash[bucket],
5683                                           hash_node) {
5684                         if (io_match_task(req, tsk, cancel_all))
5685                                 killed |= io_poll_remove_one(req);
5686                 }
5687         }
5688         spin_unlock(&ctx->completion_lock);
5689
5690         /* wake up waiters only if something was actually posted */
5691         if (killed)
5692                 io_cqring_ev_posted(ctx);
5693         return killed;
5694 }
5694
/*
 * Look up a hashed poll request by user_data.
 *
 * @sqe_addr:  user_data value to match
 * @poll_only: if true, only IORING_OP_POLL_ADD requests are considered
 *
 * Returns the first matching request in the bucket, or NULL. Must be called
 * with ctx->completion_lock held.
 */
5695 static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
5696                                      bool poll_only)
5697         __must_hold(&ctx->completion_lock)
5698 {
5699         unsigned int bucket = hash_long(sqe_addr, ctx->cancel_hash_bits);
5700         struct io_kiocb *candidate;
5701
5702         hlist_for_each_entry(candidate, &ctx->cancel_hash[bucket], hash_node) {
5703                 bool wrong_type = poll_only &&
5704                                   candidate->opcode != IORING_OP_POLL_ADD;
5705
5706                 if (candidate->user_data == sqe_addr && !wrong_type)
5707                         return candidate;
5708         }
5709         return NULL;
5710 }
5712
/*
 * Cancel the poll request whose user_data equals @sqe_addr.
 *
 * Returns -ENOENT if io_poll_find() finds no such request, 0 if it was
 * canceled here, and -EALREADY if io_poll_remove_one() reports that there
 * was nothing left to cancel. Must be called with ctx->completion_lock held.
 */
5713 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
5714                           bool poll_only)
5715         __must_hold(&ctx->completion_lock)
5716 {
5717         struct io_kiocb *victim = io_poll_find(ctx, sqe_addr, poll_only);
5718
5719         if (!victim)
5720                 return -ENOENT;
5721
5722         return io_poll_remove_one(victim) ? 0 : -EALREADY;
5723 }
5727
5728 static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
5729                                      unsigned int flags)
5730 {
5731         u32 events;
5732
5733         events = READ_ONCE(sqe->poll32_events);
5734 #ifdef __BIG_ENDIAN
5735         events = swahw32(events);
5736 #endif
5737         if (!(flags & IORING_POLL_ADD_MULTI))
5738                 events |= EPOLLONESHOT;
5739         return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
5740 }
5741
/*
 * Prepare a poll update request from its SQE.
 *
 * sqe->len carries the update flags, sqe->addr the user_data of the poll
 * request to act on, sqe->off the replacement user_data and
 * sqe->poll32_events the replacement events.
 *
 * Returns -EINVAL on IOPOLL rings, when ioprio, buf_index or splice_fd_in is
 * set, on unknown flags, when IORING_POLL_ADD_MULTI is the only flag, when a
 * new user_data is given without IORING_POLL_UPDATE_USER_DATA, or when
 * events are given without IORING_POLL_UPDATE_EVENTS. Returns 0 otherwise.
 */
5742 static int io_poll_update_prep(struct io_kiocb *req,
5743                                const struct io_uring_sqe *sqe)
5744 {
5745         struct io_poll_update *upd = &req->poll_update;
5746         u32 flags;
5747
5748         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5749                 return -EINVAL;
5750         if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
5751                 return -EINVAL;
5752         flags = READ_ONCE(sqe->len);
5753         if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
5754                       IORING_POLL_ADD_MULTI))
5755                 return -EINVAL;
5756         /* meaningless without update */
5757         if (flags == IORING_POLL_ADD_MULTI)
5758                 return -EINVAL;
5759
5760         upd->old_user_data = READ_ONCE(sqe->addr);
5761         upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
5762         upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
5763
5764         upd->new_user_data = READ_ONCE(sqe->off);
             /* a replacement value is only allowed together with its update flag */
5765         if (!upd->update_user_data && upd->new_user_data)
5766                 return -EINVAL;
5767         if (upd->update_events)
5768                 upd->events = io_poll_parse_events(sqe, flags);
5769         else if (sqe->poll32_events)
5770                 return -EINVAL;
5771
5772         return 0;
5773 }
5774
/*
 * Wait queue callback for the primary entry of an IORING_OP_POLL_ADD
 * request: pass the wakeup to __io_async_wake() with io_poll_task_func() as
 * the handler. @mode and @sync are unused.
 */
5775 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5776                         void *key)
5777 {
5778         struct io_kiocb *req = wait->private;
5779         __poll_t mask = key_to_poll(key);
5780
5781         return __io_async_wake(req, &req->poll, mask, io_poll_task_func);
5782 }
5783
/*
 * Queueing callback used while arming an IORING_OP_POLL_ADD request: the
 * primary entry is req->poll, and a secondary entry, if one is needed, is
 * stored through req->async_data. @file is unused.
 */
5784 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
5785                                struct poll_table_struct *p)
5786 {
5787         struct io_poll_table *ipt = container_of(p, struct io_poll_table, pt);
5788
5789         __io_queue_proc(&ipt->req->poll, ipt, head, (struct io_poll_iocb **)&ipt->req->async_data);
5790 }
5791
/*
 * Prepare an IORING_OP_POLL_ADD request from its SQE.
 *
 * sqe->len carries the poll flags; IORING_POLL_ADD_MULTI is the only one
 * accepted. Returns -EINVAL on IOPOLL rings, when ioprio, buf_index, off or
 * addr is set, or on any other flag. On success the request gets a refcount
 * and the parsed event mask is stored in req->poll.events.
 */
5792 static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5793 {
5794         u32 poll_flags;
5795
5796         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5797                 return -EINVAL;
5798         if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
5799                 return -EINVAL;
5800
5801         poll_flags = READ_ONCE(sqe->len);
5802         if ((poll_flags & IORING_POLL_ADD_MULTI) != poll_flags)
5803                 return -EINVAL;
5804
5805         io_req_set_refcount(req);
5806         req->poll.events = io_poll_parse_events(sqe, poll_flags);
5807         return 0;
5808 }
5809
/*
 * Issue handler for IORING_OP_POLL_ADD.
 *
 * Arms the poll with the events stored at prep time. If events are already
 * pending, a completion is posted right away, and the request reference is
 * dropped when __io_poll_complete() says the request is finished.
 *
 * Returns the arm error left in ipt.error, which is 0 when the request was
 * armed or completed immediately.
 */
5810 static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
5811 {
5812         struct io_poll_iocb *poll = &req->poll;
5813         struct io_ring_ctx *ctx = req->ctx;
5814         struct io_poll_table ipt;
5815         __poll_t mask;
             /* only assigned, and only read, when @mask is non-zero */
5816         bool done;
5817
5818         ipt.pt._qproc = io_poll_queue_proc;
5819
             /* returns with completion_lock held, released below */
5820         mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
5821                                         io_poll_wake);
5822
5823         if (mask) { /* no async, we'd stolen it */
5824                 ipt.error = 0;
5825                 done = __io_poll_complete(req, mask);
5826                 io_commit_cqring(req->ctx);
5827         }
5828         spin_unlock(&ctx->completion_lock);
5829
5830         if (mask) {
5831                 io_cqring_ev_posted(ctx);
5832                 if (done)
5833                         io_put_req(req);
5834         }
5835         return ipt.error;
5836 }
5837
/*
 * Issue handler for IORING_OP_POLL_REMOVE: remove or update an existing
 * poll request identified by req->poll_update.old_user_data.
 *
 * - No matching request: the update request completes with -ENOENT.
 * - Neither update_events nor update_user_data set: plain removal through
 *   io_poll_remove_one(); a zero return from it is reported as -EALREADY.
 * - Otherwise the target is detached, its event mask and/or user_data are
 *   rewritten, the update request is completed, and the target is re-armed
 *   with io_poll_add() unless it was racing with completion.
 *
 * Always returns 0; the outcome is reported through the completion of @req.
 */
static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_kiocb *preq;
	bool completing;
	int ret;

	spin_lock(&ctx->completion_lock);
	preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
	if (!preq) {
		ret = -ENOENT;
		goto err;
	}

	if (!req->poll_update.update_events && !req->poll_update.update_user_data) {
		/* pure removal: never re-add the target below */
		completing = true;
		ret = io_poll_remove_one(preq) ? 0 : -EALREADY;
		goto err;
	}

	/*
	 * Don't allow racy completion with singleshot, as we cannot safely
	 * update those. For multishot, if we're racing with completion, just
	 * let completion re-add it.
	 */
	completing = !__io_poll_remove_one(preq, &preq->poll, false);
	if (completing && (preq->poll.events & EPOLLONESHOT)) {
		ret = -EALREADY;
		goto err;
	}
	/* we now have a detached poll request. reissue. */
	ret = 0;
err:
	/* despite its name, this label is also reached on the success paths */
	if (ret < 0) {
		spin_unlock(&ctx->completion_lock);
		req_set_fail(req);
		io_req_complete(req, ret);
		return 0;
	}
	/* only mask one event flags, keep behavior flags */
	if (req->poll_update.update_events) {
		preq->poll.events &= ~0xffff;
		preq->poll.events |= req->poll_update.events & 0xffff;
		preq->poll.events |= IO_POLL_UNMASK;
	}
	if (req->poll_update.update_user_data)
		preq->user_data = req->poll_update.new_user_data;
	spin_unlock(&ctx->completion_lock);

	/* complete update request, we're done with it */
	io_req_complete(req, ret);

	if (!completing) {
		/* re-arm the detached target; a failure completes the target itself */
		ret = io_poll_add(preq, issue_flags);
		if (ret < 0) {
			req_set_fail(preq);
			io_req_complete(preq, ret);
		}
	}
	return 0;
}
5899
5900 static void io_req_task_timeout(struct io_kiocb *req, bool *locked)
5901 {
5902         struct io_timeout_data *data = req->async_data;
5903
5904         if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
5905                 req_set_fail(req);
5906         io_req_complete_post(req, -ETIME, 0);
5907 }
5908
5909 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
5910 {
5911         struct io_timeout_data *data = container_of(timer,
5912                                                 struct io_timeout_data, timer);
5913         struct io_kiocb *req = data->req;
5914         struct io_ring_ctx *ctx = req->ctx;
5915         unsigned long flags;
5916
5917         spin_lock_irqsave(&ctx->timeout_lock, flags);
5918         list_del_init(&req->timeout.list);
5919         atomic_set(&req->ctx->cq_timeouts,
5920                 atomic_read(&req->ctx->cq_timeouts) + 1);
5921         spin_unlock_irqrestore(&ctx->timeout_lock, flags);
5922
5923         req->io_task_work.func = io_req_task_timeout;
5924         io_req_task_work_add(req);
5925         return HRTIMER_NORESTART;
5926 }
5927
5928 static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
5929                                            __u64 user_data)
5930         __must_hold(&ctx->timeout_lock)
5931 {
5932         struct io_timeout_data *io;
5933         struct io_kiocb *req;
5934         bool found = false;
5935
5936         list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
5937                 found = user_data == req->user_data;
5938                 if (found)
5939                         break;
5940         }
5941         if (!found)
5942                 return ERR_PTR(-ENOENT);
5943
5944         io = req->async_data;
5945         if (hrtimer_try_to_cancel(&io->timer) == -1)
5946                 return ERR_PTR(-EALREADY);
5947         list_del_init(&req->timeout.list);
5948         return req;
5949 }
5950
5951 static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
5952         __must_hold(&ctx->completion_lock)
5953         __must_hold(&ctx->timeout_lock)
5954 {
5955         struct io_kiocb *req = io_timeout_extract(ctx, user_data);
5956
5957         if (IS_ERR(req))
5958                 return PTR_ERR(req);
5959
5960         req_set_fail(req);
5961         io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0);
5962         io_put_req_deferred(req);
5963         return 0;
5964 }
5965
5966 static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
5967 {
5968         switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
5969         case IORING_TIMEOUT_BOOTTIME:
5970                 return CLOCK_BOOTTIME;
5971         case IORING_TIMEOUT_REALTIME:
5972                 return CLOCK_REALTIME;
5973         default:
5974                 /* can't happen, vetted at prep time */
5975                 WARN_ON_ONCE(1);
5976                 fallthrough;
5977         case 0:
5978                 return CLOCK_MONOTONIC;
5979         }
5980 }
5981
5982 static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
5983                                     struct timespec64 *ts, enum hrtimer_mode mode)
5984         __must_hold(&ctx->timeout_lock)
5985 {
5986         struct io_timeout_data *io;
5987         struct io_kiocb *req;
5988         bool found = false;
5989
5990         list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) {
5991                 found = user_data == req->user_data;
5992                 if (found)
5993                         break;
5994         }
5995         if (!found)
5996                 return -ENOENT;
5997
5998         io = req->async_data;
5999         if (hrtimer_try_to_cancel(&io->timer) == -1)
6000                 return -EALREADY;
6001         hrtimer_init(&io->timer, io_timeout_get_clock(io), mode);
6002         io->timer.function = io_link_timeout_fn;
6003         hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
6004         return 0;
6005 }
6006
6007 static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
6008                              struct timespec64 *ts, enum hrtimer_mode mode)
6009         __must_hold(&ctx->timeout_lock)
6010 {
6011         struct io_kiocb *req = io_timeout_extract(ctx, user_data);
6012         struct io_timeout_data *data;
6013
6014         if (IS_ERR(req))
6015                 return PTR_ERR(req);
6016
6017         req->timeout.off = 0; /* noseq */
6018         data = req->async_data;
6019         list_add_tail(&req->timeout.list, &ctx->timeout_list);
6020         hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
6021         data->timer.function = io_timeout_fn;
6022         hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
6023         return 0;
6024 }
6025
6026 static int io_timeout_remove_prep(struct io_kiocb *req,
6027                                   const struct io_uring_sqe *sqe)
6028 {
6029         struct io_timeout_rem *tr = &req->timeout_rem;
6030
6031         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
6032                 return -EINVAL;
6033         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6034                 return -EINVAL;
6035         if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in)
6036                 return -EINVAL;
6037
6038         tr->ltimeout = false;
6039         tr->addr = READ_ONCE(sqe->addr);
6040         tr->flags = READ_ONCE(sqe->timeout_flags);
6041         if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) {
6042                 if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
6043                         return -EINVAL;
6044                 if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
6045                         tr->ltimeout = true;
6046                 if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
6047                         return -EINVAL;
6048                 if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
6049                         return -EFAULT;
6050         } else if (tr->flags) {
6051                 /* timeout removal doesn't support flags */
6052                 return -EINVAL;
6053         }
6054
6055         return 0;
6056 }
6057
6058 static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
6059 {
6060         return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
6061                                             : HRTIMER_MODE_REL;
6062 }
6063
6064 /*
6065  * Remove or update an existing timeout command
6066  */
6067 static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
6068 {
6069         struct io_timeout_rem *tr = &req->timeout_rem;
6070         struct io_ring_ctx *ctx = req->ctx;
6071         int ret;
6072
6073         if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) {
6074                 spin_lock(&ctx->completion_lock);
6075                 spin_lock_irq(&ctx->timeout_lock);
6076                 ret = io_timeout_cancel(ctx, tr->addr);
6077                 spin_unlock_irq(&ctx->timeout_lock);
6078                 spin_unlock(&ctx->completion_lock);
6079         } else {
6080                 enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);
6081
6082                 spin_lock_irq(&ctx->timeout_lock);
6083                 if (tr->ltimeout)
6084                         ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode);
6085                 else
6086                         ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
6087                 spin_unlock_irq(&ctx->timeout_lock);
6088         }
6089
6090         if (ret < 0)
6091                 req_set_fail(req);
6092         io_req_complete_post(req, ret, 0);
6093         return 0;
6094 }
6095
6096 static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
6097                            bool is_timeout_link)
6098 {
6099         struct io_timeout_data *data;
6100         unsigned flags;
6101         u32 off = READ_ONCE(sqe->off);
6102
6103         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
6104                 return -EINVAL;
6105         if (sqe->ioprio || sqe->buf_index || sqe->len != 1 ||
6106             sqe->splice_fd_in)
6107                 return -EINVAL;
6108         if (off && is_timeout_link)
6109                 return -EINVAL;
6110         flags = READ_ONCE(sqe->timeout_flags);
6111         if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
6112                       IORING_TIMEOUT_ETIME_SUCCESS))
6113                 return -EINVAL;
6114         /* more than one clock specified is invalid, obviously */
6115         if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
6116                 return -EINVAL;
6117
6118         INIT_LIST_HEAD(&req->timeout.list);
6119         req->timeout.off = off;
6120         if (unlikely(off && !req->ctx->off_timeout_used))
6121                 req->ctx->off_timeout_used = true;
6122
6123         if (!req_has_async_data(req) && io_alloc_async_data(req))
6124                 return -ENOMEM;
6125
6126         data = req->async_data;
6127         data->req = req;
6128         data->flags = flags;
6129
6130         if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
6131                 return -EFAULT;
6132
6133         data->mode = io_translate_timeout_mode(flags);
6134         hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
6135
6136         if (is_timeout_link) {
6137                 struct io_submit_link *link = &req->ctx->submit_state.link;
6138
6139                 if (!link->head)
6140                         return -EINVAL;
6141                 if (link->last->opcode == IORING_OP_LINK_TIMEOUT)
6142                         return -EINVAL;
6143                 req->timeout.head = link->last;
6144                 link->last->flags |= REQ_F_ARM_LTIMEOUT;
6145         }
6146         return 0;
6147 }
6148
/*
 * Issue handler for IORING_OP_TIMEOUT: queue the request on
 * ctx->timeout_list and start its hrtimer, all under ctx->timeout_lock.
 *
 * Requests for which io_is_timeout_noseq() is true are appended at the
 * tail. Sequenced requests get a target_seq of (current tail + off) and
 * are inserted so that the list stays ordered by how soon each target is
 * reached. Always returns 0.
 */
static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_timeout_data *data = req->async_data;
	struct list_head *entry;
	u32 tail, off = req->timeout.off;

	spin_lock_irq(&ctx->timeout_lock);

	/*
	 * sqe->off holds how many events that need to occur for this
	 * timeout event to be satisfied. If it isn't set, then this is
	 * a pure timeout request, sequence isn't used.
	 */
	if (io_is_timeout_noseq(req)) {
		entry = ctx->timeout_list.prev;
		goto add;
	}

	/* CQ tail with timeout completions discounted */
	tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
	req->timeout.target_seq = tail + off;

	/* Update the last seq here in case io_flush_timeouts() hasn't.
	 * This is safe because ->completion_lock is held, and submissions
	 * and completions are never mixed in the same ->completion_lock section.
	 *
	 * NOTE(review): this function only takes ->timeout_lock; the
	 * reference to ->completion_lock above may be stale - confirm.
	 */
	ctx->cq_last_tm_flush = tail;

	/*
	 * Insertion sort, ensuring the first entry in the list is always
	 * the one we need first.
	 */
	list_for_each_prev(entry, &ctx->timeout_list) {
		struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
						  timeout.list);

		if (io_is_timeout_noseq(nxt))
			continue;
		/* nxt.seq is behind @tail, otherwise would've been completed */
		if (off >= nxt->timeout.target_seq - tail)
			break;
	}
add:
	/* insert right after 'entry' (the list head itself if the scan ran out) */
	list_add(&req->timeout.list, entry);
	data->timer.function = io_timeout_fn;
	hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
	spin_unlock_irq(&ctx->timeout_lock);
	return 0;
}
6198
/* Match criteria handed to io_cancel_cb() when cancelling io-wq work. */
struct io_cancel_data {
	struct io_ring_ctx *ctx;	/* ring a matching request must belong to */
	u64 user_data;			/* user_data a matching request must carry */
};
6203
6204 static bool io_cancel_cb(struct io_wq_work *work, void *data)
6205 {
6206         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6207         struct io_cancel_data *cd = data;
6208
6209         return req->ctx == cd->ctx && req->user_data == cd->user_data;
6210 }
6211
6212 static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
6213                                struct io_ring_ctx *ctx)
6214 {
6215         struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };
6216         enum io_wq_cancel cancel_ret;
6217         int ret = 0;
6218
6219         if (!tctx || !tctx->io_wq)
6220                 return -ENOENT;
6221
6222         cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false);
6223         switch (cancel_ret) {
6224         case IO_WQ_CANCEL_OK:
6225                 ret = 0;
6226                 break;
6227         case IO_WQ_CANCEL_RUNNING:
6228                 ret = -EALREADY;
6229                 break;
6230         case IO_WQ_CANCEL_NOTFOUND:
6231                 ret = -ENOENT;
6232                 break;
6233         }
6234
6235         return ret;
6236 }
6237
6238 static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
6239 {
6240         struct io_ring_ctx *ctx = req->ctx;
6241         int ret;
6242
6243         WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
6244
6245         ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
6246         if (ret != -ENOENT)
6247                 return ret;
6248
6249         spin_lock(&ctx->completion_lock);
6250         spin_lock_irq(&ctx->timeout_lock);
6251         ret = io_timeout_cancel(ctx, sqe_addr);
6252         spin_unlock_irq(&ctx->timeout_lock);
6253         if (ret != -ENOENT)
6254                 goto out;
6255         ret = io_poll_cancel(ctx, sqe_addr, false);
6256 out:
6257         spin_unlock(&ctx->completion_lock);
6258         return ret;
6259 }
6260
6261 static int io_async_cancel_prep(struct io_kiocb *req,
6262                                 const struct io_uring_sqe *sqe)
6263 {
6264         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
6265                 return -EINVAL;
6266         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6267                 return -EINVAL;
6268         if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags ||
6269             sqe->splice_fd_in)
6270                 return -EINVAL;
6271
6272         req->cancel.addr = READ_ONCE(sqe->addr);
6273         return 0;
6274 }
6275
6276 static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
6277 {
6278         struct io_ring_ctx *ctx = req->ctx;
6279         u64 sqe_addr = req->cancel.addr;
6280         struct io_tctx_node *node;
6281         int ret;
6282
6283         ret = io_try_cancel_userdata(req, sqe_addr);
6284         if (ret != -ENOENT)
6285                 goto done;
6286
6287         /* slow path, try all io-wq's */
6288         io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
6289         ret = -ENOENT;
6290         list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
6291                 struct io_uring_task *tctx = node->task->io_uring;
6292
6293                 ret = io_async_cancel_one(tctx, req->cancel.addr, ctx);
6294                 if (ret != -ENOENT)
6295                         break;
6296         }
6297         io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
6298 done:
6299         if (ret < 0)
6300                 req_set_fail(req);
6301         io_req_complete_post(req, ret, 0);
6302         return 0;
6303 }
6304
6305 static int io_rsrc_update_prep(struct io_kiocb *req,
6306                                 const struct io_uring_sqe *sqe)
6307 {
6308         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6309                 return -EINVAL;
6310         if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
6311                 return -EINVAL;
6312
6313         req->rsrc_update.offset = READ_ONCE(sqe->off);
6314         req->rsrc_update.nr_args = READ_ONCE(sqe->len);
6315         if (!req->rsrc_update.nr_args)
6316                 return -EINVAL;
6317         req->rsrc_update.arg = READ_ONCE(sqe->addr);
6318         return 0;
6319 }
6320
6321 static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
6322 {
6323         struct io_ring_ctx *ctx = req->ctx;
6324         struct io_uring_rsrc_update2 up;
6325         int ret;
6326
6327         up.offset = req->rsrc_update.offset;
6328         up.data = req->rsrc_update.arg;
6329         up.nr = 0;
6330         up.tags = 0;
6331         up.resv = 0;
6332
6333         io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
6334         ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
6335                                         &up, req->rsrc_update.nr_args);
6336         io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
6337
6338         if (ret < 0)
6339                 req_set_fail(req);
6340         __io_req_complete(req, issue_flags, ret, 0);
6341         return 0;
6342 }
6343
6344 static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
6345 {
6346         switch (req->opcode) {
6347         case IORING_OP_NOP:
6348                 return 0;
6349         case IORING_OP_READV:
6350         case IORING_OP_READ_FIXED:
6351         case IORING_OP_READ:
6352                 return io_read_prep(req, sqe);
6353         case IORING_OP_WRITEV:
6354         case IORING_OP_WRITE_FIXED:
6355         case IORING_OP_WRITE:
6356                 return io_write_prep(req, sqe);
6357         case IORING_OP_POLL_ADD:
6358                 return io_poll_add_prep(req, sqe);
6359         case IORING_OP_POLL_REMOVE:
6360                 return io_poll_update_prep(req, sqe);
6361         case IORING_OP_FSYNC:
6362                 return io_fsync_prep(req, sqe);
6363         case IORING_OP_SYNC_FILE_RANGE:
6364                 return io_sfr_prep(req, sqe);
6365         case IORING_OP_SENDMSG:
6366         case IORING_OP_SEND:
6367                 return io_sendmsg_prep(req, sqe);
6368         case IORING_OP_RECVMSG:
6369         case IORING_OP_RECV:
6370                 return io_recvmsg_prep(req, sqe);
6371         case IORING_OP_CONNECT:
6372                 return io_connect_prep(req, sqe);
6373         case IORING_OP_TIMEOUT:
6374                 return io_timeout_prep(req, sqe, false);
6375         case IORING_OP_TIMEOUT_REMOVE:
6376                 return io_timeout_remove_prep(req, sqe);
6377         case IORING_OP_ASYNC_CANCEL:
6378                 return io_async_cancel_prep(req, sqe);
6379         case IORING_OP_LINK_TIMEOUT:
6380                 return io_timeout_prep(req, sqe, true);
6381         case IORING_OP_ACCEPT:
6382                 return io_accept_prep(req, sqe);
6383         case IORING_OP_FALLOCATE:
6384                 return io_fallocate_prep(req, sqe);
6385         case IORING_OP_OPENAT:
6386                 return io_openat_prep(req, sqe);
6387         case IORING_OP_CLOSE:
6388                 return io_close_prep(req, sqe);
6389         case IORING_OP_FILES_UPDATE:
6390                 return io_rsrc_update_prep(req, sqe);
6391         case IORING_OP_STATX:
6392                 return io_statx_prep(req, sqe);
6393         case IORING_OP_FADVISE:
6394                 return io_fadvise_prep(req, sqe);
6395         case IORING_OP_MADVISE:
6396                 return io_madvise_prep(req, sqe);
6397         case IORING_OP_OPENAT2:
6398                 return io_openat2_prep(req, sqe);
6399         case IORING_OP_EPOLL_CTL:
6400                 return io_epoll_ctl_prep(req, sqe);
6401         case IORING_OP_SPLICE:
6402                 return io_splice_prep(req, sqe);
6403         case IORING_OP_PROVIDE_BUFFERS:
6404                 return io_provide_buffers_prep(req, sqe);
6405         case IORING_OP_REMOVE_BUFFERS:
6406                 return io_remove_buffers_prep(req, sqe);
6407         case IORING_OP_TEE:
6408                 return io_tee_prep(req, sqe);
6409         case IORING_OP_SHUTDOWN:
6410                 return io_shutdown_prep(req, sqe);
6411         case IORING_OP_RENAMEAT:
6412                 return io_renameat_prep(req, sqe);
6413         case IORING_OP_UNLINKAT:
6414                 return io_unlinkat_prep(req, sqe);
6415         case IORING_OP_MKDIRAT:
6416                 return io_mkdirat_prep(req, sqe);
6417         case IORING_OP_SYMLINKAT:
6418                 return io_symlinkat_prep(req, sqe);
6419         case IORING_OP_LINKAT:
6420                 return io_linkat_prep(req, sqe);
6421         }
6422
6423         printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
6424                         req->opcode);
6425         return -EINVAL;
6426 }
6427
6428 static int io_req_prep_async(struct io_kiocb *req)
6429 {
6430         if (!io_op_defs[req->opcode].needs_async_setup)
6431                 return 0;
6432         if (WARN_ON_ONCE(req_has_async_data(req)))
6433                 return -EFAULT;
6434         if (io_alloc_async_data(req))
6435                 return -EAGAIN;
6436
6437         switch (req->opcode) {
6438         case IORING_OP_READV:
6439                 return io_rw_prep_async(req, READ);
6440         case IORING_OP_WRITEV:
6441                 return io_rw_prep_async(req, WRITE);
6442         case IORING_OP_SENDMSG:
6443                 return io_sendmsg_prep_async(req);
6444         case IORING_OP_RECVMSG:
6445                 return io_recvmsg_prep_async(req);
6446         case IORING_OP_CONNECT:
6447                 return io_connect_prep_async(req);
6448         }
6449         printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
6450                     req->opcode);
6451         return -EFAULT;
6452 }
6453
6454 static u32 io_get_sequence(struct io_kiocb *req)
6455 {
6456         u32 seq = req->ctx->cached_sq_head;
6457
6458         /* need original cached_sq_head, but it was increased for each req */
6459         io_for_each_link(req, req)
6460                 seq--;
6461         return seq;
6462 }
6463
/*
 * Handle a request that may have to be deferred.
 *
 * If req_need_defer() says no deferral is needed and ctx->defer_list is
 * empty, drain_active is cleared and the request is queued through
 * io_req_task_queue(). Otherwise the request gets its async state prepared
 * and is appended to ctx->defer_list together with its sequence number.
 * Preparation or allocation failures complete the request as failed.
 */
static __cold void io_drain_req(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_defer_entry *de;
	int ret;
	u32 seq = io_get_sequence(req);

	/* Still need defer if there is pending req in defer list. */
	if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
queue:
		/* also reached via goto from the locked re-check below */
		ctx->drain_active = false;
		io_req_task_queue(req);
		return;
	}

	ret = io_req_prep_async(req);
	if (ret) {
fail:
		/* also reached via goto when the defer entry cannot be allocated */
		io_req_complete_failed(req, ret);
		return;
	}
	io_prep_async_link(req);
	de = kmalloc(sizeof(*de), GFP_KERNEL);
	if (!de) {
		ret = -ENOMEM;
		goto fail;
	}

	spin_lock(&ctx->completion_lock);
	/* re-check under the lock: the condition may have changed meanwhile */
	if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
		spin_unlock(&ctx->completion_lock);
		kfree(de);
		goto queue;
	}

	trace_io_uring_defer(ctx, req, req->user_data);
	de->req = req;
	de->seq = seq;
	list_add_tail(&de->list, &ctx->defer_list);
	spin_unlock(&ctx->completion_lock);
}
6505
/*
 * Release per-request resources according to the request's flags:
 *  - REQ_F_BUFFER_SELECTED: free the selected buffer (req->kbuf);
 *  - REQ_F_NEED_CLEANUP: opcode-specific cleanup (iovecs, file references,
 *    filenames);
 *  - REQ_F_POLLED with req->apoll set: free the async poll state;
 *  - REQ_F_INFLIGHT: drop the task's inflight_tracked count;
 *  - REQ_F_CREDS: drop the credentials reference;
 *  - REQ_F_ASYNC_DATA: free req->async_data.
 * Finally clears all IO_REQ_CLEAN_FLAGS bits from req->flags.
 */
static void io_clean_op(struct io_kiocb *req)
{
	if (req->flags & REQ_F_BUFFER_SELECTED) {
		kfree(req->kbuf);
		req->kbuf = NULL;
	}

	if (req->flags & REQ_F_NEED_CLEANUP) {
		switch (req->opcode) {
		case IORING_OP_READV:
		case IORING_OP_READ_FIXED:
		case IORING_OP_READ:
		case IORING_OP_WRITEV:
		case IORING_OP_WRITE_FIXED:
		case IORING_OP_WRITE: {
			struct io_async_rw *io = req->async_data;

			kfree(io->free_iovec);
			break;
			}
		case IORING_OP_RECVMSG:
		case IORING_OP_SENDMSG: {
			struct io_async_msghdr *io = req->async_data;

			kfree(io->free_iov);
			break;
			}
		case IORING_OP_SPLICE:
		case IORING_OP_TEE:
			/* input files flagged SPLICE_F_FD_IN_FIXED are not put here */
			if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED))
				io_put_file(req->splice.file_in);
			break;
		case IORING_OP_OPENAT:
		case IORING_OP_OPENAT2:
			if (req->open.filename)
				putname(req->open.filename);
			break;
		case IORING_OP_RENAMEAT:
			putname(req->rename.oldpath);
			putname(req->rename.newpath);
			break;
		case IORING_OP_UNLINKAT:
			putname(req->unlink.filename);
			break;
		case IORING_OP_MKDIRAT:
			putname(req->mkdir.filename);
			break;
		case IORING_OP_SYMLINKAT:
			putname(req->symlink.oldpath);
			putname(req->symlink.newpath);
			break;
		case IORING_OP_LINKAT:
			putname(req->hardlink.oldpath);
			putname(req->hardlink.newpath);
			break;
		}
	}
	if ((req->flags & REQ_F_POLLED) && req->apoll) {
		kfree(req->apoll->double_poll);
		kfree(req->apoll);
		req->apoll = NULL;
	}
	if (req->flags & REQ_F_INFLIGHT) {
		struct io_uring_task *tctx = req->task->io_uring;

		atomic_dec(&tctx->inflight_tracked);
	}
	if (req->flags & REQ_F_CREDS)
		put_cred(req->creds);
	/* freed last: the opcode-specific cleanup above reads through async_data */
	if (req->flags & REQ_F_ASYNC_DATA) {
		kfree(req->async_data);
		req->async_data = NULL;
	}
	req->flags &= ~IO_REQ_CLEAN_FLAGS;
}
6581
6582 static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
6583 {
6584         const struct cred *creds = NULL;
6585         int ret;
6586
6587         if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
6588                 creds = override_creds(req->creds);
6589
6590         switch (req->opcode) {
6591         case IORING_OP_NOP:
6592                 ret = io_nop(req, issue_flags);
6593                 break;
6594         case IORING_OP_READV:
6595         case IORING_OP_READ_FIXED:
6596         case IORING_OP_READ:
6597                 ret = io_read(req, issue_flags);
6598                 break;
6599         case IORING_OP_WRITEV:
6600         case IORING_OP_WRITE_FIXED:
6601         case IORING_OP_WRITE:
6602                 ret = io_write(req, issue_flags);
6603                 break;
6604         case IORING_OP_FSYNC:
6605                 ret = io_fsync(req, issue_flags);
6606                 break;
6607         case IORING_OP_POLL_ADD:
6608                 ret = io_poll_add(req, issue_flags);
6609                 break;
6610         case IORING_OP_POLL_REMOVE:
6611                 ret = io_poll_update(req, issue_flags);
6612                 break;
6613         case IORING_OP_SYNC_FILE_RANGE:
6614                 ret = io_sync_file_range(req, issue_flags);
6615                 break;
6616         case IORING_OP_SENDMSG:
6617                 ret = io_sendmsg(req, issue_flags);
6618                 break;
6619         case IORING_OP_SEND:
6620                 ret = io_send(req, issue_flags);
6621                 break;
6622         case IORING_OP_RECVMSG:
6623                 ret = io_recvmsg(req, issue_flags);
6624                 break;
6625         case IORING_OP_RECV:
6626                 ret = io_recv(req, issue_flags);
6627                 break;
6628         case IORING_OP_TIMEOUT:
6629                 ret = io_timeout(req, issue_flags);
6630                 break;
6631         case IORING_OP_TIMEOUT_REMOVE:
6632                 ret = io_timeout_remove(req, issue_flags);
6633                 break;
6634         case IORING_OP_ACCEPT:
6635                 ret = io_accept(req, issue_flags);
6636                 break;
6637         case IORING_OP_CONNECT:
6638                 ret = io_connect(req, issue_flags);
6639                 break;
6640         case IORING_OP_ASYNC_CANCEL:
6641                 ret = io_async_cancel(req, issue_flags);
6642                 break;
6643         case IORING_OP_FALLOCATE:
6644                 ret = io_fallocate(req, issue_flags);
6645                 break;
6646         case IORING_OP_OPENAT:
6647                 ret = io_openat(req, issue_flags);
6648                 break;
6649         case IORING_OP_CLOSE:
6650                 ret = io_close(req, issue_flags);
6651                 break;
6652         case IORING_OP_FILES_UPDATE:
6653                 ret = io_files_update(req, issue_flags);
6654                 break;
6655         case IORING_OP_STATX:
6656                 ret = io_statx(req, issue_flags);
6657                 break;
6658         case IORING_OP_FADVISE:
6659                 ret = io_fadvise(req, issue_flags);
6660                 break;
6661         case IORING_OP_MADVISE:
6662                 ret = io_madvise(req, issue_flags);
6663                 break;
6664         case IORING_OP_OPENAT2:
6665                 ret = io_openat2(req, issue_flags);
6666                 break;
6667         case IORING_OP_EPOLL_CTL:
6668                 ret = io_epoll_ctl(req, issue_flags);
6669                 break;
6670         case IORING_OP_SPLICE:
6671                 ret = io_splice(req, issue_flags);
6672                 break;
6673         case IORING_OP_PROVIDE_BUFFERS:
6674                 ret = io_provide_buffers(req, issue_flags);
6675                 break;
6676         case IORING_OP_REMOVE_BUFFERS:
6677                 ret = io_remove_buffers(req, issue_flags);
6678                 break;
6679         case IORING_OP_TEE:
6680                 ret = io_tee(req, issue_flags);
6681                 break;
6682         case IORING_OP_SHUTDOWN:
6683                 ret = io_shutdown(req, issue_flags);
6684                 break;
6685         case IORING_OP_RENAMEAT:
6686                 ret = io_renameat(req, issue_flags);
6687                 break;
6688         case IORING_OP_UNLINKAT:
6689                 ret = io_unlinkat(req, issue_flags);
6690                 break;
6691         case IORING_OP_MKDIRAT:
6692                 ret = io_mkdirat(req, issue_flags);
6693                 break;
6694         case IORING_OP_SYMLINKAT:
6695                 ret = io_symlinkat(req, issue_flags);
6696                 break;
6697         case IORING_OP_LINKAT:
6698                 ret = io_linkat(req, issue_flags);
6699                 break;
6700         default:
6701                 ret = -EINVAL;
6702                 break;
6703         }
6704
6705         if (creds)
6706                 revert_creds(creds);
6707         if (ret)
6708                 return ret;
6709         /* If the op doesn't have a file, we're not polling for it */
6710         if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
6711                 io_iopoll_req_issued(req, issue_flags);
6712
6713         return 0;
6714 }
6715
6716 static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
6717 {
6718         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6719
6720         req = io_put_req_find_next(req);
6721         return req ? &req->work : NULL;
6722 }
6723
6724 static void io_wq_submit_work(struct io_wq_work *work)
6725 {
6726         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6727         struct io_kiocb *timeout;
6728         int ret = 0;
6729
6730         /* one will be dropped by ->io_free_work() after returning to io-wq */
6731         if (!(req->flags & REQ_F_REFCOUNT))
6732                 __io_req_set_refcount(req, 2);
6733         else
6734                 req_ref_get(req);
6735
6736         timeout = io_prep_linked_timeout(req);
6737         if (timeout)
6738                 io_queue_linked_timeout(timeout);
6739
6740         /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
6741         if (work->flags & IO_WQ_WORK_CANCEL)
6742                 ret = -ECANCELED;
6743
6744         if (!ret) {
6745                 do {
6746                         ret = io_issue_sqe(req, 0);
6747                         /*
6748                          * We can get EAGAIN for polled IO even though we're
6749                          * forcing a sync submission from here, since we can't
6750                          * wait for request slots on the block side.
6751                          */
6752                         if (ret != -EAGAIN)
6753                                 break;
6754                         cond_resched();
6755                 } while (1);
6756         }
6757
6758         /* avoid locking problems by failing it from a clean context */
6759         if (ret)
6760                 io_req_task_queue_fail(req, ret);
6761 }
6762
6763 static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
6764                                                        unsigned i)
6765 {
6766         return &table->files[i];
6767 }
6768
6769 static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
6770                                               int index)
6771 {
6772         struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index);
6773
6774         return (struct file *) (slot->file_ptr & FFS_MASK);
6775 }
6776
6777 static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file)
6778 {
6779         unsigned long file_ptr = (unsigned long) file;
6780
6781         if (__io_file_supports_nowait(file, READ))
6782                 file_ptr |= FFS_ASYNC_READ;
6783         if (__io_file_supports_nowait(file, WRITE))
6784                 file_ptr |= FFS_ASYNC_WRITE;
6785         if (S_ISREG(file_inode(file)->i_mode))
6786                 file_ptr |= FFS_ISREG;
6787         file_slot->file_ptr = file_ptr;
6788 }
6789
6790 static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
6791                                              struct io_kiocb *req, int fd)
6792 {
6793         struct file *file;
6794         unsigned long file_ptr;
6795
6796         if (unlikely((unsigned int)fd >= ctx->nr_user_files))
6797                 return NULL;
6798         fd = array_index_nospec(fd, ctx->nr_user_files);
6799         file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
6800         file = (struct file *) (file_ptr & FFS_MASK);
6801         file_ptr &= ~FFS_MASK;
6802         /* mask in overlapping REQ_F and FFS bits */
6803         req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT);
6804         io_req_set_rsrc_node(req, ctx);
6805         return file;
6806 }
6807
6808 static struct file *io_file_get_normal(struct io_ring_ctx *ctx,
6809                                        struct io_kiocb *req, int fd)
6810 {
6811         struct file *file = fget(fd);
6812
6813         trace_io_uring_file_get(ctx, fd);
6814
6815         /* we don't allow fixed io_uring files */
6816         if (file && unlikely(file->f_op == &io_uring_fops))
6817                 io_req_track_inflight(req);
6818         return file;
6819 }
6820
6821 static inline struct file *io_file_get(struct io_ring_ctx *ctx,
6822                                        struct io_kiocb *req, int fd, bool fixed)
6823 {
6824         if (fixed)
6825                 return io_file_get_fixed(ctx, req, fd);
6826         else
6827                 return io_file_get_normal(ctx, req, fd);
6828 }
6829
6830 static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
6831 {
6832         struct io_kiocb *prev = req->timeout.prev;
6833         int ret;
6834
6835         if (prev) {
6836                 ret = io_try_cancel_userdata(req, prev->user_data);
6837                 io_req_complete_post(req, ret ?: -ETIME, 0);
6838                 io_put_req(prev);
6839         } else {
6840                 io_req_complete_post(req, -ETIME, 0);
6841         }
6842 }
6843
6844 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
6845 {
6846         struct io_timeout_data *data = container_of(timer,
6847                                                 struct io_timeout_data, timer);
6848         struct io_kiocb *prev, *req = data->req;
6849         struct io_ring_ctx *ctx = req->ctx;
6850         unsigned long flags;
6851
6852         spin_lock_irqsave(&ctx->timeout_lock, flags);
6853         prev = req->timeout.head;
6854         req->timeout.head = NULL;
6855
6856         /*
6857          * We don't expect the list to be empty, that will only happen if we
6858          * race with the completion of the linked work.
6859          */
6860         if (prev) {
6861                 io_remove_next_linked(prev);
6862                 if (!req_ref_inc_not_zero(prev))
6863                         prev = NULL;
6864         }
6865         list_del(&req->timeout.list);
6866         req->timeout.prev = prev;
6867         spin_unlock_irqrestore(&ctx->timeout_lock, flags);
6868
6869         req->io_task_work.func = io_req_task_link_timeout;
6870         io_req_task_work_add(req);
6871         return HRTIMER_NORESTART;
6872 }
6873
6874 static void io_queue_linked_timeout(struct io_kiocb *req)
6875 {
6876         struct io_ring_ctx *ctx = req->ctx;
6877
6878         spin_lock_irq(&ctx->timeout_lock);
6879         /*
6880          * If the back reference is NULL, then our linked request finished
6881          * before we got a chance to setup the timer
6882          */
6883         if (req->timeout.head) {
6884                 struct io_timeout_data *data = req->async_data;
6885
6886                 data->timer.function = io_link_timeout_fn;
6887                 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
6888                                 data->mode);
6889                 list_add_tail(&req->timeout.list, &ctx->ltimeout_list);
6890         }
6891         spin_unlock_irq(&ctx->timeout_lock);
6892         /* drop submission reference */
6893         io_put_req(req);
6894 }
6895
6896 static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
6897         __must_hold(&req->ctx->uring_lock)
6898 {
6899         struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
6900
6901         switch (io_arm_poll_handler(req)) {
6902         case IO_APOLL_READY:
6903                 if (linked_timeout) {
6904                         io_unprep_linked_timeout(req);
6905                         linked_timeout = NULL;
6906                 }
6907                 io_req_task_queue(req);
6908                 break;
6909         case IO_APOLL_ABORTED:
6910                 /*
6911                  * Queued up for async execution, worker will release
6912                  * submit reference when the iocb is actually submitted.
6913                  */
6914                 io_queue_async_work(req, NULL);
6915                 break;
6916         }
6917
6918         if (linked_timeout)
6919                 io_queue_linked_timeout(linked_timeout);
6920 }
6921
6922 static inline void __io_queue_sqe(struct io_kiocb *req)
6923         __must_hold(&req->ctx->uring_lock)
6924 {
6925         struct io_kiocb *linked_timeout;
6926         int ret;
6927
6928         ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
6929
6930         if (req->flags & REQ_F_COMPLETE_INLINE) {
6931                 io_req_add_compl_list(req);
6932                 return;
6933         }
6934         /*
6935          * We async punt it if the file wasn't marked NOWAIT, or if the file
6936          * doesn't support non-blocking read/write attempts
6937          */
6938         if (likely(!ret)) {
6939                 linked_timeout = io_prep_linked_timeout(req);
6940                 if (linked_timeout)
6941                         io_queue_linked_timeout(linked_timeout);
6942         } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
6943                 io_queue_sqe_arm_apoll(req);
6944         } else {
6945                 io_req_complete_failed(req, ret);
6946         }
6947 }
6948
6949 static void io_queue_sqe_fallback(struct io_kiocb *req)
6950         __must_hold(&req->ctx->uring_lock)
6951 {
6952         if (req->flags & REQ_F_FAIL) {
6953                 io_req_complete_fail_submit(req);
6954         } else if (unlikely(req->ctx->drain_active)) {
6955                 io_drain_req(req);
6956         } else {
6957                 int ret = io_req_prep_async(req);
6958
6959                 if (unlikely(ret))
6960                         io_req_complete_failed(req, ret);
6961                 else
6962                         io_queue_async_work(req, NULL);
6963         }
6964 }
6965
6966 static inline void io_queue_sqe(struct io_kiocb *req)
6967         __must_hold(&req->ctx->uring_lock)
6968 {
6969         if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))))
6970                 __io_queue_sqe(req);
6971         else
6972                 io_queue_sqe_fallback(req);
6973 }
6974
6975 /*
6976  * Check SQE restrictions (opcode and flags).
6977  *
6978  * Returns 'true' if SQE is allowed, 'false' otherwise.
6979  */
6980 static inline bool io_check_restriction(struct io_ring_ctx *ctx,
6981                                         struct io_kiocb *req,
6982                                         unsigned int sqe_flags)
6983 {
6984         if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
6985                 return false;
6986
6987         if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
6988             ctx->restrictions.sqe_flags_required)
6989                 return false;
6990
6991         if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
6992                           ctx->restrictions.sqe_flags_required))
6993                 return false;
6994
6995         return true;
6996 }
6997
6998 static void io_init_req_drain(struct io_kiocb *req)
6999 {
7000         struct io_ring_ctx *ctx = req->ctx;
7001         struct io_kiocb *head = ctx->submit_state.link.head;
7002
7003         ctx->drain_active = true;
7004         if (head) {
7005                 /*
7006                  * If we need to drain a request in the middle of a link, drain
7007                  * the head request and the next request/link after the current
7008                  * link. Considering sequential execution of links,
7009                  * IOSQE_IO_DRAIN will be maintained for every request of our
7010                  * link.
7011                  */
7012                 head->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC;
7013                 ctx->drain_next = true;
7014         }
7015 }
7016
7017 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
7018                        const struct io_uring_sqe *sqe)
7019         __must_hold(&ctx->uring_lock)
7020 {
7021         unsigned int sqe_flags;
7022         int personality;
7023         u8 opcode;
7024
7025         /* req is partially pre-initialised, see io_preinit_req() */
7026         req->opcode = opcode = READ_ONCE(sqe->opcode);
7027         /* same numerical values with corresponding REQ_F_*, safe to copy */
7028         req->flags = sqe_flags = READ_ONCE(sqe->flags);
7029         req->user_data = READ_ONCE(sqe->user_data);
7030         req->file = NULL;
7031         req->fixed_rsrc_refs = NULL;
7032         req->task = current;
7033
7034         if (unlikely(opcode >= IORING_OP_LAST)) {
7035                 req->opcode = 0;
7036                 return -EINVAL;
7037         }
7038         if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
7039                 /* enforce forwards compatibility on users */
7040                 if (sqe_flags & ~SQE_VALID_FLAGS)
7041                         return -EINVAL;
7042                 if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
7043                     !io_op_defs[opcode].buffer_select)
7044                         return -EOPNOTSUPP;
7045                 if (sqe_flags & IOSQE_IO_DRAIN)
7046                         io_init_req_drain(req);
7047         }
7048         if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
7049                 if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
7050                         return -EACCES;
7051                 /* knock it to the slow queue path, will be drained there */
7052                 if (ctx->drain_active)
7053                         req->flags |= REQ_F_FORCE_ASYNC;
7054                 /* if there is no link, we're at "next" request and need to drain */
7055                 if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
7056                         ctx->drain_next = false;
7057                         ctx->drain_active = true;
7058                         req->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC;
7059                 }
7060         }
7061
7062         if (io_op_defs[opcode].needs_file) {
7063                 struct io_submit_state *state = &ctx->submit_state;
7064
7065                 /*
7066                  * Plug now if we have more than 2 IO left after this, and the
7067                  * target is potentially a read/write to block based storage.
7068                  */
7069                 if (state->need_plug && io_op_defs[opcode].plug) {
7070                         state->plug_started = true;
7071                         state->need_plug = false;
7072                         blk_start_plug(&state->plug);
7073                 }
7074
7075                 req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
7076                                         (sqe_flags & IOSQE_FIXED_FILE));
7077                 if (unlikely(!req->file))
7078                         return -EBADF;
7079         }
7080
7081         personality = READ_ONCE(sqe->personality);
7082         if (personality) {
7083                 req->creds = xa_load(&ctx->personalities, personality);
7084                 if (!req->creds)
7085                         return -EINVAL;
7086                 get_cred(req->creds);
7087                 req->flags |= REQ_F_CREDS;
7088         }
7089
7090         return io_req_prep(req, sqe);
7091 }
7092
7093 static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
7094                          const struct io_uring_sqe *sqe)
7095         __must_hold(&ctx->uring_lock)
7096 {
7097         struct io_submit_link *link = &ctx->submit_state.link;
7098         int ret;
7099
7100         ret = io_init_req(ctx, req, sqe);
7101         if (unlikely(ret)) {
7102                 trace_io_uring_req_failed(sqe, ret);
7103
7104                 /* fail even hard links since we don't submit */
7105                 if (link->head) {
7106                         /*
7107                          * we can judge a link req is failed or cancelled by if
7108                          * REQ_F_FAIL is set, but the head is an exception since
7109                          * it may be set REQ_F_FAIL because of other req's failure
7110                          * so let's leverage req->result to distinguish if a head
7111                          * is set REQ_F_FAIL because of its failure or other req's
7112                          * failure so that we can set the correct ret code for it.
7113                          * init result here to avoid affecting the normal path.
7114                          */
7115                         if (!(link->head->flags & REQ_F_FAIL))
7116                                 req_fail_link_node(link->head, -ECANCELED);
7117                 } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
7118                         /*
7119                          * the current req is a normal req, we should return
7120                          * error and thus break the submittion loop.
7121                          */
7122                         io_req_complete_failed(req, ret);
7123                         return ret;
7124                 }
7125                 req_fail_link_node(req, ret);
7126         }
7127
7128         /* don't need @sqe from now on */
7129         trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data,
7130                                   req->flags, true,
7131                                   ctx->flags & IORING_SETUP_SQPOLL);
7132
7133         /*
7134          * If we already have a head request, queue this one for async
7135          * submittal once the head completes. If we don't have a head but
7136          * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
7137          * submitted sync once the chain is complete. If none of those
7138          * conditions are true (normal request), then just queue it.
7139          */
7140         if (link->head) {
7141                 struct io_kiocb *head = link->head;
7142
7143                 if (!(req->flags & REQ_F_FAIL)) {
7144                         ret = io_req_prep_async(req);
7145                         if (unlikely(ret)) {
7146                                 req_fail_link_node(req, ret);
7147                                 if (!(head->flags & REQ_F_FAIL))
7148                                         req_fail_link_node(head, -ECANCELED);
7149                         }
7150                 }
7151                 trace_io_uring_link(ctx, req, head);
7152                 link->last->link = req;
7153                 link->last = req;
7154
7155                 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
7156                         return 0;
7157                 /* last request of a link, enqueue the link */
7158                 link->head = NULL;
7159                 req = head;
7160         } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
7161                 link->head = req;
7162                 link->last = req;
7163                 return 0;
7164         }
7165
7166         io_queue_sqe(req);
7167         return 0;
7168 }
7169
7170 /*
7171  * Batched submission is done, ensure local IO is flushed out.
7172  */
7173 static void io_submit_state_end(struct io_ring_ctx *ctx)
7174 {
7175         struct io_submit_state *state = &ctx->submit_state;
7176
7177         if (state->link.head)
7178                 io_queue_sqe(state->link.head);
7179         /* flush only after queuing links as they can generate completions */
7180         io_submit_flush_completions(ctx);
7181         if (state->plug_started)
7182                 blk_finish_plug(&state->plug);
7183 }
7184
7185 /*
7186  * Start submission side cache.
7187  */
7188 static void io_submit_state_start(struct io_submit_state *state,
7189                                   unsigned int max_ios)
7190 {
7191         state->plug_started = false;
7192         state->need_plug = max_ios > 2;
7193         /* set only head, no need to init link_last in advance */
7194         state->link.head = NULL;
7195 }
7196
7197 static void io_commit_sqring(struct io_ring_ctx *ctx)
7198 {
7199         struct io_rings *rings = ctx->rings;
7200
7201         /*
7202          * Ensure any loads from the SQEs are done at this point,
7203          * since once we write the new head, the application could
7204          * write new data to them.
7205          */
7206         smp_store_release(&rings->sq.head, ctx->cached_sq_head);
7207 }
7208
7209 /*
7210  * Fetch an sqe, if one is available. Note this returns a pointer to memory
7211  * that is mapped by userspace. This means that care needs to be taken to
7212  * ensure that reads are stable, as we cannot rely on userspace always
7213  * being a good citizen. If members of the sqe are validated and then later
7214  * used, it's important that those reads are done through READ_ONCE() to
7215  * prevent a re-load down the line.
7216  */
7217 static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
7218 {
7219         unsigned head, mask = ctx->sq_entries - 1;
7220         unsigned sq_idx = ctx->cached_sq_head++ & mask;
7221
7222         /*
7223          * The cached sq head (or cq tail) serves two purposes:
7224          *
7225          * 1) allows us to batch the cost of updating the user visible
7226          *    head updates.
7227          * 2) allows the kernel side to track the head on its own, even
7228          *    though the application is the one updating it.
7229          */
7230         head = READ_ONCE(ctx->sq_array[sq_idx]);
7231         if (likely(head < ctx->sq_entries))
7232                 return &ctx->sq_sqes[head];
7233
7234         /* drop invalid entries */
7235         ctx->cq_extra--;
7236         WRITE_ONCE(ctx->rings->sq_dropped,
7237                    READ_ONCE(ctx->rings->sq_dropped) + 1);
7238         return NULL;
7239 }
7240
7241 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
7242         __must_hold(&ctx->uring_lock)
7243 {
7244         unsigned int entries = io_sqring_entries(ctx);
7245         int submitted = 0;
7246
7247         if (unlikely(!entries))
7248                 return 0;
7249         /* make sure SQ entry isn't read before tail */
7250         nr = min3(nr, ctx->sq_entries, entries);
7251         io_get_task_refs(nr);
7252
7253         io_submit_state_start(&ctx->submit_state, nr);
7254         do {
7255                 const struct io_uring_sqe *sqe;
7256                 struct io_kiocb *req;
7257
7258                 if (unlikely(!io_alloc_req_refill(ctx))) {
7259                         if (!submitted)
7260                                 submitted = -EAGAIN;
7261                         break;
7262                 }
7263                 req = io_alloc_req(ctx);
7264                 sqe = io_get_sqe(ctx);
7265                 if (unlikely(!sqe)) {
7266                         wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
7267                         break;
7268                 }
7269                 /* will complete beyond this point, count as submitted */
7270                 submitted++;
7271                 if (io_submit_sqe(ctx, req, sqe))
7272                         break;
7273         } while (submitted < nr);
7274
7275         if (unlikely(submitted != nr)) {
7276                 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
7277                 int unused = nr - ref_used;
7278
7279                 current->io_uring->cached_refs += unused;
7280         }
7281
7282         io_submit_state_end(ctx);
7283          /* Commit SQ ring head once we've consumed and submitted all SQEs */
7284         io_commit_sqring(ctx);
7285
7286         return submitted;
7287 }
7288
7289 static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
7290 {
7291         return READ_ONCE(sqd->state);
7292 }
7293
7294 static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
7295 {
7296         /* Tell userspace we may need a wakeup call */
7297         spin_lock(&ctx->completion_lock);
7298         WRITE_ONCE(ctx->rings->sq_flags,
7299                    ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP);
7300         spin_unlock(&ctx->completion_lock);
7301 }
7302
7303 static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
7304 {
7305         spin_lock(&ctx->completion_lock);
7306         WRITE_ONCE(ctx->rings->sq_flags,
7307                    ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
7308         spin_unlock(&ctx->completion_lock);
7309 }
7310
7311 static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
7312 {
7313         unsigned int to_submit;
7314         int ret = 0;
7315
7316         to_submit = io_sqring_entries(ctx);
7317         /* if we're handling multiple rings, cap submit size for fairness */
7318         if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
7319                 to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
7320
7321         if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
7322                 const struct cred *creds = NULL;
7323
7324                 if (ctx->sq_creds != current_cred())
7325                         creds = override_creds(ctx->sq_creds);
7326
7327                 mutex_lock(&ctx->uring_lock);
7328                 if (!wq_list_empty(&ctx->iopoll_list))
7329                         io_do_iopoll(ctx, true);
7330
7331                 /*
7332                  * Don't submit if refs are dying, good for io_uring_register(),
7333                  * but also it is relied upon by io_ring_exit_work()
7334                  */
7335                 if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
7336                     !(ctx->flags & IORING_SETUP_R_DISABLED))
7337                         ret = io_submit_sqes(ctx, to_submit);
7338                 mutex_unlock(&ctx->uring_lock);
7339
7340                 if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
7341                         wake_up(&ctx->sqo_sq_wait);
7342                 if (creds)
7343                         revert_creds(creds);
7344         }
7345
7346         return ret;
7347 }
7348
7349 static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd)
7350 {
7351         struct io_ring_ctx *ctx;
7352         unsigned sq_thread_idle = 0;
7353
7354         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
7355                 sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
7356         sqd->sq_thread_idle = sq_thread_idle;
7357 }
7358
7359 static bool io_sqd_handle_event(struct io_sq_data *sqd)
7360 {
7361         bool did_sig = false;
7362         struct ksignal ksig;
7363
7364         if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
7365             signal_pending(current)) {
7366                 mutex_unlock(&sqd->lock);
7367                 if (signal_pending(current))
7368                         did_sig = get_signal(&ksig);
7369                 cond_resched();
7370                 mutex_lock(&sqd->lock);
7371         }
7372         return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
7373 }
7374
7375 static int io_sq_thread(void *data)
7376 {
7377         struct io_sq_data *sqd = data;
7378         struct io_ring_ctx *ctx;
7379         unsigned long timeout = 0;
7380         char buf[TASK_COMM_LEN];
7381         DEFINE_WAIT(wait);
7382
7383         snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
7384         set_task_comm(current, buf);
7385
7386         if (sqd->sq_cpu != -1)
7387                 set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
7388         else
7389                 set_cpus_allowed_ptr(current, cpu_online_mask);
7390         current->flags |= PF_NO_SETAFFINITY;
7391
7392         mutex_lock(&sqd->lock);
7393         while (1) {
7394                 bool cap_entries, sqt_spin = false;
7395
7396                 if (io_sqd_events_pending(sqd) || signal_pending(current)) {
7397                         if (io_sqd_handle_event(sqd))
7398                                 break;
7399                         timeout = jiffies + sqd->sq_thread_idle;
7400                 }
7401
7402                 cap_entries = !list_is_singular(&sqd->ctx_list);
7403                 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
7404                         int ret = __io_sq_thread(ctx, cap_entries);
7405
7406                         if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
7407                                 sqt_spin = true;
7408                 }
7409                 if (io_run_task_work())
7410                         sqt_spin = true;
7411
7412                 if (sqt_spin || !time_after(jiffies, timeout)) {
7413                         cond_resched();
7414                         if (sqt_spin)
7415                                 timeout = jiffies + sqd->sq_thread_idle;
7416                         continue;
7417                 }
7418
7419                 prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
7420                 if (!io_sqd_events_pending(sqd) && !current->task_works) {
7421                         bool needs_sched = true;
7422
7423                         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
7424                                 io_ring_set_wakeup_flag(ctx);
7425
7426                                 if ((ctx->flags & IORING_SETUP_IOPOLL) &&
7427                                     !wq_list_empty(&ctx->iopoll_list)) {
7428                                         needs_sched = false;
7429                                         break;
7430                                 }
7431                                 if (io_sqring_entries(ctx)) {
7432                                         needs_sched = false;
7433                                         break;
7434                                 }
7435                         }
7436
7437                         if (needs_sched) {
7438                                 mutex_unlock(&sqd->lock);
7439                                 schedule();
7440                                 mutex_lock(&sqd->lock);
7441                         }
7442                         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
7443                                 io_ring_clear_wakeup_flag(ctx);
7444                 }
7445
7446                 finish_wait(&sqd->wait, &wait);
7447                 timeout = jiffies + sqd->sq_thread_idle;
7448         }
7449
7450         io_uring_cancel_generic(true, sqd);
7451         sqd->thread = NULL;
7452         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
7453                 io_ring_set_wakeup_flag(ctx);
7454         io_run_task_work();
7455         mutex_unlock(&sqd->lock);
7456
7457         complete(&sqd->exited);
7458         do_exit(0);
7459 }
7460
7461 struct io_wait_queue {
7462         struct wait_queue_entry wq;
7463         struct io_ring_ctx *ctx;
7464         unsigned cq_tail;
7465         unsigned nr_timeouts;
7466 };
7467
7468 static inline bool io_should_wake(struct io_wait_queue *iowq)
7469 {
7470         struct io_ring_ctx *ctx = iowq->ctx;
7471         int dist = ctx->cached_cq_tail - (int) iowq->cq_tail;
7472
7473         /*
7474          * Wake up if we have enough events, or if a timeout occurred since we
7475          * started waiting. For timeouts, we always want to return to userspace,
7476          * regardless of event count.
7477          */
7478         return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
7479 }
7480
7481 static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
7482                             int wake_flags, void *key)
7483 {
7484         struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
7485                                                         wq);
7486
7487         /*
7488          * Cannot safely flush overflowed CQEs from here, ensure we wake up
7489          * the task, and the next invocation will do it.
7490          */
7491         if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow))
7492                 return autoremove_wake_function(curr, mode, wake_flags, key);
7493         return -1;
7494 }
7495
7496 static int io_run_task_work_sig(void)
7497 {
7498         if (io_run_task_work())
7499                 return 1;
7500         if (!signal_pending(current))
7501                 return 0;
7502         if (test_thread_flag(TIF_NOTIFY_SIGNAL))
7503                 return -ERESTARTSYS;
7504         return -EINTR;
7505 }
7506
7507 /* when returns >0, the caller should retry */
7508 static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
7509                                           struct io_wait_queue *iowq,
7510                                           signed long *timeout)
7511 {
7512         int ret;
7513
7514         /* make sure we run task_work before checking for signals */
7515         ret = io_run_task_work_sig();
7516         if (ret || io_should_wake(iowq))
7517                 return ret;
7518         /* let the caller flush overflows, retry */
7519         if (test_bit(0, &ctx->check_cq_overflow))
7520                 return 1;
7521
7522         *timeout = schedule_timeout(*timeout);
7523         return !*timeout ? -ETIME : 1;
7524 }
7525
7526 /*
7527  * Wait until events become available, if we don't already have some. The
7528  * application must reap them itself, as they reside on the shared cq ring.
7529  */
7530 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
7531                           const sigset_t __user *sig, size_t sigsz,
7532                           struct __kernel_timespec __user *uts)
7533 {
7534         struct io_wait_queue iowq;
7535         struct io_rings *rings = ctx->rings;
7536         signed long timeout = MAX_SCHEDULE_TIMEOUT;
7537         int ret;
7538
7539         do {
7540                 io_cqring_overflow_flush(ctx);
7541                 if (io_cqring_events(ctx) >= min_events)
7542                         return 0;
7543                 if (!io_run_task_work())
7544                         break;
7545         } while (1);
7546
7547         if (uts) {
7548                 struct timespec64 ts;
7549
7550                 if (get_timespec64(&ts, uts))
7551                         return -EFAULT;
7552                 timeout = timespec64_to_jiffies(&ts);
7553         }
7554
7555         if (sig) {
7556 #ifdef CONFIG_COMPAT
7557                 if (in_compat_syscall())
7558                         ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
7559                                                       sigsz);
7560                 else
7561 #endif
7562                         ret = set_user_sigmask(sig, sigsz);
7563
7564                 if (ret)
7565                         return ret;
7566         }
7567
7568         init_waitqueue_func_entry(&iowq.wq, io_wake_function);
7569         iowq.wq.private = current;
7570         INIT_LIST_HEAD(&iowq.wq.entry);
7571         iowq.ctx = ctx;
7572         iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
7573         iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
7574
7575         trace_io_uring_cqring_wait(ctx, min_events);
7576         do {
7577                 /* if we can't even flush overflow, don't wait for more */
7578                 if (!io_cqring_overflow_flush(ctx)) {
7579                         ret = -EBUSY;
7580                         break;
7581                 }
7582                 prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
7583                                                 TASK_INTERRUPTIBLE);
7584                 ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
7585                 finish_wait(&ctx->cq_wait, &iowq.wq);
7586                 cond_resched();
7587         } while (ret > 0);
7588
7589         restore_saved_sigmask_unless(ret == -EINTR);
7590
7591         return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
7592 }
7593
7594 static void io_free_page_table(void **table, size_t size)
7595 {
7596         unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
7597
7598         for (i = 0; i < nr_tables; i++)
7599                 kfree(table[i]);
7600         kfree(table);
7601 }
7602
7603 static __cold void **io_alloc_page_table(size_t size)
7604 {
7605         unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
7606         size_t init_size = size;
7607         void **table;
7608
7609         table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
7610         if (!table)
7611                 return NULL;
7612
7613         for (i = 0; i < nr_tables; i++) {
7614                 unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
7615
7616                 table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
7617                 if (!table[i]) {
7618                         io_free_page_table(table, init_size);
7619                         return NULL;
7620                 }
7621                 size -= this_size;
7622         }
7623         return table;
7624 }
7625
7626 static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
7627 {
7628         percpu_ref_exit(&ref_node->refs);
7629         kfree(ref_node);
7630 }
7631
7632 static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
7633 {
7634         struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
7635         struct io_ring_ctx *ctx = node->rsrc_data->ctx;
7636         unsigned long flags;
7637         bool first_add = false;
7638
7639         spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
7640         node->done = true;
7641
7642         while (!list_empty(&ctx->rsrc_ref_list)) {
7643                 node = list_first_entry(&ctx->rsrc_ref_list,
7644                                             struct io_rsrc_node, node);
7645                 /* recycle ref nodes in order */
7646                 if (!node->done)
7647                         break;
7648                 list_del(&node->node);
7649                 first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
7650         }
7651         spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
7652
7653         if (first_add)
7654                 mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ);
7655 }
7656
7657 static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
7658 {
7659         struct io_rsrc_node *ref_node;
7660
7661         ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
7662         if (!ref_node)
7663                 return NULL;
7664
7665         if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
7666                             0, GFP_KERNEL)) {
7667                 kfree(ref_node);
7668                 return NULL;
7669         }
7670         INIT_LIST_HEAD(&ref_node->node);
7671         INIT_LIST_HEAD(&ref_node->rsrc_list);
7672         ref_node->done = false;
7673         return ref_node;
7674 }
7675
7676 static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
7677                                 struct io_rsrc_data *data_to_kill)
7678         __must_hold(&ctx->uring_lock)
7679 {
7680         WARN_ON_ONCE(!ctx->rsrc_backup_node);
7681         WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
7682
7683         io_rsrc_refs_drop(ctx);
7684
7685         if (data_to_kill) {
7686                 struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
7687
7688                 rsrc_node->rsrc_data = data_to_kill;
7689                 spin_lock_irq(&ctx->rsrc_ref_lock);
7690                 list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
7691                 spin_unlock_irq(&ctx->rsrc_ref_lock);
7692
7693                 atomic_inc(&data_to_kill->refs);
7694                 percpu_ref_kill(&rsrc_node->refs);
7695                 ctx->rsrc_node = NULL;
7696         }
7697
7698         if (!ctx->rsrc_node) {
7699                 ctx->rsrc_node = ctx->rsrc_backup_node;
7700                 ctx->rsrc_backup_node = NULL;
7701         }
7702 }
7703
7704 static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
7705 {
7706         if (ctx->rsrc_backup_node)
7707                 return 0;
7708         ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx);
7709         return ctx->rsrc_backup_node ? 0 : -ENOMEM;
7710 }
7711
7712 static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
7713                                       struct io_ring_ctx *ctx)
7714 {
7715         int ret;
7716
7717         /* As we may drop ->uring_lock, other task may have started quiesce */
7718         if (data->quiesce)
7719                 return -ENXIO;
7720
7721         data->quiesce = true;
7722         do {
7723                 ret = io_rsrc_node_switch_start(ctx);
7724                 if (ret)
7725                         break;
7726                 io_rsrc_node_switch(ctx, data);
7727
7728                 /* kill initial ref, already quiesced if zero */
7729                 if (atomic_dec_and_test(&data->refs))
7730                         break;
7731                 mutex_unlock(&ctx->uring_lock);
7732                 flush_delayed_work(&ctx->rsrc_put_work);
7733                 ret = wait_for_completion_interruptible(&data->done);
7734                 if (!ret) {
7735                         mutex_lock(&ctx->uring_lock);
7736                         break;
7737                 }
7738
7739                 atomic_inc(&data->refs);
7740                 /* wait for all works potentially completing data->done */
7741                 flush_delayed_work(&ctx->rsrc_put_work);
7742                 reinit_completion(&data->done);
7743
7744                 ret = io_run_task_work_sig();
7745                 mutex_lock(&ctx->uring_lock);
7746         } while (ret >= 0);
7747         data->quiesce = false;
7748
7749         return ret;
7750 }
7751
7752 static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
7753 {
7754         unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
7755         unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
7756
7757         return &data->tags[table_idx][off];
7758 }
7759
7760 static void io_rsrc_data_free(struct io_rsrc_data *data)
7761 {
7762         size_t size = data->nr * sizeof(data->tags[0][0]);
7763
7764         if (data->tags)
7765                 io_free_page_table((void **)data->tags, size);
7766         kfree(data);
7767 }
7768
7769 static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
7770                                      u64 __user *utags, unsigned nr,
7771                                      struct io_rsrc_data **pdata)
7772 {
7773         struct io_rsrc_data *data;
7774         int ret = -ENOMEM;
7775         unsigned i;
7776
7777         data = kzalloc(sizeof(*data), GFP_KERNEL);
7778         if (!data)
7779                 return -ENOMEM;
7780         data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
7781         if (!data->tags) {
7782                 kfree(data);
7783                 return -ENOMEM;
7784         }
7785
7786         data->nr = nr;
7787         data->ctx = ctx;
7788         data->do_put = do_put;
7789         if (utags) {
7790                 ret = -EFAULT;
7791                 for (i = 0; i < nr; i++) {
7792                         u64 *tag_slot = io_get_tag_slot(data, i);
7793
7794                         if (copy_from_user(tag_slot, &utags[i],
7795                                            sizeof(*tag_slot)))
7796                                 goto fail;
7797                 }
7798         }
7799
7800         atomic_set(&data->refs, 1);
7801         init_completion(&data->done);
7802         *pdata = data;
7803         return 0;
7804 fail:
7805         io_rsrc_data_free(data);
7806         return ret;
7807 }
7808
7809 static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
7810 {
7811         table->files = kvcalloc(nr_files, sizeof(table->files[0]),
7812                                 GFP_KERNEL_ACCOUNT);
7813         return !!table->files;
7814 }
7815
7816 static void io_free_file_tables(struct io_file_table *table)
7817 {
7818         kvfree(table->files);
7819         table->files = NULL;
7820 }
7821
7822 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
7823 {
7824 #if defined(CONFIG_UNIX)
7825         if (ctx->ring_sock) {
7826                 struct sock *sock = ctx->ring_sock->sk;
7827                 struct sk_buff *skb;
7828
7829                 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
7830                         kfree_skb(skb);
7831         }
7832 #else
7833         int i;
7834
7835         for (i = 0; i < ctx->nr_user_files; i++) {
7836                 struct file *file;
7837
7838                 file = io_file_from_index(ctx, i);
7839                 if (file)
7840                         fput(file);
7841         }
7842 #endif
7843         io_free_file_tables(&ctx->file_table);
7844         io_rsrc_data_free(ctx->file_data);
7845         ctx->file_data = NULL;
7846         ctx->nr_user_files = 0;
7847 }
7848
7849 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
7850 {
7851         int ret;
7852
7853         if (!ctx->file_data)
7854                 return -ENXIO;
7855         ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
7856         if (!ret)
7857                 __io_sqe_files_unregister(ctx);
7858         return ret;
7859 }
7860
7861 static void io_sq_thread_unpark(struct io_sq_data *sqd)
7862         __releases(&sqd->lock)
7863 {
7864         WARN_ON_ONCE(sqd->thread == current);
7865
7866         /*
7867          * Do the dance but not conditional clear_bit() because it'd race with
7868          * other threads incrementing park_pending and setting the bit.
7869          */
7870         clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
7871         if (atomic_dec_return(&sqd->park_pending))
7872                 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
7873         mutex_unlock(&sqd->lock);
7874 }
7875
7876 static void io_sq_thread_park(struct io_sq_data *sqd)
7877         __acquires(&sqd->lock)
7878 {
7879         WARN_ON_ONCE(sqd->thread == current);
7880
7881         atomic_inc(&sqd->park_pending);
7882         set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
7883         mutex_lock(&sqd->lock);
7884         if (sqd->thread)
7885                 wake_up_process(sqd->thread);
7886 }
7887
7888 static void io_sq_thread_stop(struct io_sq_data *sqd)
7889 {
7890         WARN_ON_ONCE(sqd->thread == current);
7891         WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
7892
7893         set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
7894         mutex_lock(&sqd->lock);
7895         if (sqd->thread)
7896                 wake_up_process(sqd->thread);
7897         mutex_unlock(&sqd->lock);
7898         wait_for_completion(&sqd->exited);
7899 }
7900
7901 static void io_put_sq_data(struct io_sq_data *sqd)
7902 {
7903         if (refcount_dec_and_test(&sqd->refs)) {
7904                 WARN_ON_ONCE(atomic_read(&sqd->park_pending));
7905
7906                 io_sq_thread_stop(sqd);
7907                 kfree(sqd);
7908         }
7909 }
7910
7911 static void io_sq_thread_finish(struct io_ring_ctx *ctx)
7912 {
7913         struct io_sq_data *sqd = ctx->sq_data;
7914
7915         if (sqd) {
7916                 io_sq_thread_park(sqd);
7917                 list_del_init(&ctx->sqd_list);
7918                 io_sqd_update_thread_idle(sqd);
7919                 io_sq_thread_unpark(sqd);
7920
7921                 io_put_sq_data(sqd);
7922                 ctx->sq_data = NULL;
7923         }
7924 }
7925
7926 static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
7927 {
7928         struct io_ring_ctx *ctx_attach;
7929         struct io_sq_data *sqd;
7930         struct fd f;
7931
7932         f = fdget(p->wq_fd);
7933         if (!f.file)
7934                 return ERR_PTR(-ENXIO);
7935         if (f.file->f_op != &io_uring_fops) {
7936                 fdput(f);
7937                 return ERR_PTR(-EINVAL);
7938         }
7939
7940         ctx_attach = f.file->private_data;
7941         sqd = ctx_attach->sq_data;
7942         if (!sqd) {
7943                 fdput(f);
7944                 return ERR_PTR(-EINVAL);
7945         }
7946         if (sqd->task_tgid != current->tgid) {
7947                 fdput(f);
7948                 return ERR_PTR(-EPERM);
7949         }
7950
7951         refcount_inc(&sqd->refs);
7952         fdput(f);
7953         return sqd;
7954 }
7955
7956 static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
7957                                          bool *attached)
7958 {
7959         struct io_sq_data *sqd;
7960
7961         *attached = false;
7962         if (p->flags & IORING_SETUP_ATTACH_WQ) {
7963                 sqd = io_attach_sq_data(p);
7964                 if (!IS_ERR(sqd)) {
7965                         *attached = true;
7966                         return sqd;
7967                 }
7968                 /* fall through for EPERM case, setup new sqd/task */
7969                 if (PTR_ERR(sqd) != -EPERM)
7970                         return sqd;
7971         }
7972
7973         sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
7974         if (!sqd)
7975                 return ERR_PTR(-ENOMEM);
7976
7977         atomic_set(&sqd->park_pending, 0);
7978         refcount_set(&sqd->refs, 1);
7979         INIT_LIST_HEAD(&sqd->ctx_list);
7980         mutex_init(&sqd->lock);
7981         init_waitqueue_head(&sqd->wait);
7982         init_completion(&sqd->exited);
7983         return sqd;
7984 }
7985
7986 #if defined(CONFIG_UNIX)
7987 /*
7988  * Ensure the UNIX gc is aware of our file set, so we are certain that
7989  * the io_uring can be safely unregistered on process exit, even if we have
7990  * loops in the file referencing.
7991  */
7992 static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
7993 {
7994         struct sock *sk = ctx->ring_sock->sk;
7995         struct scm_fp_list *fpl;
7996         struct sk_buff *skb;
7997         int i, nr_files;
7998
7999         fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
8000         if (!fpl)
8001                 return -ENOMEM;
8002
8003         skb = alloc_skb(0, GFP_KERNEL);
8004         if (!skb) {
8005                 kfree(fpl);
8006                 return -ENOMEM;
8007         }
8008
8009         skb->sk = sk;
8010
8011         nr_files = 0;
8012         fpl->user = get_uid(current_user());
8013         for (i = 0; i < nr; i++) {
8014                 struct file *file = io_file_from_index(ctx, i + offset);
8015
8016                 if (!file)
8017                         continue;
8018                 fpl->fp[nr_files] = get_file(file);
8019                 unix_inflight(fpl->user, fpl->fp[nr_files]);
8020                 nr_files++;
8021         }
8022
8023         if (nr_files) {
8024                 fpl->max = SCM_MAX_FD;
8025                 fpl->count = nr_files;
8026                 UNIXCB(skb).fp = fpl;
8027                 skb->destructor = unix_destruct_scm;
8028                 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
8029                 skb_queue_head(&sk->sk_receive_queue, skb);
8030
8031                 for (i = 0; i < nr_files; i++)
8032                         fput(fpl->fp[i]);
8033         } else {
8034                 kfree_skb(skb);
8035                 kfree(fpl);
8036         }
8037
8038         return 0;
8039 }
8040
8041 /*
8042  * If UNIX sockets are enabled, fd passing can cause a reference cycle which
8043  * causes regular reference counting to break down. We rely on the UNIX
8044  * garbage collection to take care of this problem for us.
8045  */
8046 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
8047 {
8048         unsigned left, total;
8049         int ret = 0;
8050
8051         total = 0;
8052         left = ctx->nr_user_files;
8053         while (left) {
8054                 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
8055
8056                 ret = __io_sqe_files_scm(ctx, this_files, total);
8057                 if (ret)
8058                         break;
8059                 left -= this_files;
8060                 total += this_files;
8061         }
8062
8063         if (!ret)
8064                 return 0;
8065
8066         while (total < ctx->nr_user_files) {
8067                 struct file *file = io_file_from_index(ctx, total);
8068
8069                 if (file)
8070                         fput(file);
8071                 total++;
8072         }
8073
8074         return ret;
8075 }
8076 #else
/*
 * Variant built when CONFIG_UNIX is not defined: does nothing and always
 * reports success.
 */
static int io_sqe_files_scm(struct io_ring_ctx *ctx)
{
	return 0;
}
8081 #endif
8082
8083 static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
8084 {
8085         struct file *file = prsrc->file;
8086 #if defined(CONFIG_UNIX)
8087         struct sock *sock = ctx->ring_sock->sk;
8088         struct sk_buff_head list, *head = &sock->sk_receive_queue;
8089         struct sk_buff *skb;
8090         int i;
8091
8092         __skb_queue_head_init(&list);
8093
8094         /*
8095          * Find the skb that holds this file in its SCM_RIGHTS. When found,
8096          * remove this entry and rearrange the file array.
8097          */
8098         skb = skb_dequeue(head);
8099         while (skb) {
8100                 struct scm_fp_list *fp;
8101
8102                 fp = UNIXCB(skb).fp;
8103                 for (i = 0; i < fp->count; i++) {
8104                         int left;
8105
8106                         if (fp->fp[i] != file)
8107                                 continue;
8108
8109                         unix_notinflight(fp->user, fp->fp[i]);
8110                         left = fp->count - 1 - i;
8111                         if (left) {
8112                                 memmove(&fp->fp[i], &fp->fp[i + 1],
8113                                                 left * sizeof(struct file *));
8114                         }
8115                         fp->count--;
8116                         if (!fp->count) {
8117                                 kfree_skb(skb);
8118                                 skb = NULL;
8119                         } else {
8120                                 __skb_queue_tail(&list, skb);
8121                         }
8122                         fput(file);
8123                         file = NULL;
8124                         break;
8125                 }
8126
8127                 if (!file)
8128                         break;
8129
8130                 __skb_queue_tail(&list, skb);
8131
8132                 skb = skb_dequeue(head);
8133         }
8134
8135         if (skb_peek(&list)) {
8136                 spin_lock_irq(&head->lock);
8137                 while ((skb = __skb_dequeue(&list)) != NULL)
8138                         __skb_queue_tail(head, skb);
8139                 spin_unlock_irq(&head->lock);
8140         }
8141 #else
8142         fput(file);
8143 #endif
8144 }
8145
8146 static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
8147 {
8148         struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
8149         struct io_ring_ctx *ctx = rsrc_data->ctx;
8150         struct io_rsrc_put *prsrc, *tmp;
8151
8152         list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
8153                 list_del(&prsrc->list);
8154
8155                 if (prsrc->tag) {
8156                         bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;
8157
8158                         io_ring_submit_lock(ctx, lock_ring);
8159                         spin_lock(&ctx->completion_lock);
8160                         io_cqring_fill_event(ctx, prsrc->tag, 0, 0);
8161                         ctx->cq_extra++;
8162                         io_commit_cqring(ctx);
8163                         spin_unlock(&ctx->completion_lock);
8164                         io_cqring_ev_posted(ctx);
8165                         io_ring_submit_unlock(ctx, lock_ring);
8166                 }
8167
8168                 rsrc_data->do_put(ctx, prsrc);
8169                 kfree(prsrc);
8170         }
8171
8172         io_rsrc_node_destroy(ref_node);
8173         if (atomic_dec_and_test(&rsrc_data->refs))
8174                 complete(&rsrc_data->done);
8175 }
8176
8177 static void io_rsrc_put_work(struct work_struct *work)
8178 {
8179         struct io_ring_ctx *ctx;
8180         struct llist_node *node;
8181
8182         ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
8183         node = llist_del_all(&ctx->rsrc_put_llist);
8184
8185         while (node) {
8186                 struct io_rsrc_node *ref_node;
8187                 struct llist_node *next = node->next;
8188
8189                 ref_node = llist_entry(node, struct io_rsrc_node, llist);
8190                 __io_rsrc_put_work(ref_node);
8191                 node = next;
8192         }
8193 }
8194
8195 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
8196                                  unsigned nr_args, u64 __user *tags)
8197 {
8198         __s32 __user *fds = (__s32 __user *) arg;
8199         struct file *file;
8200         int fd, ret;
8201         unsigned i;
8202
8203         if (ctx->file_data)
8204                 return -EBUSY;
8205         if (!nr_args)
8206                 return -EINVAL;
8207         if (nr_args > IORING_MAX_FIXED_FILES)
8208                 return -EMFILE;
8209         if (nr_args > rlimit(RLIMIT_NOFILE))
8210                 return -EMFILE;
8211         ret = io_rsrc_node_switch_start(ctx);
8212         if (ret)
8213                 return ret;
8214         ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
8215                                  &ctx->file_data);
8216         if (ret)
8217                 return ret;
8218
8219         ret = -ENOMEM;
8220         if (!io_alloc_file_tables(&ctx->file_table, nr_args))
8221                 goto out_free;
8222
8223         for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
8224                 if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
8225                         ret = -EFAULT;
8226                         goto out_fput;
8227                 }
8228                 /* allow sparse sets */
8229                 if (fd == -1) {
8230                         ret = -EINVAL;
8231                         if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
8232                                 goto out_fput;
8233                         continue;
8234                 }
8235
8236                 file = fget(fd);
8237                 ret = -EBADF;
8238                 if (unlikely(!file))
8239                         goto out_fput;
8240
8241                 /*
8242                  * Don't allow io_uring instances to be registered. If UNIX
8243                  * isn't enabled, then this causes a reference cycle and this
8244                  * instance can never get freed. If UNIX is enabled we'll
8245                  * handle it just fine, but there's still no point in allowing
8246                  * a ring fd as it doesn't support regular read/write anyway.
8247                  */
8248                 if (file->f_op == &io_uring_fops) {
8249                         fput(file);
8250                         goto out_fput;
8251                 }
8252                 io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
8253         }
8254
8255         ret = io_sqe_files_scm(ctx);
8256         if (ret) {
8257                 __io_sqe_files_unregister(ctx);
8258                 return ret;
8259         }
8260
8261         io_rsrc_node_switch(ctx, NULL);
8262         return ret;
8263 out_fput:
8264         for (i = 0; i < ctx->nr_user_files; i++) {
8265                 file = io_file_from_index(ctx, i);
8266                 if (file)
8267                         fput(file);
8268         }
8269         io_free_file_tables(&ctx->file_table);
8270         ctx->nr_user_files = 0;
8271 out_free:
8272         io_rsrc_data_free(ctx->file_data);
8273         ctx->file_data = NULL;
8274         return ret;
8275 }
8276
/*
 * Account a single newly installed @file (slot @index) with the UNIX gc.
 *
 * With CONFIG_UNIX, the file is merged into the SCM_RIGHTS list of the skb
 * at the head of the ring socket queue if that list has room; otherwise a
 * new skb is set up through __io_sqe_files_scm(). Without CONFIG_UNIX this
 * does nothing. Returns 0 on success or the __io_sqe_files_scm() error.
 */
static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
				int index)
{
#if defined(CONFIG_UNIX)
	struct sk_buff_head *head = &ctx->ring_sock->sk->sk_receive_queue;
	struct sk_buff *skb;
	bool merged = false;

	spin_lock_irq(&head->lock);
	skb = skb_peek(head);
	if (skb) {
		struct scm_fp_list *fpl = UNIXCB(skb).fp;

		if (fpl->count < SCM_MAX_FD) {
			/* take the skb off the queue while its list is edited */
			__skb_unlink(skb, head);
			spin_unlock_irq(&head->lock);
			fpl->fp[fpl->count] = get_file(file);
			unix_inflight(fpl->user, fpl->fp[fpl->count]);
			fpl->count++;
			spin_lock_irq(&head->lock);
			__skb_queue_head(head, skb);
			merged = true;
		}
	}
	spin_unlock_irq(&head->lock);

	/* no skb with room: fall back to allocating and filling a new one */
	if (!merged)
		return __io_sqe_files_scm(ctx, 1, index);

	fput(file);
	return 0;
#else
	return 0;
#endif
}
8319
8320 static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
8321                                  struct io_rsrc_node *node, void *rsrc)
8322 {
8323         struct io_rsrc_put *prsrc;
8324
8325         prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
8326         if (!prsrc)
8327                 return -ENOMEM;
8328
8329         prsrc->tag = *io_get_tag_slot(data, idx);
8330         prsrc->rsrc = rsrc;
8331         list_add(&prsrc->list, &node->rsrc_list);
8332         return 0;
8333 }
8334
8335 static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
8336                                  unsigned int issue_flags, u32 slot_index)
8337 {
8338         struct io_ring_ctx *ctx = req->ctx;
8339         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
8340         bool needs_switch = false;
8341         struct io_fixed_file *file_slot;
8342         int ret = -EBADF;
8343
8344         io_ring_submit_lock(ctx, !force_nonblock);
8345         if (file->f_op == &io_uring_fops)
8346                 goto err;
8347         ret = -ENXIO;
8348         if (!ctx->file_data)
8349                 goto err;
8350         ret = -EINVAL;
8351         if (slot_index >= ctx->nr_user_files)
8352                 goto err;
8353
8354         slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
8355         file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
8356
8357         if (file_slot->file_ptr) {
8358                 struct file *old_file;
8359
8360                 ret = io_rsrc_node_switch_start(ctx);
8361                 if (ret)
8362                         goto err;
8363
8364                 old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
8365                 ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
8366                                             ctx->rsrc_node, old_file);
8367                 if (ret)
8368                         goto err;
8369                 file_slot->file_ptr = 0;
8370                 needs_switch = true;
8371         }
8372
8373         *io_get_tag_slot(ctx->file_data, slot_index) = 0;
8374         io_fixed_file_set(file_slot, file);
8375         ret = io_sqe_file_register(ctx, file, slot_index);
8376         if (ret) {
8377                 file_slot->file_ptr = 0;
8378                 goto err;
8379         }
8380
8381         ret = 0;
8382 err:
8383         if (needs_switch)
8384                 io_rsrc_node_switch(ctx, ctx->file_data);
8385         io_ring_submit_unlock(ctx, !force_nonblock);
8386         if (ret)
8387                 fput(file);
8388         return ret;
8389 }
8390
8391 static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
8392 {
8393         unsigned int offset = req->close.file_slot - 1;
8394         struct io_ring_ctx *ctx = req->ctx;
8395         struct io_fixed_file *file_slot;
8396         struct file *file;
8397         int ret, i;
8398
8399         io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
8400         ret = -ENXIO;
8401         if (unlikely(!ctx->file_data))
8402                 goto out;
8403         ret = -EINVAL;
8404         if (offset >= ctx->nr_user_files)
8405                 goto out;
8406         ret = io_rsrc_node_switch_start(ctx);
8407         if (ret)
8408                 goto out;
8409
8410         i = array_index_nospec(offset, ctx->nr_user_files);
8411         file_slot = io_fixed_file_slot(&ctx->file_table, i);
8412         ret = -EBADF;
8413         if (!file_slot->file_ptr)
8414                 goto out;
8415
8416         file = (struct file *)(file_slot->file_ptr & FFS_MASK);
8417         ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
8418         if (ret)
8419                 goto out;
8420
8421         file_slot->file_ptr = 0;
8422         io_rsrc_node_switch(ctx, ctx->file_data);
8423         ret = 0;
8424 out:
8425         io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
8426         return ret;
8427 }
8428
8429 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
8430                                  struct io_uring_rsrc_update2 *up,
8431                                  unsigned nr_args)
8432 {
8433         u64 __user *tags = u64_to_user_ptr(up->tags);
8434         __s32 __user *fds = u64_to_user_ptr(up->data);
8435         struct io_rsrc_data *data = ctx->file_data;
8436         struct io_fixed_file *file_slot;
8437         struct file *file;
8438         int fd, i, err = 0;
8439         unsigned int done;
8440         bool needs_switch = false;
8441
8442         if (!ctx->file_data)
8443                 return -ENXIO;
8444         if (up->offset + nr_args > ctx->nr_user_files)
8445                 return -EINVAL;
8446
8447         for (done = 0; done < nr_args; done++) {
8448                 u64 tag = 0;
8449
8450                 if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
8451                     copy_from_user(&fd, &fds[done], sizeof(fd))) {
8452                         err = -EFAULT;
8453                         break;
8454                 }
8455                 if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
8456                         err = -EINVAL;
8457                         break;
8458                 }
8459                 if (fd == IORING_REGISTER_FILES_SKIP)
8460                         continue;
8461
8462                 i = array_index_nospec(up->offset + done, ctx->nr_user_files);
8463                 file_slot = io_fixed_file_slot(&ctx->file_table, i);
8464
8465                 if (file_slot->file_ptr) {
8466                         file = (struct file *)(file_slot->file_ptr & FFS_MASK);
8467                         err = io_queue_rsrc_removal(data, up->offset + done,
8468                                                     ctx->rsrc_node, file);
8469                         if (err)
8470                                 break;
8471                         file_slot->file_ptr = 0;
8472                         needs_switch = true;
8473                 }
8474                 if (fd != -1) {
8475                         file = fget(fd);
8476                         if (!file) {
8477                                 err = -EBADF;
8478                                 break;
8479                         }
8480                         /*
8481                          * Don't allow io_uring instances to be registered. If
8482                          * UNIX isn't enabled, then this causes a reference
8483                          * cycle and this instance can never get freed. If UNIX
8484                          * is enabled we'll handle it just fine, but there's
8485                          * still no point in allowing a ring fd as it doesn't
8486                          * support regular read/write anyway.
8487                          */
8488                         if (file->f_op == &io_uring_fops) {
8489                                 fput(file);
8490                                 err = -EBADF;
8491                                 break;
8492                         }
8493                         *io_get_tag_slot(data, up->offset + done) = tag;
8494                         io_fixed_file_set(file_slot, file);
8495                         err = io_sqe_file_register(ctx, file, i);
8496                         if (err) {
8497                                 file_slot->file_ptr = 0;
8498                                 fput(file);
8499                                 break;
8500                         }
8501                 }
8502         }
8503
8504         if (needs_switch)
8505                 io_rsrc_node_switch(ctx, data);
8506         return done ? done : err;
8507 }
8508
8509 static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
8510                                         struct task_struct *task)
8511 {
8512         struct io_wq_hash *hash;
8513         struct io_wq_data data;
8514         unsigned int concurrency;
8515
8516         mutex_lock(&ctx->uring_lock);
8517         hash = ctx->hash_map;
8518         if (!hash) {
8519                 hash = kzalloc(sizeof(*hash), GFP_KERNEL);
8520                 if (!hash) {
8521                         mutex_unlock(&ctx->uring_lock);
8522                         return ERR_PTR(-ENOMEM);
8523                 }
8524                 refcount_set(&hash->refs, 1);
8525                 init_waitqueue_head(&hash->wait);
8526                 ctx->hash_map = hash;
8527         }
8528         mutex_unlock(&ctx->uring_lock);
8529
8530         data.hash = hash;
8531         data.task = task;
8532         data.free_work = io_wq_free_work;
8533         data.do_work = io_wq_submit_work;
8534
8535         /* Do QD, or 4 * CPUS, whatever is smallest */
8536         concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
8537
8538         return io_wq_create(concurrency, &data);
8539 }
8540
8541 static __cold int io_uring_alloc_task_context(struct task_struct *task,
8542                                               struct io_ring_ctx *ctx)
8543 {
8544         struct io_uring_task *tctx;
8545         int ret;
8546
8547         tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
8548         if (unlikely(!tctx))
8549                 return -ENOMEM;
8550
8551         ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
8552         if (unlikely(ret)) {
8553                 kfree(tctx);
8554                 return ret;
8555         }
8556
8557         tctx->io_wq = io_init_wq_offload(ctx, task);
8558         if (IS_ERR(tctx->io_wq)) {
8559                 ret = PTR_ERR(tctx->io_wq);
8560                 percpu_counter_destroy(&tctx->inflight);
8561                 kfree(tctx);
8562                 return ret;
8563         }
8564
8565         xa_init(&tctx->xa);
8566         init_waitqueue_head(&tctx->wait);
8567         atomic_set(&tctx->in_idle, 0);
8568         atomic_set(&tctx->inflight_tracked, 0);
8569         task->io_uring = tctx;
8570         spin_lock_init(&tctx->task_lock);
8571         INIT_WQ_LIST(&tctx->task_list);
8572         init_task_work(&tctx->task_work, tctx_task_work);
8573         return 0;
8574 }
8575
8576 void __io_uring_free(struct task_struct *tsk)
8577 {
8578         struct io_uring_task *tctx = tsk->io_uring;
8579
8580         WARN_ON_ONCE(!xa_empty(&tctx->xa));
8581         WARN_ON_ONCE(tctx->io_wq);
8582         WARN_ON_ONCE(tctx->cached_refs);
8583
8584         percpu_counter_destroy(&tctx->inflight);
8585         kfree(tctx);
8586         tsk->io_uring = NULL;
8587 }
8588
8589 static __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
8590                                        struct io_uring_params *p)
8591 {
8592         int ret;
8593
8594         /* Retain compatibility with failing for an invalid attach attempt */
8595         if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
8596                                 IORING_SETUP_ATTACH_WQ) {
8597                 struct fd f;
8598
8599                 f = fdget(p->wq_fd);
8600                 if (!f.file)
8601                         return -ENXIO;
8602                 if (f.file->f_op != &io_uring_fops) {
8603                         fdput(f);
8604                         return -EINVAL;
8605                 }
8606                 fdput(f);
8607         }
8608         if (ctx->flags & IORING_SETUP_SQPOLL) {
8609                 struct task_struct *tsk;
8610                 struct io_sq_data *sqd;
8611                 bool attached;
8612
8613                 sqd = io_get_sq_data(p, &attached);
8614                 if (IS_ERR(sqd)) {
8615                         ret = PTR_ERR(sqd);
8616                         goto err;
8617                 }
8618
8619                 ctx->sq_creds = get_current_cred();
8620                 ctx->sq_data = sqd;
8621                 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
8622                 if (!ctx->sq_thread_idle)
8623                         ctx->sq_thread_idle = HZ;
8624
8625                 io_sq_thread_park(sqd);
8626                 list_add(&ctx->sqd_list, &sqd->ctx_list);
8627                 io_sqd_update_thread_idle(sqd);
8628                 /* don't attach to a dying SQPOLL thread, would be racy */
8629                 ret = (attached && !sqd->thread) ? -ENXIO : 0;
8630                 io_sq_thread_unpark(sqd);
8631
8632                 if (ret < 0)
8633                         goto err;
8634                 if (attached)
8635                         return 0;
8636
8637                 if (p->flags & IORING_SETUP_SQ_AFF) {
8638                         int cpu = p->sq_thread_cpu;
8639
8640                         ret = -EINVAL;
8641                         if (cpu >= nr_cpu_ids || !cpu_online(cpu))
8642                                 goto err_sqpoll;
8643                         sqd->sq_cpu = cpu;
8644                 } else {
8645                         sqd->sq_cpu = -1;
8646                 }
8647
8648                 sqd->task_pid = current->pid;
8649                 sqd->task_tgid = current->tgid;
8650                 tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
8651                 if (IS_ERR(tsk)) {
8652                         ret = PTR_ERR(tsk);
8653                         goto err_sqpoll;
8654                 }
8655
8656                 sqd->thread = tsk;
8657                 ret = io_uring_alloc_task_context(tsk, ctx);
8658                 wake_up_new_task(tsk);
8659                 if (ret)
8660                         goto err;
8661         } else if (p->flags & IORING_SETUP_SQ_AFF) {
8662                 /* Can't have SQ_AFF without SQPOLL */
8663                 ret = -EINVAL;
8664                 goto err;
8665         }
8666
8667         return 0;
8668 err_sqpoll:
8669         complete(&ctx->sq_data->exited);
8670 err:
8671         io_sq_thread_finish(ctx);
8672         return ret;
8673 }
8674
8675 static inline void __io_unaccount_mem(struct user_struct *user,
8676                                       unsigned long nr_pages)
8677 {
8678         atomic_long_sub(nr_pages, &user->locked_vm);
8679 }
8680
8681 static inline int __io_account_mem(struct user_struct *user,
8682                                    unsigned long nr_pages)
8683 {
8684         unsigned long page_limit, cur_pages, new_pages;
8685
8686         /* Don't allow more pages than we can safely lock */
8687         page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
8688
8689         do {
8690                 cur_pages = atomic_long_read(&user->locked_vm);
8691                 new_pages = cur_pages + nr_pages;
8692                 if (new_pages > page_limit)
8693                         return -ENOMEM;
8694         } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
8695                                         new_pages) != cur_pages);
8696
8697         return 0;
8698 }
8699
8700 static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
8701 {
8702         if (ctx->user)
8703                 __io_unaccount_mem(ctx->user, nr_pages);
8704
8705         if (ctx->mm_account)
8706                 atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
8707 }
8708
8709 static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
8710 {
8711         int ret;
8712
8713         if (ctx->user) {
8714                 ret = __io_account_mem(ctx->user, nr_pages);
8715                 if (ret)
8716                         return ret;
8717         }
8718
8719         if (ctx->mm_account)
8720                 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
8721
8722         return 0;
8723 }
8724
/*
 * Drop a reference on the head page backing @ptr and free the compound page
 * when that was the last reference. A NULL @ptr is ignored.
 */
static void io_mem_free(void *ptr)
{
	struct page *head;

	if (ptr) {
		head = virt_to_head_page(ptr);
		if (put_page_testzero(head))
			free_compound_page(head);
	}
}
8736
8737 static void *io_mem_alloc(size_t size)
8738 {
8739         gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
8740                                 __GFP_NORETRY | __GFP_ACCOUNT;
8741
8742         return (void *) __get_free_pages(gfp_flags, get_order(size));
8743 }
8744
8745 static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
8746                                 size_t *sq_offset)
8747 {
8748         struct io_rings *rings;
8749         size_t off, sq_array_size;
8750
8751         off = struct_size(rings, cqes, cq_entries);
8752         if (off == SIZE_MAX)
8753                 return SIZE_MAX;
8754
8755 #ifdef CONFIG_SMP
8756         off = ALIGN(off, SMP_CACHE_BYTES);
8757         if (off == 0)
8758                 return SIZE_MAX;
8759 #endif
8760
8761         if (sq_offset)
8762                 *sq_offset = off;
8763
8764         sq_array_size = array_size(sizeof(u32), sq_entries);
8765         if (sq_array_size == SIZE_MAX)
8766                 return SIZE_MAX;
8767
8768         if (check_add_overflow(off, sq_array_size, &off))
8769                 return SIZE_MAX;
8770
8771         return off;
8772 }
8773
8774 static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
8775 {
8776         struct io_mapped_ubuf *imu = *slot;
8777         unsigned int i;
8778
8779         if (imu != ctx->dummy_ubuf) {
8780                 for (i = 0; i < imu->nr_bvecs; i++)
8781                         unpin_user_page(imu->bvec[i].bv_page);
8782                 if (imu->acct_pages)
8783                         io_unaccount_mem(ctx, imu->acct_pages);
8784                 kvfree(imu);
8785         }
8786         *slot = NULL;
8787 }
8788
8789 static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
8790 {
8791         io_buffer_unmap(ctx, &prsrc->buf);
8792         prsrc->buf = NULL;
8793 }
8794
8795 static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
8796 {
8797         unsigned int i;
8798
8799         for (i = 0; i < ctx->nr_user_bufs; i++)
8800                 io_buffer_unmap(ctx, &ctx->user_bufs[i]);
8801         kfree(ctx->user_bufs);
8802         io_rsrc_data_free(ctx->buf_data);
8803         ctx->user_bufs = NULL;
8804         ctx->buf_data = NULL;
8805         ctx->nr_user_bufs = 0;
8806 }
8807
8808 static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
8809 {
8810         int ret;
8811
8812         if (!ctx->buf_data)
8813                 return -ENXIO;
8814
8815         ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
8816         if (!ret)
8817                 __io_sqe_buffers_unregister(ctx);
8818         return ret;
8819 }
8820
8821 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
8822                        void __user *arg, unsigned index)
8823 {
8824         struct iovec __user *src;
8825
8826 #ifdef CONFIG_COMPAT
8827         if (ctx->compat) {
8828                 struct compat_iovec __user *ciovs;
8829                 struct compat_iovec ciov;
8830
8831                 ciovs = (struct compat_iovec __user *) arg;
8832                 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
8833                         return -EFAULT;
8834
8835                 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
8836                 dst->iov_len = ciov.iov_len;
8837                 return 0;
8838         }
8839 #endif
8840         src = (struct iovec __user *) arg;
8841         if (copy_from_user(dst, &src[index], sizeof(*dst)))
8842                 return -EFAULT;
8843         return 0;
8844 }
8845
8846 /*
8847  * Not super efficient, but this is just a registration time. And we do cache
8848  * the last compound head, so generally we'll only do a full search if we don't
8849  * match that one.
8850  *
8851  * We check if the given compound head page has already been accounted, to
8852  * avoid double accounting it. This allows us to account the full size of the
8853  * page, not just the constituent pages of a huge page.
8854  */
8855 static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
8856                                   int nr_pages, struct page *hpage)
8857 {
8858         int i, j;
8859
8860         /* check current page array */
8861         for (i = 0; i < nr_pages; i++) {
8862                 if (!PageCompound(pages[i]))
8863                         continue;
8864                 if (compound_head(pages[i]) == hpage)
8865                         return true;
8866         }
8867
8868         /* check previously registered pages */
8869         for (i = 0; i < ctx->nr_user_bufs; i++) {
8870                 struct io_mapped_ubuf *imu = ctx->user_bufs[i];
8871
8872                 for (j = 0; j < imu->nr_bvecs; j++) {
8873                         if (!PageCompound(imu->bvec[j].bv_page))
8874                                 continue;
8875                         if (compound_head(imu->bvec[j].bv_page) == hpage)
8876                                 return true;
8877                 }
8878         }
8879
8880         return false;
8881 }
8882
8883 static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
8884                                  int nr_pages, struct io_mapped_ubuf *imu,
8885                                  struct page **last_hpage)
8886 {
8887         int i, ret;
8888
8889         imu->acct_pages = 0;
8890         for (i = 0; i < nr_pages; i++) {
8891                 if (!PageCompound(pages[i])) {
8892                         imu->acct_pages++;
8893                 } else {
8894                         struct page *hpage;
8895
8896                         hpage = compound_head(pages[i]);
8897                         if (hpage == *last_hpage)
8898                                 continue;
8899                         *last_hpage = hpage;
8900                         if (headpage_already_acct(ctx, pages, i, hpage))
8901                                 continue;
8902                         imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
8903                 }
8904         }
8905
8906         if (!imu->acct_pages)
8907                 return 0;
8908
8909         ret = io_account_mem(ctx, imu->acct_pages);
8910         if (ret)
8911                 imu->acct_pages = 0;
8912         return ret;
8913 }
8914
8915 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
8916                                   struct io_mapped_ubuf **pimu,
8917                                   struct page **last_hpage)
8918 {
8919         struct io_mapped_ubuf *imu = NULL;
8920         struct vm_area_struct **vmas = NULL;
8921         struct page **pages = NULL;
8922         unsigned long off, start, end, ubuf;
8923         size_t size;
8924         int ret, pret, nr_pages, i;
8925
8926         if (!iov->iov_base) {
8927                 *pimu = ctx->dummy_ubuf;
8928                 return 0;
8929         }
8930
8931         ubuf = (unsigned long) iov->iov_base;
8932         end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
8933         start = ubuf >> PAGE_SHIFT;
8934         nr_pages = end - start;
8935
8936         *pimu = NULL;
8937         ret = -ENOMEM;
8938
8939         pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
8940         if (!pages)
8941                 goto done;
8942
8943         vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
8944                               GFP_KERNEL);
8945         if (!vmas)
8946                 goto done;
8947
8948         imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
8949         if (!imu)
8950                 goto done;
8951
8952         ret = 0;
8953         mmap_read_lock(current->mm);
8954         pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
8955                               pages, vmas);
8956         if (pret == nr_pages) {
8957                 /* don't support file backed memory */
8958                 for (i = 0; i < nr_pages; i++) {
8959                         struct vm_area_struct *vma = vmas[i];
8960
8961                         if (vma_is_shmem(vma))
8962                                 continue;
8963                         if (vma->vm_file &&
8964                             !is_file_hugepages(vma->vm_file)) {
8965                                 ret = -EOPNOTSUPP;
8966                                 break;
8967                         }
8968                 }
8969         } else {
8970                 ret = pret < 0 ? pret : -EFAULT;
8971         }
8972         mmap_read_unlock(current->mm);
8973         if (ret) {
8974                 /*
8975                  * if we did partial map, or found file backed vmas,
8976                  * release any pages we did get
8977                  */
8978                 if (pret > 0)
8979                         unpin_user_pages(pages, pret);
8980                 goto done;
8981         }
8982
8983         ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
8984         if (ret) {
8985                 unpin_user_pages(pages, pret);
8986                 goto done;
8987         }
8988
8989         off = ubuf & ~PAGE_MASK;
8990         size = iov->iov_len;
8991         for (i = 0; i < nr_pages; i++) {
8992                 size_t vec_len;
8993
8994                 vec_len = min_t(size_t, size, PAGE_SIZE - off);
8995                 imu->bvec[i].bv_page = pages[i];
8996                 imu->bvec[i].bv_len = vec_len;
8997                 imu->bvec[i].bv_offset = off;
8998                 off = 0;
8999                 size -= vec_len;
9000         }
9001         /* store original address for later verification */
9002         imu->ubuf = ubuf;
9003         imu->ubuf_end = ubuf + iov->iov_len;
9004         imu->nr_bvecs = nr_pages;
9005         *pimu = imu;
9006         ret = 0;
9007 done:
9008         if (ret)
9009                 kvfree(imu);
9010         kvfree(pages);
9011         kvfree(vmas);
9012         return ret;
9013 }
9014
9015 static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
9016 {
9017         ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
9018         return ctx->user_bufs ? 0 : -ENOMEM;
9019 }
9020
9021 static int io_buffer_validate(struct iovec *iov)
9022 {
9023         unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
9024
9025         /*
9026          * Don't impose further limits on the size and buffer
9027          * constraints here, we'll -EINVAL later when IO is
9028          * submitted if they are wrong.
9029          */
9030         if (!iov->iov_base)
9031                 return iov->iov_len ? -EFAULT : 0;
9032         if (!iov->iov_len)
9033                 return -EFAULT;
9034
9035         /* arbitrary limit, but we need something */
9036         if (iov->iov_len > SZ_1G)
9037                 return -EFAULT;
9038
9039         if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
9040                 return -EOVERFLOW;
9041
9042         return 0;
9043 }
9044
9045 static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
9046                                    unsigned int nr_args, u64 __user *tags)
9047 {
9048         struct page *last_hpage = NULL;
9049         struct io_rsrc_data *data;
9050         int i, ret;
9051         struct iovec iov;
9052
9053         if (ctx->user_bufs)
9054                 return -EBUSY;
9055         if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
9056                 return -EINVAL;
9057         ret = io_rsrc_node_switch_start(ctx);
9058         if (ret)
9059                 return ret;
9060         ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
9061         if (ret)
9062                 return ret;
9063         ret = io_buffers_map_alloc(ctx, nr_args);
9064         if (ret) {
9065                 io_rsrc_data_free(data);
9066                 return ret;
9067         }
9068
9069         for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
9070                 ret = io_copy_iov(ctx, &iov, arg, i);
9071                 if (ret)
9072                         break;
9073                 ret = io_buffer_validate(&iov);
9074                 if (ret)
9075                         break;
9076                 if (!iov.iov_base && *io_get_tag_slot(data, i)) {
9077                         ret = -EINVAL;
9078                         break;
9079                 }
9080
9081                 ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
9082                                              &last_hpage);
9083                 if (ret)
9084                         break;
9085         }
9086
9087         WARN_ON_ONCE(ctx->buf_data);
9088
9089         ctx->buf_data = data;
9090         if (ret)
9091                 __io_sqe_buffers_unregister(ctx);
9092         else
9093                 io_rsrc_node_switch(ctx, NULL);
9094         return ret;
9095 }
9096
9097 static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
9098                                    struct io_uring_rsrc_update2 *up,
9099                                    unsigned int nr_args)
9100 {
9101         u64 __user *tags = u64_to_user_ptr(up->tags);
9102         struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
9103         struct page *last_hpage = NULL;
9104         bool needs_switch = false;
9105         __u32 done;
9106         int i, err;
9107
9108         if (!ctx->buf_data)
9109                 return -ENXIO;
9110         if (up->offset + nr_args > ctx->nr_user_bufs)
9111                 return -EINVAL;
9112
9113         for (done = 0; done < nr_args; done++) {
9114                 struct io_mapped_ubuf *imu;
9115                 int offset = up->offset + done;
9116                 u64 tag = 0;
9117
9118                 err = io_copy_iov(ctx, &iov, iovs, done);
9119                 if (err)
9120                         break;
9121                 if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
9122                         err = -EFAULT;
9123                         break;
9124                 }
9125                 err = io_buffer_validate(&iov);
9126                 if (err)
9127                         break;
9128                 if (!iov.iov_base && tag) {
9129                         err = -EINVAL;
9130                         break;
9131                 }
9132                 err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
9133                 if (err)
9134                         break;
9135
9136                 i = array_index_nospec(offset, ctx->nr_user_bufs);
9137                 if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
9138                         err = io_queue_rsrc_removal(ctx->buf_data, offset,
9139                                                     ctx->rsrc_node, ctx->user_bufs[i]);
9140                         if (unlikely(err)) {
9141                                 io_buffer_unmap(ctx, &imu);
9142                                 break;
9143                         }
9144                         ctx->user_bufs[i] = NULL;
9145                         needs_switch = true;
9146                 }
9147
9148                 ctx->user_bufs[i] = imu;
9149                 *io_get_tag_slot(ctx->buf_data, offset) = tag;
9150         }
9151
9152         if (needs_switch)
9153                 io_rsrc_node_switch(ctx, ctx->buf_data);
9154         return done ? done : err;
9155 }
9156
9157 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
9158 {
9159         __s32 __user *fds = arg;
9160         int fd;
9161
9162         if (ctx->cq_ev_fd)
9163                 return -EBUSY;
9164
9165         if (copy_from_user(&fd, fds, sizeof(*fds)))
9166                 return -EFAULT;
9167
9168         ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
9169         if (IS_ERR(ctx->cq_ev_fd)) {
9170                 int ret = PTR_ERR(ctx->cq_ev_fd);
9171
9172                 ctx->cq_ev_fd = NULL;
9173                 return ret;
9174         }
9175
9176         return 0;
9177 }
9178
9179 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
9180 {
9181         if (ctx->cq_ev_fd) {
9182                 eventfd_ctx_put(ctx->cq_ev_fd);
9183                 ctx->cq_ev_fd = NULL;
9184                 return 0;
9185         }
9186
9187         return -ENXIO;
9188 }
9189
9190 static void io_destroy_buffers(struct io_ring_ctx *ctx)
9191 {
9192         struct io_buffer *buf;
9193         unsigned long index;
9194
9195         xa_for_each(&ctx->io_buffers, index, buf) {
9196                 __io_remove_buffers(ctx, buf, index, -1U);
9197                 cond_resched();
9198         }
9199 }
9200
9201 static void io_req_caches_free(struct io_ring_ctx *ctx)
9202 {
9203         struct io_submit_state *state = &ctx->submit_state;
9204         int nr = 0;
9205
9206         mutex_lock(&ctx->uring_lock);
9207         io_flush_cached_locked_reqs(ctx, state);
9208
9209         while (state->free_list.next) {
9210                 struct io_wq_work_node *node;
9211                 struct io_kiocb *req;
9212
9213                 node = wq_stack_extract(&state->free_list);
9214                 req = container_of(node, struct io_kiocb, comp_list);
9215                 kmem_cache_free(req_cachep, req);
9216                 nr++;
9217         }
9218         if (nr)
9219                 percpu_ref_put_many(&ctx->refs, nr);
9220         mutex_unlock(&ctx->uring_lock);
9221 }
9222
9223 static void io_wait_rsrc_data(struct io_rsrc_data *data)
9224 {
9225         if (data && !atomic_dec_and_test(&data->refs))
9226                 wait_for_completion(&data->done);
9227 }
9228
/*
 * Final teardown of @ctx.  Releases, in order: the SQ thread association,
 * the accounted mm, registered buffers and files, any overflowed CQEs, the
 * eventfd, provided buffers, SQ creds, rsrc nodes, the ring socket (when
 * CONFIG_UNIX is set), the ring and SQE memory, and finally @ctx itself.
 * The statement order matters; see the individual comments below.
 */
static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
	io_sq_thread_finish(ctx);

	if (ctx->mm_account) {
		mmdrop(ctx->mm_account);
		ctx->mm_account = NULL;
	}

	io_rsrc_refs_drop(ctx);
	/* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
	io_wait_rsrc_data(ctx->buf_data);
	io_wait_rsrc_data(ctx->file_data);

	/* the unregister helpers and the overflow flush run under uring_lock */
	mutex_lock(&ctx->uring_lock);
	if (ctx->buf_data)
		__io_sqe_buffers_unregister(ctx);
	if (ctx->file_data)
		__io_sqe_files_unregister(ctx);
	if (ctx->rings)
		__io_cqring_overflow_flush(ctx, true);
	mutex_unlock(&ctx->uring_lock);
	/* return value ignored: -ENXIO just means no eventfd was registered */
	io_eventfd_unregister(ctx);
	io_destroy_buffers(ctx);
	if (ctx->sq_creds)
		put_cred(ctx->sq_creds);

	/* there are no registered resources left, nobody uses it */
	if (ctx->rsrc_node)
		io_rsrc_node_destroy(ctx->rsrc_node);
	if (ctx->rsrc_backup_node)
		io_rsrc_node_destroy(ctx->rsrc_backup_node);
	flush_delayed_work(&ctx->rsrc_put_work);
	flush_delayed_work(&ctx->fallback_work);

	/* both rsrc lists are expected to be drained by now */
	WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
	WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		ctx->ring_sock->file = NULL; /* so that iput() is called */
		sock_release(ctx->ring_sock);
	}
#endif
	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));

	io_mem_free(ctx->rings);
	io_mem_free(ctx->sq_sqes);

	percpu_ref_exit(&ctx->refs);
	free_uid(ctx->user);
	io_req_caches_free(ctx);
	if (ctx->hash_map)
		io_wq_put_hash(ctx->hash_map);
	kfree(ctx->cancel_hash);
	kfree(ctx->dummy_ubuf);
	kfree(ctx);
}
9287
/*
 * poll() handler for the ring file.  Registers the caller on ctx->cq_wait
 * and reports EPOLLOUT | EPOLLWRNORM when the SQ ring is not full, and
 * EPOLLIN | EPOLLRDNORM when CQ events are available or bit 0 of
 * ctx->check_cq_overflow is set.
 */
static __poll_t io_uring_poll(struct file *file, poll_table *wait)
{
	struct io_ring_ctx *ctx = file->private_data;
	__poll_t mask = 0;

	poll_wait(file, &ctx->cq_wait, wait);
	/*
	 * synchronizes with barrier from wq_has_sleeper call in
	 * io_commit_cqring
	 */
	smp_rmb();
	if (!io_sqring_full(ctx))
		mask |= EPOLLOUT | EPOLLWRNORM;

	/*
	 * Don't flush the cqring overflow list here, just do a simple check.
	 * Otherwise there could possibly be an ABBA deadlock:
	 *      CPU0                    CPU1
	 *      ----                    ----
	 * lock(&ctx->uring_lock);
	 *                              lock(&ep->mtx);
	 *                              lock(&ctx->uring_lock);
	 * lock(&ep->mtx);
	 *
	 * Users may get EPOLLIN while seeing nothing in the cqring; this
	 * pushes them to do the flush.
	 */
	if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow))
		mask |= EPOLLIN | EPOLLRDNORM;

	return mask;
}
9320
9321 static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
9322 {
9323         const struct cred *creds;
9324
9325         creds = xa_erase(&ctx->personalities, id);
9326         if (creds) {
9327                 put_cred(creds);
9328                 return 0;
9329         }
9330
9331         return -EINVAL;
9332 }
9333
/*
 * Context handed to io_tctx_exit_cb() through task_work when a ring is
 * being torn down by io_ring_exit_work().
 */
struct io_tctx_exit {
	/* task_work entry; the callback recovers this struct via container_of() */
	struct callback_head		task_work;
	/* completed by io_tctx_exit_cb() once it has run */
	struct completion		completion;
	/* ring whose tctx node the callback removes */
	struct io_ring_ctx		*ctx;
};
9339
9340 static __cold void io_tctx_exit_cb(struct callback_head *cb)
9341 {
9342         struct io_uring_task *tctx = current->io_uring;
9343         struct io_tctx_exit *work;
9344
9345         work = container_of(cb, struct io_tctx_exit, task_work);
9346         /*
9347          * When @in_idle, we're in cancellation and it's racy to remove the
9348          * node. It'll be removed by the end of cancellation, just ignore it.
9349          */
9350         if (!atomic_read(&tctx->in_idle))
9351                 io_uring_del_tctx_node((unsigned long)work->ctx);
9352         complete(&work->completion);
9353 }
9354
9355 static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
9356 {
9357         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
9358
9359         return req->ctx == data;
9360 }
9361
/*
 * Work item that finishes tearing down a ring (queued by
 * io_ring_ctx_wait_and_kill()).  It repeatedly cancels outstanding requests
 * until ctx->ref_comp completes, then asks every task still listed in
 * ctx->tctx_list to drop its node, and finally frees the ctx.
 */
static __cold void io_ring_exit_work(struct work_struct *work)
{
	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
	/* after five minutes start warning and retry less often */
	unsigned long timeout = jiffies + HZ * 60 * 5;
	unsigned long interval = HZ / 20;
	struct io_tctx_exit exit;
	struct io_tctx_node *node;
	int ret;

	/*
	 * If we're doing polled IO and end up having requests being
	 * submitted async (out-of-line), then completions can come in while
	 * we're waiting for refs to drop. We need to reap these manually,
	 * as nobody else will be looking for them.
	 */
	do {
		io_uring_try_cancel_requests(ctx, NULL, true);
		if (ctx->sq_data) {
			struct io_sq_data *sqd = ctx->sq_data;
			struct task_struct *tsk;

			/* sqd->thread is only inspected while the SQ thread is parked */
			io_sq_thread_park(sqd);
			tsk = sqd->thread;
			if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
				io_wq_cancel_cb(tsk->io_uring->io_wq,
						io_cancel_ctx_cb, ctx, true);
			io_sq_thread_unpark(sqd);
		}

		io_req_caches_free(ctx);

		if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
			/* there is little hope left, don't run it too often */
			interval = HZ * 60;
		}
	} while (!wait_for_completion_timeout(&ctx->ref_comp, interval));

	init_completion(&exit.completion);
	init_task_work(&exit.task_work, io_tctx_exit_cb);
	exit.ctx = ctx;
	/*
	 * Some may use context even when all refs and requests have been put,
	 * and they are free to do so while still holding uring_lock or
	 * completion_lock, see io_req_task_submit(). Apart from other work,
	 * this lock/unlock section also waits them to finish.
	 */
	mutex_lock(&ctx->uring_lock);
	while (!list_empty(&ctx->tctx_list)) {
		WARN_ON_ONCE(time_after(jiffies, timeout));

		node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
					ctx_node);
		/* don't spin on a single task if cancellation failed */
		list_rotate_left(&ctx->tctx_list);
		ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
		if (WARN_ON_ONCE(ret))
			continue;

		/*
		 * Drop uring_lock while waiting: the callback removes the node
		 * via io_uring_del_tctx_node(), which takes uring_lock itself.
		 */
		mutex_unlock(&ctx->uring_lock);
		wait_for_completion(&exit.completion);
		mutex_lock(&ctx->uring_lock);
	}
	mutex_unlock(&ctx->uring_lock);
	/* empty lock/unlock pair: wait out any current completion_lock holder */
	spin_lock(&ctx->completion_lock);
	spin_unlock(&ctx->completion_lock);

	io_ring_ctx_free(ctx);
}
9430
9431 /* Returns true if we found and killed one or more timeouts */
9432 static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,
9433                                     struct task_struct *tsk, bool cancel_all)
9434 {
9435         struct io_kiocb *req, *tmp;
9436         int canceled = 0;
9437
9438         spin_lock(&ctx->completion_lock);
9439         spin_lock_irq(&ctx->timeout_lock);
9440         list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
9441                 if (io_match_task(req, tsk, cancel_all)) {
9442                         io_kill_timeout(req, -ECANCELED);
9443                         canceled++;
9444                 }
9445         }
9446         spin_unlock_irq(&ctx->timeout_lock);
9447         if (canceled != 0)
9448                 io_commit_cqring(ctx);
9449         spin_unlock(&ctx->completion_lock);
9450         if (canceled != 0)
9451                 io_cqring_ev_posted(ctx);
9452         return canceled != 0;
9453 }
9454
9455 static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
9456 {
9457         unsigned long index;
9458         struct creds *creds;
9459
9460         mutex_lock(&ctx->uring_lock);
9461         percpu_ref_kill(&ctx->refs);
9462         if (ctx->rings)
9463                 __io_cqring_overflow_flush(ctx, true);
9464         xa_for_each(&ctx->personalities, index, creds)
9465                 io_unregister_personality(ctx, index);
9466         mutex_unlock(&ctx->uring_lock);
9467
9468         io_kill_timeouts(ctx, NULL, true);
9469         io_poll_remove_all(ctx, NULL, true);
9470
9471         /* if we failed setting up the ctx, we might not have any rings */
9472         io_iopoll_try_reap_events(ctx);
9473
9474         INIT_WORK(&ctx->exit_work, io_ring_exit_work);
9475         /*
9476          * Use system_unbound_wq to avoid spawning tons of event kworkers
9477          * if we're exiting a ton of rings at the same time. It just adds
9478          * noise and overhead, there's no discernable change in runtime
9479          * over using system_wq.
9480          */
9481         queue_work(system_unbound_wq, &ctx->exit_work);
9482 }
9483
9484 static int io_uring_release(struct inode *inode, struct file *file)
9485 {
9486         struct io_ring_ctx *ctx = file->private_data;
9487
9488         file->private_data = NULL;
9489         io_ring_ctx_wait_and_kill(ctx);
9490         return 0;
9491 }
9492
/* Match criteria passed to io_cancel_task_cb() through its @data argument */
struct io_task_cancel {
	/* forwarded to io_match_task() as the task to match */
	struct task_struct *task;
	/* forwarded to io_match_task() as its cancel_all argument */
	bool all;
};
9497
9498 static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
9499 {
9500         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
9501         struct io_task_cancel *cancel = data;
9502         bool ret;
9503
9504         if (!cancel->all && (req->flags & REQ_F_LINK_TIMEOUT)) {
9505                 struct io_ring_ctx *ctx = req->ctx;
9506
9507                 /* protect against races with linked timeouts */
9508                 spin_lock(&ctx->completion_lock);
9509                 ret = io_match_task(req, cancel->task, cancel->all);
9510                 spin_unlock(&ctx->completion_lock);
9511         } else {
9512                 ret = io_match_task(req, cancel->task, cancel->all);
9513         }
9514         return ret;
9515 }
9516
9517 static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
9518                                          struct task_struct *task,
9519                                          bool cancel_all)
9520 {
9521         struct io_defer_entry *de;
9522         LIST_HEAD(list);
9523
9524         spin_lock(&ctx->completion_lock);
9525         list_for_each_entry_reverse(de, &ctx->defer_list, list) {
9526                 if (io_match_task(de->req, task, cancel_all)) {
9527                         list_cut_position(&list, &ctx->defer_list, &de->list);
9528                         break;
9529                 }
9530         }
9531         spin_unlock(&ctx->completion_lock);
9532         if (list_empty(&list))
9533                 return false;
9534
9535         while (!list_empty(&list)) {
9536                 de = list_first_entry(&list, struct io_defer_entry, list);
9537                 list_del_init(&de->list);
9538                 io_req_complete_failed(de->req, -ECANCELED);
9539                 kfree(de);
9540         }
9541         return true;
9542 }
9543
9544 static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
9545 {
9546         struct io_tctx_node *node;
9547         enum io_wq_cancel cret;
9548         bool ret = false;
9549
9550         mutex_lock(&ctx->uring_lock);
9551         list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
9552                 struct io_uring_task *tctx = node->task->io_uring;
9553
9554                 /*
9555                  * io_wq will stay alive while we hold uring_lock, because it's
9556                  * killed after ctx nodes, which requires to take the lock.
9557                  */
9558                 if (!tctx || !tctx->io_wq)
9559                         continue;
9560                 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
9561                 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
9562         }
9563         mutex_unlock(&ctx->uring_lock);
9564
9565         return ret;
9566 }
9567
/*
 * Cancel requests of @ctx, looping until a full pass finds nothing more to
 * cancel.  With @task == NULL the io-wq of every task attached to the ring
 * is searched; otherwise only @task's io-wq is.  @cancel_all is forwarded
 * to the matching helpers.
 */
static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
						struct task_struct *task,
						bool cancel_all)
{
	struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
	struct io_uring_task *tctx = task ? task->io_uring : NULL;

	while (1) {
		enum io_wq_cancel cret;
		/* set whenever this pass made progress, forcing another pass */
		bool ret = false;

		if (!task) {
			ret |= io_uring_try_cancel_iowq(ctx);
		} else if (tctx && tctx->io_wq) {
			/*
			 * Cancels requests of all rings, not only @ctx, but
			 * it's fine as the task is in exit/exec.
			 */
			cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
					       &cancel, true);
			ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
		}

		/* SQPOLL thread does its own polling */
		if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
		    (ctx->sq_data && ctx->sq_data->thread == current)) {
			while (!wq_list_empty(&ctx->iopoll_list)) {
				io_iopoll_try_reap_events(ctx);
				ret = true;
			}
		}

		ret |= io_cancel_defer_files(ctx, task, cancel_all);
		ret |= io_poll_remove_all(ctx, task, cancel_all);
		ret |= io_kill_timeouts(ctx, task, cancel_all);
		/* task_work is only run when cancelling on behalf of a task */
		if (task)
			ret |= io_run_task_work();
		if (!ret)
			break;
		cond_resched();
	}
}
9610
/*
 * Slow path of io_uring_add_tctx_node(): make sure the current task has an
 * io_uring task context and a node linking it to @ctx, then remember @ctx
 * in tctx->last.
 *
 * Returns 0 on success, -ENOMEM if the node cannot be allocated, or the
 * error from io_uring_alloc_task_context() / xa_store().
 */
static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
{
	struct io_uring_task *tctx = current->io_uring;
	struct io_tctx_node *node;
	int ret;

	if (unlikely(!tctx)) {
		ret = io_uring_alloc_task_context(current, ctx);
		if (unlikely(ret))
			return ret;
		tctx = current->io_uring;
	}
	/* the per-task xarray is keyed by the ctx pointer value */
	if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
		node = kmalloc(sizeof(*node), GFP_KERNEL);
		if (!node)
			return -ENOMEM;
		node->ctx = ctx;
		node->task = current;

		ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
					node, GFP_KERNEL));
		if (ret) {
			kfree(node);
			return ret;
		}

		/* ctx->tctx_list is modified under uring_lock */
		mutex_lock(&ctx->uring_lock);
		list_add(&node->ctx_node, &ctx->tctx_list);
		mutex_unlock(&ctx->uring_lock);
	}
	tctx->last = ctx;
	return 0;
}
9644
9645 /*
9646  * Note that this task has used io_uring. We use it for cancelation purposes.
9647  */
9648 static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
9649 {
9650         struct io_uring_task *tctx = current->io_uring;
9651
9652         if (likely(tctx && tctx->last == ctx))
9653                 return 0;
9654         return __io_uring_add_tctx_node(ctx);
9655 }
9656
/*
 * Remove this io_uring_file -> task mapping.
 *
 * @index is the key of the node in the current task's tctx->xa (callers in
 * this file pass the ctx pointer value).  Does nothing when the current
 * task has no io_uring context or no node is stored under @index.
 */
static __cold void io_uring_del_tctx_node(unsigned long index)
{
	struct io_uring_task *tctx = current->io_uring;
	struct io_tctx_node *node;

	if (!tctx)
		return;
	node = xa_erase(&tctx->xa, index);
	if (!node)
		return;

	WARN_ON_ONCE(current != node->task);
	WARN_ON_ONCE(list_empty(&node->ctx_node));

	/* ctx->tctx_list is modified under uring_lock */
	mutex_lock(&node->ctx->uring_lock);
	list_del(&node->ctx_node);
	mutex_unlock(&node->ctx->uring_lock);

	/* drop the cached "last used ring" if it is the one going away */
	if (tctx->last == node->ctx)
		tctx->last = NULL;
	kfree(node);
}
9682
/*
 * Remove every ring node recorded in @tctx->xa and then shut down the
 * task's io-wq, if it has one.
 */
static __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
{
	struct io_wq *wq = tctx->io_wq;
	struct io_tctx_node *node;
	unsigned long index;

	xa_for_each(&tctx->xa, index, node) {
		io_uring_del_tctx_node(index);
		cond_resched();
	}
	if (wq) {
		/*
		 * Must be after io_uring_del_tctx_node() (removes nodes under
		 * uring_lock) to avoid race with io_uring_try_cancel_iowq().
		 */
		io_wq_put_and_exit(wq);
		tctx->io_wq = NULL;
	}
}
9702
9703 static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
9704 {
9705         if (tracked)
9706                 return atomic_read(&tctx->inflight_tracked);
9707         return percpu_counter_sum(&tctx->inflight);
9708 }
9709
9710 static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
9711 {
9712         struct io_uring_task *tctx = task->io_uring;
9713         unsigned int refs = tctx->cached_refs;
9714
9715         if (refs) {
9716                 tctx->cached_refs = 0;
9717                 percpu_counter_sub(&tctx->inflight, refs);
9718                 put_task_struct_many(task, refs);
9719         }
9720 }
9721
/*
 * Find any io_uring ctx that this task has registered or done IO on, and cancel
 * requests. @sqd should be not-null IIF it's an SQPOLL thread cancellation.
 *
 * Loops until the task's inflight count (tracked requests only when
 * !@cancel_all) reaches zero, then removes the task's ring nodes and, for
 * @cancel_all, frees the task's io_uring context.
 */
static __cold void io_uring_cancel_generic(bool cancel_all,
					   struct io_sq_data *sqd)
{
	struct io_uring_task *tctx = current->io_uring;
	struct io_ring_ctx *ctx;
	s64 inflight;
	DEFINE_WAIT(wait);

	WARN_ON_ONCE(sqd && sqd->thread != current);

	/* task never used io_uring: nothing to cancel (tctx is NULL here) */
	if (!current->io_uring)
		return;
	if (tctx->io_wq)
		io_wq_exit_start(tctx->io_wq);

	/* in_idle is checked by io_tctx_exit_cb() to leave node removal to us */
	atomic_inc(&tctx->in_idle);
	do {
		io_uring_drop_tctx_refs(current);
		/* read completions before cancelations */
		inflight = tctx_inflight(tctx, !cancel_all);
		if (!inflight)
			break;

		if (!sqd) {
			struct io_tctx_node *node;
			unsigned long index;

			xa_for_each(&tctx->xa, index, node) {
				/* sqpoll task will cancel all its requests */
				if (node->ctx->sq_data)
					continue;
				io_uring_try_cancel_requests(node->ctx, current,
							     cancel_all);
			}
		} else {
			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
				io_uring_try_cancel_requests(ctx, current,
							     cancel_all);
		}

		prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
		io_uring_drop_tctx_refs(current);
		/*
		 * If we've seen completions, retry without waiting. This
		 * avoids a race where a completion comes in before we did
		 * prepare_to_wait().
		 */
		if (inflight == tctx_inflight(tctx, !cancel_all))
			schedule();
		finish_wait(&tctx->wait, &wait);
	} while (1);
	atomic_dec(&tctx->in_idle);

	io_uring_clean_tctx(tctx);
	if (cancel_all) {
		/* for exec all current's requests should be gone, kill tctx */
		__io_uring_free(current);
	}
}
9785
/*
 * Non-static entry point for cancelling the current task's io_uring
 * requests; calls io_uring_cancel_generic() with no SQPOLL data.
 */
void __io_uring_cancel(bool cancel_all)
{
	io_uring_cancel_generic(cancel_all, NULL);
}
9790
9791 static void *io_uring_validate_mmap_request(struct file *file,
9792                                             loff_t pgoff, size_t sz)
9793 {
9794         struct io_ring_ctx *ctx = file->private_data;
9795         loff_t offset = pgoff << PAGE_SHIFT;
9796         struct page *page;
9797         void *ptr;
9798
9799         switch (offset) {
9800         case IORING_OFF_SQ_RING:
9801         case IORING_OFF_CQ_RING:
9802                 ptr = ctx->rings;
9803                 break;
9804         case IORING_OFF_SQES:
9805                 ptr = ctx->sq_sqes;
9806                 break;
9807         default:
9808                 return ERR_PTR(-EINVAL);
9809         }
9810
9811         page = virt_to_head_page(ptr);
9812         if (sz > page_size(page))
9813                 return ERR_PTR(-EINVAL);
9814
9815         return ptr;
9816 }
9817
9818 #ifdef CONFIG_MMU
9819
9820 static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
9821 {
9822         size_t sz = vma->vm_end - vma->vm_start;
9823         unsigned long pfn;
9824         void *ptr;
9825
9826         ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
9827         if (IS_ERR(ptr))
9828                 return PTR_ERR(ptr);
9829
9830         pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
9831         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
9832 }
9833
9834 #else /* !CONFIG_MMU */
9835
9836 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
9837 {
9838         return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
9839 }
9840
/*
 * No-MMU mmap capabilities of the ring file: always reports direct,
 * readable and writable mappings.  @file is not consulted.
 */
static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
{
	return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
}
9845
/*
 * No-MMU get_unmapped_area: the "mapping" address is simply the kernel
 * address of the ring region selected by @pgoff.  @addr and @flags are not
 * used.
 *
 * Returns that address, or the error code from
 * io_uring_validate_mmap_request() cast to unsigned long.
 */
static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
	unsigned long addr, unsigned long len,
	unsigned long pgoff, unsigned long flags)
{
	void *region = io_uring_validate_mmap_request(file, pgoff, len);

	if (!IS_ERR(region))
		return (unsigned long) region;

	return PTR_ERR(region);
}
9858
9859 #endif /* !CONFIG_MMU */
9860
/*
 * Sleep interruptibly on ctx->sqo_sq_wait until the SQ ring is no longer
 * full or the current task has a signal pending.
 *
 * Always returns 0, including when the wait ended because of a signal.
 */
static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
{
	DEFINE_WAIT(wait);

	do {
		if (!io_sqring_full(ctx))
			break;
		prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);

		/* re-check after queueing ourselves so a wakeup is not missed */
		if (!io_sqring_full(ctx))
			break;
		schedule();
	} while (!signal_pending(current));

	finish_wait(&ctx->sqo_sq_wait, &wait);
	return 0;
}
9878
9879 static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
9880                           struct __kernel_timespec __user **ts,
9881                           const sigset_t __user **sig)
9882 {
9883         struct io_uring_getevents_arg arg;
9884
9885         /*
9886          * If EXT_ARG isn't set, then we have no timespec and the argp pointer
9887          * is just a pointer to the sigset_t.
9888          */
9889         if (!(flags & IORING_ENTER_EXT_ARG)) {
9890                 *sig = (const sigset_t __user *) argp;
9891                 *ts = NULL;
9892                 return 0;
9893         }
9894
9895         /*
9896          * EXT_ARG is set - ensure we agree on the size of it and copy in our
9897          * timespec and sigset_t pointers if good.
9898          */
9899         if (*argsz != sizeof(arg))
9900                 return -EINVAL;
9901         if (copy_from_user(&arg, argp, sizeof(arg)))
9902                 return -EFAULT;
9903         *sig = u64_to_user_ptr(arg.sigmask);
9904         *argsz = arg.sigmask_sz;
9905         *ts = u64_to_user_ptr(arg.ts);
9906         return 0;
9907 }
9908
/*
 * io_uring_enter() system call: submit up to @to_submit SQEs on the ring
 * referred to by @fd and/or wait for @min_complete completions, as selected
 * by @flags.  @argp / @argsz carry the signal mask (and, with
 * IORING_ENTER_EXT_ARG, the timeout) used while waiting.
 *
 * Returns the number of submitted SQEs when that is non-zero, otherwise the
 * result of the wait or a negative error: -EINVAL for unknown flags, -EBADF
 * for a bad @fd, -EOPNOTSUPP if @fd is not an io_uring file, -ENXIO if the
 * ring is going away, -EBADFD if the ring is still disabled, -EOWNERDEAD if
 * the SQPOLL thread is gone.
 */
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
		u32, min_complete, u32, flags, const void __user *, argp,
		size_t, argsz)
{
	struct io_ring_ctx *ctx;
	int submitted = 0;
	struct fd f;
	long ret;

	io_run_task_work();

	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
		return -EINVAL;

	f = fdget(fd);
	if (unlikely(!f.file))
		return -EBADF;

	ret = -EOPNOTSUPP;
	if (unlikely(f.file->f_op != &io_uring_fops))
		goto out_fput;

	ret = -ENXIO;
	ctx = f.file->private_data;
	/* fails once the ctx ref has been killed */
	if (unlikely(!percpu_ref_tryget(&ctx->refs)))
		goto out_fput;

	ret = -EBADFD;
	if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
		goto out;

	/*
	 * For SQ polling, the thread will do all submissions and completions.
	 * Just return the requested submit count, and wake the thread if
	 * we were asked to.
	 */
	ret = 0;
	if (ctx->flags & IORING_SETUP_SQPOLL) {
		io_cqring_overflow_flush(ctx);

		if (unlikely(ctx->sq_data->thread == NULL)) {
			ret = -EOWNERDEAD;
			goto out;
		}
		if (flags & IORING_ENTER_SQ_WAKEUP)
			wake_up(&ctx->sq_data->wait);
		if (flags & IORING_ENTER_SQ_WAIT) {
			ret = io_sqpoll_wait_sq(ctx);
			if (ret)
				goto out;
		}
		submitted = to_submit;
	} else if (to_submit) {
		ret = io_uring_add_tctx_node(ctx);
		if (unlikely(ret))
			goto out;
		mutex_lock(&ctx->uring_lock);
		submitted = io_submit_sqes(ctx, to_submit);
		mutex_unlock(&ctx->uring_lock);

		/* short submission: skip waiting for events */
		if (submitted != to_submit)
			goto out;
	}
	if (flags & IORING_ENTER_GETEVENTS) {
		const sigset_t __user *sig;
		struct __kernel_timespec __user *ts;

		ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
		if (unlikely(ret))
			goto out;

		/* never wait for more completions than the CQ ring has entries */
		min_complete = min(min_complete, ctx->cq_entries);

		/*
		 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
		 * space applications don't need to do io completion events
		 * polling again, they can rely on io_sq_thread to do polling
		 * work, which can reduce cpu usage and uring_lock contention.
		 */
		if (ctx->flags & IORING_SETUP_IOPOLL &&
		    !(ctx->flags & IORING_SETUP_SQPOLL)) {
			ret = io_iopoll_check(ctx, min_complete);
		} else {
			ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
		}
	}

out:
	percpu_ref_put(&ctx->refs);
out_fput:
	fdput(f);
	/* a non-zero submit count takes precedence over ret */
	return submitted ? submitted : ret;
}
10003
10004 #ifdef CONFIG_PROC_FS
10005 static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
10006                 const struct cred *cred)
10007 {
10008         struct user_namespace *uns = seq_user_ns(m);
10009         struct group_info *gi;
10010         kernel_cap_t cap;
10011         unsigned __capi;
10012         int g;
10013
10014         seq_printf(m, "%5d\n", id);
10015         seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
10016         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
10017         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
10018         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
10019         seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
10020         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
10021         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
10022         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
10023         seq_puts(m, "\n\tGroups:\t");
10024         gi = cred->group_info;
10025         for (g = 0; g < gi->ngroups; g++) {
10026                 seq_put_decimal_ull(m, g ? " " : "",
10027                                         from_kgid_munged(uns, gi->gid[g]));
10028         }
10029         seq_puts(m, "\n\tCapEff:\t");
10030         cap = cred->cap_effective;
10031         CAP_FOR_EACH_U32(__capi)
10032                 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
10033         seq_putc(m, '\n');
10034         return 0;
10035 }
10036
10037 static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
10038                                           struct seq_file *m)
10039 {
10040         struct io_sq_data *sq = NULL;
10041         struct io_overflow_cqe *ocqe;
10042         struct io_rings *r = ctx->rings;
10043         unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
10044         unsigned int cached_sq_head = ctx->cached_sq_head;
10045         unsigned int cached_cq_tail = ctx->cached_cq_tail;
10046         unsigned int sq_head = READ_ONCE(r->sq.head);
10047         unsigned int sq_tail = READ_ONCE(r->sq.tail);
10048         unsigned int cq_head = READ_ONCE(r->cq.head);
10049         unsigned int cq_tail = READ_ONCE(r->cq.tail);
10050         bool has_lock;
10051         unsigned int i;
10052
10053         /*
10054          * we may get imprecise sqe and cqe info if uring is actively running
10055          * since we get cached_sq_head and cached_cq_tail without uring_lock
10056          * and sq_tail and cq_head are changed by userspace. But it's ok since
10057          * we usually use these info when it is stuck.
10058          */
10059         seq_printf(m, "SqHead:\t%u\n", sq_head & sq_mask);
10060         seq_printf(m, "SqTail:\t%u\n", sq_tail & sq_mask);
10061         seq_printf(m, "CachedSqHead:\t%u\n", cached_sq_head & sq_mask);
10062         seq_printf(m, "CqHead:\t%u\n", cq_head & cq_mask);
10063         seq_printf(m, "CqTail:\t%u\n", cq_tail & cq_mask);
10064         seq_printf(m, "CachedCqTail:\t%u\n", cached_cq_tail & cq_mask);
10065         seq_printf(m, "SQEs:\t%u\n", sq_tail - cached_sq_head);
10066         for (i = cached_sq_head; i < sq_tail; i++) {
10067                 unsigned int sq_idx = READ_ONCE(ctx->sq_array[i & sq_mask]);
10068
10069                 if (likely(sq_idx <= sq_mask)) {
10070                         struct io_uring_sqe *sqe = &ctx->sq_sqes[sq_idx];
10071
10072                         seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
10073                                    sq_idx, sqe->opcode, sqe->fd, sqe->flags, sqe->user_data);
10074                 }
10075         }
10076         seq_printf(m, "CQEs:\t%u\n", cached_cq_tail - cq_head);
10077         for (i = cq_head; i < cached_cq_tail; i++) {
10078                 struct io_uring_cqe *cqe = &r->cqes[i & cq_mask];
10079
10080                 seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
10081                            i & cq_mask, cqe->user_data, cqe->res, cqe->flags);
10082         }
10083
10084         /*
10085          * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
10086          * since fdinfo case grabs it in the opposite direction of normal use
10087          * cases. If we fail to get the lock, we just don't iterate any
10088          * structures that could be going away outside the io_uring mutex.
10089          */
10090         has_lock = mutex_trylock(&ctx->uring_lock);
10091
10092         if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
10093                 sq = ctx->sq_data;
10094                 if (!sq->thread)
10095                         sq = NULL;
10096         }
10097
10098         seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
10099         seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
10100         seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
10101         for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
10102                 struct file *f = io_file_from_index(ctx, i);
10103
10104                 if (f)
10105                         seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
10106                 else
10107                         seq_printf(m, "%5u: <none>\n", i);
10108         }
10109         seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
10110         for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
10111                 struct io_mapped_ubuf *buf = ctx->user_bufs[i];
10112                 unsigned int len = buf->ubuf_end - buf->ubuf;
10113
10114                 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
10115         }
10116         if (has_lock && !xa_empty(&ctx->personalities)) {
10117                 unsigned long index;
10118                 const struct cred *cred;
10119
10120                 seq_printf(m, "Personalities:\n");
10121                 xa_for_each(&ctx->personalities, index, cred)
10122                         io_uring_show_cred(m, index, cred);
10123         }
10124         if (has_lock)
10125                 mutex_unlock(&ctx->uring_lock);
10126
10127         seq_puts(m, "PollList:\n");
10128         spin_lock(&ctx->completion_lock);
10129         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
10130                 struct hlist_head *list = &ctx->cancel_hash[i];
10131                 struct io_kiocb *req;
10132
10133                 hlist_for_each_entry(req, list, hash_node)
10134                         seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
10135                                         req->task->task_works != NULL);
10136         }
10137
10138         seq_puts(m, "CqOverflowList:\n");
10139         list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
10140                 struct io_uring_cqe *cqe = &ocqe->cqe;
10141
10142                 seq_printf(m, "  user_data=%llu, res=%d, flags=%x\n",
10143                            cqe->user_data, cqe->res, cqe->flags);
10144
10145         }
10146
10147         spin_unlock(&ctx->completion_lock);
10148 }
10149
10150 static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
10151 {
10152         struct io_ring_ctx *ctx = f->private_data;
10153
10154         if (percpu_ref_tryget(&ctx->refs)) {
10155                 __io_uring_show_fdinfo(ctx, m);
10156                 percpu_ref_put(&ctx->refs);
10157         }
10158 }
10159 #endif
10160
/*
 * File operations of the io_uring file; this table is what
 * io_uring_get_file() hands to anon_inode_getfile().
 */
static const struct file_operations io_uring_fops = {
	.release	= io_uring_release,
	.mmap		= io_uring_mmap,
#ifndef CONFIG_MMU
	/* extra hooks that are only built for !CONFIG_MMU kernels */
	.get_unmapped_area = io_uring_nommu_get_unmapped_area,
	.mmap_capabilities = io_uring_nommu_mmap_capabilities,
#endif
	.poll		= io_uring_poll,
#ifdef CONFIG_PROC_FS
	/* fdinfo dump, only available when procfs is built in */
	.show_fdinfo	= io_uring_show_fdinfo,
#endif
};
10173
10174 static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
10175                                          struct io_uring_params *p)
10176 {
10177         struct io_rings *rings;
10178         size_t size, sq_array_offset;
10179
10180         /* make sure these are sane, as we already accounted them */
10181         ctx->sq_entries = p->sq_entries;
10182         ctx->cq_entries = p->cq_entries;
10183
10184         size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
10185         if (size == SIZE_MAX)
10186                 return -EOVERFLOW;
10187
10188         rings = io_mem_alloc(size);
10189         if (!rings)
10190                 return -ENOMEM;
10191
10192         ctx->rings = rings;
10193         ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
10194         rings->sq_ring_mask = p->sq_entries - 1;
10195         rings->cq_ring_mask = p->cq_entries - 1;
10196         rings->sq_ring_entries = p->sq_entries;
10197         rings->cq_ring_entries = p->cq_entries;
10198
10199         size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
10200         if (size == SIZE_MAX) {
10201                 io_mem_free(ctx->rings);
10202                 ctx->rings = NULL;
10203                 return -EOVERFLOW;
10204         }
10205
10206         ctx->sq_sqes = io_mem_alloc(size);
10207         if (!ctx->sq_sqes) {
10208                 io_mem_free(ctx->rings);
10209                 ctx->rings = NULL;
10210                 return -ENOMEM;
10211         }
10212
10213         return 0;
10214 }
10215
10216 static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
10217 {
10218         int ret, fd;
10219
10220         fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
10221         if (fd < 0)
10222                 return fd;
10223
10224         ret = io_uring_add_tctx_node(ctx);
10225         if (ret) {
10226                 put_unused_fd(fd);
10227                 return ret;
10228         }
10229         fd_install(fd, file);
10230         return fd;
10231 }
10232
10233 /*
10234  * Allocate an anonymous fd, this is what constitutes the application
10235  * visible backing of an io_uring instance. The application mmaps this
10236  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
10237  * we have to tie this fd to a socket for file garbage collection purposes.
10238  */
10239 static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
10240 {
10241         struct file *file;
10242 #if defined(CONFIG_UNIX)
10243         int ret;
10244
10245         ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
10246                                 &ctx->ring_sock);
10247         if (ret)
10248                 return ERR_PTR(ret);
10249 #endif
10250
10251         file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
10252                                         O_RDWR | O_CLOEXEC);
10253 #if defined(CONFIG_UNIX)
10254         if (IS_ERR(file)) {
10255                 sock_release(ctx->ring_sock);
10256                 ctx->ring_sock = NULL;
10257         } else {
10258                 ctx->ring_sock->file = file;
10259         }
10260 #endif
10261         return file;
10262 }
10263
/*
 * Create a new io_uring instance.
 *
 * @entries: requested number of SQ entries (rounded up to a power of two)
 * @p:       kernel copy of the setup parameters; the resulting ring sizes,
 *           ring offsets and feature flags are written into it
 * @params:  user pointer the updated @p is copied back to
 *
 * Returns the installed ring fd on success, or a negative error code:
 * -EINVAL for unacceptable sizes, -ENOMEM if the context cannot be
 * allocated, -EFAULT if @params cannot be written, or whatever one of the
 * setup helpers returned.
 */
static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
				  struct io_uring_params __user *params)
{
	struct io_ring_ctx *ctx;
	struct file *file;
	int ret;

	/* oversized requests are only accepted (and clamped) with SETUP_CLAMP */
	if (!entries)
		return -EINVAL;
	if (entries > IORING_MAX_ENTRIES) {
		if (!(p->flags & IORING_SETUP_CLAMP))
			return -EINVAL;
		entries = IORING_MAX_ENTRIES;
	}

	/*
	 * Use twice as many entries for the CQ ring. It's possible for the
	 * application to drive a higher depth than the size of the SQ ring,
	 * since the sqes are only used at submission time. This allows for
	 * some flexibility in overcommitting a bit. If the application has
	 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
	 * of CQ ring entries manually.
	 */
	p->sq_entries = roundup_pow_of_two(entries);
	if (p->flags & IORING_SETUP_CQSIZE) {
		/*
		 * If IORING_SETUP_CQSIZE is set, we do the same roundup
		 * to a power-of-two, if it isn't already. We do NOT impose
		 * any cq vs sq ring sizing.
		 */
		if (!p->cq_entries)
			return -EINVAL;
		if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
			if (!(p->flags & IORING_SETUP_CLAMP))
				return -EINVAL;
			p->cq_entries = IORING_MAX_CQ_ENTRIES;
		}
		p->cq_entries = roundup_pow_of_two(p->cq_entries);
		/* the CQ ring may not be smaller than the SQ ring */
		if (p->cq_entries < p->sq_entries)
			return -EINVAL;
	} else {
		p->cq_entries = 2 * p->sq_entries;
	}

	ctx = io_ring_ctx_alloc(p);
	if (!ctx)
		return -ENOMEM;
	ctx->compat = in_compat_syscall();
	/* ctx->user is only set for tasks lacking CAP_IPC_LOCK */
	if (!capable(CAP_IPC_LOCK))
		ctx->user = get_uid(current_user());

	/*
	 * This is just grabbed for accounting purposes. When a process exits,
	 * the mm is exited and dropped before the files, hence we need to hang
	 * on to this mm purely for the purposes of being able to unaccount
	 * memory (locked/pinned vm). It's not used for anything else.
	 */
	mmgrab(current->mm);
	ctx->mm_account = current->mm;

	/* from here on every failure goes through the err label */
	ret = io_allocate_scq_urings(ctx, p);
	if (ret)
		goto err;

	ret = io_sq_offload_create(ctx, p);
	if (ret)
		goto err;
	/* always set a rsrc node */
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		goto err;
	io_rsrc_node_switch(ctx, NULL);

	/* tell userspace where each SQ ring field lives inside struct io_rings */
	memset(&p->sq_off, 0, sizeof(p->sq_off));
	p->sq_off.head = offsetof(struct io_rings, sq.head);
	p->sq_off.tail = offsetof(struct io_rings, sq.tail);
	p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
	p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
	p->sq_off.flags = offsetof(struct io_rings, sq_flags);
	p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
	p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;

	/* and the same for the CQ ring fields */
	memset(&p->cq_off, 0, sizeof(p->cq_off));
	p->cq_off.head = offsetof(struct io_rings, cq.head);
	p->cq_off.tail = offsetof(struct io_rings, cq.tail);
	p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
	p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
	p->cq_off.cqes = offsetof(struct io_rings, cqes);
	p->cq_off.flags = offsetof(struct io_rings, cq_flags);

	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
			IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
			IORING_FEAT_RSRC_TAGS;

	/* report the result before any file exists, so failure is cheap to undo */
	if (copy_to_user(params, p, sizeof(*p))) {
		ret = -EFAULT;
		goto err;
	}

	file = io_uring_get_file(ctx);
	if (IS_ERR(file)) {
		ret = PTR_ERR(file);
		goto err;
	}

	/*
	 * Install ring fd as the very last thing, so we don't risk someone
	 * having closed it before we finish setup
	 */
	ret = io_uring_install_fd(ctx, file);
	if (ret < 0) {
		/* fput will clean it up */
		fput(file);
		return ret;
	}

	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
	return ret;
err:
	/* no file was created yet, so tear the context down directly */
	io_ring_ctx_wait_and_kill(ctx);
	return ret;
}
10390
10391 /*
10392  * Sets up an aio uring context, and returns the fd. Applications asks for a
10393  * ring size, we return the actual sq/cq ring sizes (among other things) in the
10394  * params structure passed in.
10395  */
10396 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
10397 {
10398         struct io_uring_params p;
10399         int i;
10400
10401         if (copy_from_user(&p, params, sizeof(p)))
10402                 return -EFAULT;
10403         for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
10404                 if (p.resv[i])
10405                         return -EINVAL;
10406         }
10407
10408         if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
10409                         IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
10410                         IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
10411                         IORING_SETUP_R_DISABLED))
10412                 return -EINVAL;
10413
10414         return  io_uring_create(entries, &p, params);
10415 }
10416
/*
 * io_uring_setup() system call entry point: forwards both arguments
 * unchanged to the io_uring_setup() helper and returns its result.
 */
SYSCALL_DEFINE2(io_uring_setup, u32, entries,
		struct io_uring_params __user *, params)
{
	return io_uring_setup(entries, params);
}
10422
10423 static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
10424                            unsigned nr_args)
10425 {
10426         struct io_uring_probe *p;
10427         size_t size;
10428         int i, ret;
10429
10430         size = struct_size(p, ops, nr_args);
10431         if (size == SIZE_MAX)
10432                 return -EOVERFLOW;
10433         p = kzalloc(size, GFP_KERNEL);
10434         if (!p)
10435                 return -ENOMEM;
10436
10437         ret = -EFAULT;
10438         if (copy_from_user(p, arg, size))
10439                 goto out;
10440         ret = -EINVAL;
10441         if (memchr_inv(p, 0, size))
10442                 goto out;
10443
10444         p->last_op = IORING_OP_LAST - 1;
10445         if (nr_args > IORING_OP_LAST)
10446                 nr_args = IORING_OP_LAST;
10447
10448         for (i = 0; i < nr_args; i++) {
10449                 p->ops[i].op = i;
10450                 if (!io_op_defs[i].not_supported)
10451                         p->ops[i].flags = IO_URING_OP_SUPPORTED;
10452         }
10453         p->ops_len = i;
10454
10455         ret = 0;
10456         if (copy_to_user(arg, p, size))
10457                 ret = -EFAULT;
10458 out:
10459         kfree(p);
10460         return ret;
10461 }
10462
10463 static int io_register_personality(struct io_ring_ctx *ctx)
10464 {
10465         const struct cred *creds;
10466         u32 id;
10467         int ret;
10468
10469         creds = get_current_cred();
10470
10471         ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
10472                         XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
10473         if (ret < 0) {
10474                 put_cred(creds);
10475                 return ret;
10476         }
10477         return id;
10478 }
10479
10480 static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
10481                                            void __user *arg, unsigned int nr_args)
10482 {
10483         struct io_uring_restriction *res;
10484         size_t size;
10485         int i, ret;
10486
10487         /* Restrictions allowed only if rings started disabled */
10488         if (!(ctx->flags & IORING_SETUP_R_DISABLED))
10489                 return -EBADFD;
10490
10491         /* We allow only a single restrictions registration */
10492         if (ctx->restrictions.registered)
10493                 return -EBUSY;
10494
10495         if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
10496                 return -EINVAL;
10497
10498         size = array_size(nr_args, sizeof(*res));
10499         if (size == SIZE_MAX)
10500                 return -EOVERFLOW;
10501
10502         res = memdup_user(arg, size);
10503         if (IS_ERR(res))
10504                 return PTR_ERR(res);
10505
10506         ret = 0;
10507
10508         for (i = 0; i < nr_args; i++) {
10509                 switch (res[i].opcode) {
10510                 case IORING_RESTRICTION_REGISTER_OP:
10511                         if (res[i].register_op >= IORING_REGISTER_LAST) {
10512                                 ret = -EINVAL;
10513                                 goto out;
10514                         }
10515
10516                         __set_bit(res[i].register_op,
10517                                   ctx->restrictions.register_op);
10518                         break;
10519                 case IORING_RESTRICTION_SQE_OP:
10520                         if (res[i].sqe_op >= IORING_OP_LAST) {
10521                                 ret = -EINVAL;
10522                                 goto out;
10523                         }
10524
10525                         __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
10526                         break;
10527                 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
10528                         ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
10529                         break;
10530                 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
10531                         ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
10532                         break;
10533                 default:
10534                         ret = -EINVAL;
10535                         goto out;
10536                 }
10537         }
10538
10539 out:
10540         /* Reset all restrictions if an error happened */
10541         if (ret != 0)
10542                 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
10543         else
10544                 ctx->restrictions.registered = true;
10545
10546         kfree(res);
10547         return ret;
10548 }
10549
10550 static int io_register_enable_rings(struct io_ring_ctx *ctx)
10551 {
10552         if (!(ctx->flags & IORING_SETUP_R_DISABLED))
10553                 return -EBADFD;
10554
10555         if (ctx->restrictions.registered)
10556                 ctx->restricted = 1;
10557
10558         ctx->flags &= ~IORING_SETUP_R_DISABLED;
10559         if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
10560                 wake_up(&ctx->sq_data->wait);
10561         return 0;
10562 }
10563
10564 static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
10565                                      struct io_uring_rsrc_update2 *up,
10566                                      unsigned nr_args)
10567 {
10568         __u32 tmp;
10569         int err;
10570
10571         if (up->resv)
10572                 return -EINVAL;
10573         if (check_add_overflow(up->offset, nr_args, &tmp))
10574                 return -EOVERFLOW;
10575         err = io_rsrc_node_switch_start(ctx);
10576         if (err)
10577                 return err;
10578
10579         switch (type) {
10580         case IORING_RSRC_FILE:
10581                 return __io_sqe_files_update(ctx, up, nr_args);
10582         case IORING_RSRC_BUFFER:
10583                 return __io_sqe_buffers_update(ctx, up, nr_args);
10584         }
10585         return -EINVAL;
10586 }
10587
10588 static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
10589                                     unsigned nr_args)
10590 {
10591         struct io_uring_rsrc_update2 up;
10592
10593         if (!nr_args)
10594                 return -EINVAL;
10595         memset(&up, 0, sizeof(up));
10596         if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
10597                 return -EFAULT;
10598         return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
10599 }
10600
10601 static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
10602                                    unsigned size, unsigned type)
10603 {
10604         struct io_uring_rsrc_update2 up;
10605
10606         if (size != sizeof(up))
10607                 return -EINVAL;
10608         if (copy_from_user(&up, arg, sizeof(up)))
10609                 return -EFAULT;
10610         if (!up.nr || up.resv)
10611                 return -EINVAL;
10612         return __io_register_rsrc_update(ctx, type, &up, up.nr);
10613 }
10614
10615 static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
10616                             unsigned int size, unsigned int type)
10617 {
10618         struct io_uring_rsrc_register rr;
10619
10620         /* keep it extendible */
10621         if (size != sizeof(rr))
10622                 return -EINVAL;
10623
10624         memset(&rr, 0, sizeof(rr));
10625         if (copy_from_user(&rr, arg, size))
10626                 return -EFAULT;
10627         if (!rr.nr || rr.resv || rr.resv2)
10628                 return -EINVAL;
10629
10630         switch (type) {
10631         case IORING_RSRC_FILE:
10632                 return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
10633                                              rr.nr, u64_to_user_ptr(rr.tags));
10634         case IORING_RSRC_BUFFER:
10635                 return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
10636                                                rr.nr, u64_to_user_ptr(rr.tags));
10637         }
10638         return -EINVAL;
10639 }
10640
10641 static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
10642                                        void __user *arg, unsigned len)
10643 {
10644         struct io_uring_task *tctx = current->io_uring;
10645         cpumask_var_t new_mask;
10646         int ret;
10647
10648         if (!tctx || !tctx->io_wq)
10649                 return -EINVAL;
10650
10651         if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
10652                 return -ENOMEM;
10653
10654         cpumask_clear(new_mask);
10655         if (len > cpumask_size())
10656                 len = cpumask_size();
10657
10658         if (copy_from_user(new_mask, arg, len)) {
10659                 free_cpumask_var(new_mask);
10660                 return -EFAULT;
10661         }
10662
10663         ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
10664         free_cpumask_var(new_mask);
10665         return ret;
10666 }
10667
10668 static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
10669 {
10670         struct io_uring_task *tctx = current->io_uring;
10671
10672         if (!tctx || !tctx->io_wq)
10673                 return -EINVAL;
10674
10675         return io_wq_cpu_affinity(tctx->io_wq, NULL);
10676 }
10677
/*
 * Set the io-wq worker limits from the two __u32 values at @arg and copy
 * the array, as left by io_wq_max_workers(), back to @arg.
 *
 * For SQPOLL rings the io-wq of the SQ poll thread is used, otherwise
 * that of the calling task. Entered with ctx->uring_lock held; for SQPOLL
 * rings the lock is temporarily dropped to take sqd->lock first.
 *
 * Returns 0, -EFAULT, -EINVAL (value above INT_MAX, or no io-wq to
 * apply the limits to), or the error from io_wq_max_workers().
 */
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
{
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			/* tctx stays NULL if the poll thread is gone */
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	ret = -EINVAL;
	if (!tctx || !tctx->io_wq)
		goto err;

	ret = io_wq_max_workers(tctx->io_wq, new_count);
	if (ret)
		goto err;

	/* release sqd->lock and the sqd reference before touching user memory */
	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	return 0;
err:
	/* same sqd release as on the success path */
	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}
	return ret;
}
10735
10736 static bool io_register_op_must_quiesce(int op)
10737 {
10738         switch (op) {
10739         case IORING_REGISTER_BUFFERS:
10740         case IORING_UNREGISTER_BUFFERS:
10741         case IORING_REGISTER_FILES:
10742         case IORING_UNREGISTER_FILES:
10743         case IORING_REGISTER_FILES_UPDATE:
10744         case IORING_REGISTER_PROBE:
10745         case IORING_REGISTER_PERSONALITY:
10746         case IORING_UNREGISTER_PERSONALITY:
10747         case IORING_REGISTER_FILES2:
10748         case IORING_REGISTER_FILES_UPDATE2:
10749         case IORING_REGISTER_BUFFERS2:
10750         case IORING_REGISTER_BUFFERS_UPDATE:
10751         case IORING_REGISTER_IOWQ_AFF:
10752         case IORING_UNREGISTER_IOWQ_AFF:
10753         case IORING_REGISTER_IOWQ_MAX_WORKERS:
10754                 return false;
10755         default:
10756                 return true;
10757         }
10758 }
10759
/*
 * Quiesce @ctx: kill ctx->refs and wait on ctx->ref_comp.
 *
 * Entered and left with ctx->uring_lock held; the lock is dropped while
 * waiting. Returns 0 on success. On a non-zero (error) return the refs
 * are resurrected via io_refs_resurrect(), so the ring remains usable.
 */
static __cold int io_ctx_quiesce(struct io_ring_ctx *ctx)
{
	long ret;

	percpu_ref_kill(&ctx->refs);

	/*
	 * Drop uring mutex before waiting for references to exit. If another
	 * thread is currently inside io_uring_enter() it might need to grab the
	 * uring_lock to make progress. If we hold it here across the drain
	 * wait, then we can deadlock. It's safe to drop the mutex here, since
	 * no new references will come in after we've killed the percpu ref.
	 */
	mutex_unlock(&ctx->uring_lock);
	do {
		/* wait in slices of HZ jiffies */
		ret = wait_for_completion_interruptible_timeout(&ctx->ref_comp, HZ);
		if (ret) {
			/* non-zero ends the wait: positive becomes 0, negative is kept */
			ret = min(0L, ret);
			break;
		}

		/* zero return: run task work, free request caches, then retry */
		ret = io_run_task_work_sig();
		io_req_caches_free(ctx);
	} while (ret >= 0);
	mutex_lock(&ctx->uring_lock);

	if (ret)
		io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
	return ret;
}
10790
10791 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
10792                                void __user *arg, unsigned nr_args)
10793         __releases(ctx->uring_lock)
10794         __acquires(ctx->uring_lock)
10795 {
10796         int ret;
10797
10798         /*
10799          * We're inside the ring mutex, if the ref is already dying, then
10800          * someone else killed the ctx or is already going through
10801          * io_uring_register().
10802          */
10803         if (percpu_ref_is_dying(&ctx->refs))
10804                 return -ENXIO;
10805
10806         if (ctx->restricted) {
10807                 if (opcode >= IORING_REGISTER_LAST)
10808                         return -EINVAL;
10809                 opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
10810                 if (!test_bit(opcode, ctx->restrictions.register_op))
10811                         return -EACCES;
10812         }
10813
10814         if (io_register_op_must_quiesce(opcode)) {
10815                 ret = io_ctx_quiesce(ctx);
10816                 if (ret)
10817                         return ret;
10818         }
10819
10820         switch (opcode) {
10821         case IORING_REGISTER_BUFFERS:
10822                 ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
10823                 break;
10824         case IORING_UNREGISTER_BUFFERS:
10825                 ret = -EINVAL;
10826                 if (arg || nr_args)
10827                         break;
10828                 ret = io_sqe_buffers_unregister(ctx);
10829                 break;
10830         case IORING_REGISTER_FILES:
10831                 ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
10832                 break;
10833         case IORING_UNREGISTER_FILES:
10834                 ret = -EINVAL;
10835                 if (arg || nr_args)
10836                         break;
10837                 ret = io_sqe_files_unregister(ctx);
10838                 break;
10839         case IORING_REGISTER_FILES_UPDATE:
10840                 ret = io_register_files_update(ctx, arg, nr_args);
10841                 break;
10842         case IORING_REGISTER_EVENTFD:
10843         case IORING_REGISTER_EVENTFD_ASYNC:
10844                 ret = -EINVAL;
10845                 if (nr_args != 1)
10846                         break;
10847                 ret = io_eventfd_register(ctx, arg);
10848                 if (ret)
10849                         break;
10850                 if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
10851                         ctx->eventfd_async = 1;
10852                 else
10853                         ctx->eventfd_async = 0;
10854                 break;
10855         case IORING_UNREGISTER_EVENTFD:
10856                 ret = -EINVAL;
10857                 if (arg || nr_args)
10858                         break;
10859                 ret = io_eventfd_unregister(ctx);
10860                 break;
10861         case IORING_REGISTER_PROBE:
10862                 ret = -EINVAL;
10863                 if (!arg || nr_args > 256)
10864                         break;
10865                 ret = io_probe(ctx, arg, nr_args);
10866                 break;
10867         case IORING_REGISTER_PERSONALITY:
10868                 ret = -EINVAL;
10869                 if (arg || nr_args)
10870                         break;
10871                 ret = io_register_personality(ctx);
10872                 break;
10873         case IORING_UNREGISTER_PERSONALITY:
10874                 ret = -EINVAL;
10875                 if (arg)
10876                         break;
10877                 ret = io_unregister_personality(ctx, nr_args);
10878                 break;
10879         case IORING_REGISTER_ENABLE_RINGS:
10880                 ret = -EINVAL;
10881                 if (arg || nr_args)
10882                         break;
10883                 ret = io_register_enable_rings(ctx);
10884                 break;
10885         case IORING_REGISTER_RESTRICTIONS:
10886                 ret = io_register_restrictions(ctx, arg, nr_args);
10887                 break;
10888         case IORING_REGISTER_FILES2:
10889                 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
10890                 break;
10891         case IORING_REGISTER_FILES_UPDATE2:
10892                 ret = io_register_rsrc_update(ctx, arg, nr_args,
10893                                               IORING_RSRC_FILE);
10894                 break;
10895         case IORING_REGISTER_BUFFERS2:
10896                 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
10897                 break;
10898         case IORING_REGISTER_BUFFERS_UPDATE:
10899                 ret = io_register_rsrc_update(ctx, arg, nr_args,
10900                                               IORING_RSRC_BUFFER);
10901                 break;
10902         case IORING_REGISTER_IOWQ_AFF:
10903                 ret = -EINVAL;
10904                 if (!arg || !nr_args)
10905                         break;
10906                 ret = io_register_iowq_aff(ctx, arg, nr_args);
10907                 break;
10908         case IORING_UNREGISTER_IOWQ_AFF:
10909                 ret = -EINVAL;
10910                 if (arg || nr_args)
10911                         break;
10912                 ret = io_unregister_iowq_aff(ctx);
10913                 break;
10914         case IORING_REGISTER_IOWQ_MAX_WORKERS:
10915                 ret = -EINVAL;
10916                 if (!arg || nr_args != 2)
10917                         break;
10918                 ret = io_register_iowq_max_workers(ctx, arg);
10919                 break;
10920         default:
10921                 ret = -EINVAL;
10922                 break;
10923         }
10924
10925         if (io_register_op_must_quiesce(opcode)) {
10926                 /* bring the ctx back to life */
10927                 percpu_ref_reinit(&ctx->refs);
10928                 reinit_completion(&ctx->ref_comp);
10929         }
10930         return ret;
10931 }
10932
10933 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
10934                 void __user *, arg, unsigned int, nr_args)
10935 {
10936         struct io_ring_ctx *ctx;
10937         long ret = -EBADF;
10938         struct fd f;
10939
10940         f = fdget(fd);
10941         if (!f.file)
10942                 return -EBADF;
10943
10944         ret = -EOPNOTSUPP;
10945         if (f.file->f_op != &io_uring_fops)
10946                 goto out_fput;
10947
10948         ctx = f.file->private_data;
10949
10950         io_run_task_work();
10951
10952         mutex_lock(&ctx->uring_lock);
10953         ret = __io_uring_register(ctx, opcode, arg, nr_args);
10954         mutex_unlock(&ctx->uring_lock);
10955         trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
10956                                                         ctx->cq_ev_fd != NULL, ret);
10957 out_fput:
10958         fdput(f);
10959         return ret;
10960 }
10961
10962 static int __init io_uring_init(void)
10963 {
10964 #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
10965         BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
10966         BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
10967 } while (0)
10968
10969 #define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
10970         __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
10971         BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
10972         BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
10973         BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
10974         BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
10975         BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
10976         BUILD_BUG_SQE_ELEM(8,  __u64,  off);
10977         BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
10978         BUILD_BUG_SQE_ELEM(16, __u64,  addr);
10979         BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
10980         BUILD_BUG_SQE_ELEM(24, __u32,  len);
10981         BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
10982         BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
10983         BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
10984         BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
10985         BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
10986         BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
10987         BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
10988         BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
10989         BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
10990         BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
10991         BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
10992         BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
10993         BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
10994         BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
10995         BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
10996         BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
10997         BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
10998         BUILD_BUG_SQE_ELEM(40, __u16,  buf_group);
10999         BUILD_BUG_SQE_ELEM(42, __u16,  personality);
11000         BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
11001         BUILD_BUG_SQE_ELEM(44, __u32,  file_index);
11002
11003         BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
11004                      sizeof(struct io_uring_rsrc_update));
11005         BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
11006                      sizeof(struct io_uring_rsrc_update2));
11007
11008         /* ->buf_index is u16 */
11009         BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
11010
11011         /* should fit into one byte */
11012         BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
11013         BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
11014         BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
11015
11016         BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
11017         BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
11018
11019         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
11020                                 SLAB_ACCOUNT);
11021         return 0;
11022 };
11023 __initcall(io_uring_init);