Merge tag 'io_uring-worker.v3-2021-02-25' of git://git.kernel.dk/linux-block
author     Linus Torvalds <torvalds@linux-foundation.org>
           Sat, 27 Feb 2021 16:29:02 +0000 (08:29 -0800)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Sat, 27 Feb 2021 16:29:02 +0000 (08:29 -0800)
Pull io_uring thread rewrite from Jens Axboe:
 "This converts the io-wq workers to be forked off the tasks in question
  instead of being kernel threads that assume various bits of the
  original task identity.

   This kills > 400 lines of code from io_uring/io-wq, and it was the
   worst part of the code. We've had several bugs in this area, and the
   worry is always that we could be missing some pieces for file types
   doing unusual things (the recent /dev/tty example comes to mind, and
   userfaultfd reads installing file descriptors are another fun one... -
   both of which need special handling, and I bet it's not the last weird
   oddity we'll find).

  With these identical workers, we can have full confidence that we're
  never missing anything. That, in itself, is a huge win. Outside of
  that, it's also more efficient since we're not wasting space and code
  on tracking state, or switching between different states.

  I'm sure we're going to find little things to patch up after this
  series, but testing has been pretty thorough, from the usual
  regression suite to production. Any issue that may crop up should be
  manageable.

  There's also a nice series of further reductions we can do on top of
  this, but I wanted to get the meat of it out sooner rather than later.
  The general worry here isn't that it's fundamentally broken. Most of
  the little issues we've found over the last week have been related to
  just changes in how thread startup/exit is done, since that's the main
  difference between using kthreads and these kinds of threads. In fact,
  if all goes according to plan, I want to get this into the 5.10 and
  5.11 stable branches as well.

  That said, the changes outside of io_uring/io-wq are:

   - arch setup, simple one-liner to each arch copy_thread()
     implementation.

   - Removal of net and proc restrictions for io_uring; they are no
     longer needed or useful"
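
The per-arch piece mentioned above is confined to copy_thread(): io_uring
workers (PF_IO_WORKER) never return to user mode, so each architecture just
extends its existing PF_KTHREAD test to give them the same kernel-thread
register setup. A minimal sketch of that pattern, as a generic illustration
rather than any one architecture's exact code (the real hunks follow in the
combined diff below):

    /* Illustration of the per-arch copy_thread() change, not verbatim code */
    if (unlikely(p->flags & PF_KTHREAD)) {                    /* before */
            /* kernel-thread register setup */
    }

    if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {   /* after */
            /* same setup now also covers io_uring workers */
    }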

* tag 'io_uring-worker.v3-2021-02-25' of git://git.kernel.dk/linux-block: (30 commits)
  io-wq: remove now unused IO_WQ_BIT_ERROR
  io_uring: fix SQPOLL thread handling over exec
  io-wq: improve manager/worker handling over exec
  io_uring: ensure SQPOLL startup is triggered before error shutdown
  io-wq: make buffered file write hashed work map per-ctx
  io-wq: fix race around io_worker grabbing
  io-wq: fix races around manager/worker creation and task exit
  io_uring: ensure io-wq context is always destroyed for tasks
  arch: ensure parisc/powerpc handle PF_IO_WORKER in copy_thread()
  io_uring: cleanup ->user usage
  io-wq: remove nr_process accounting
  io_uring: flag new native workers with IORING_FEAT_NATIVE_WORKERS
  net: remove cmsg restriction from io_uring based send/recvmsg calls
  Revert "proc: don't allow async path resolution of /proc/self components"
  Revert "proc: don't allow async path resolution of /proc/thread-self components"
  io_uring: move SQPOLL thread io-wq forked worker
  io-wq: make io_wq_fork_thread() available to other users
  io-wq: only remove worker from free_list, if it was there
  io_uring: remove io_identity
  io_uring: remove any grabbing of context
  ...

arch/openrisc/kernel/process.c
arch/powerpc/kernel/process.c
arch/riscv/kernel/process.c
arch/sparc/kernel/process_32.c
fs/io_uring.c
net/socket.c

diff --combined arch/openrisc/kernel/process.c
index 181448f743162488a84d3ecd1fca7e4a247997eb,83fba4ee44535db18c39c528f82b51cddc14f7dc..eb62429681fc806e5b48658d05488b3aa1e826c4
@@@ -34,7 -34,6 +34,7 @@@
  #include <linux/init_task.h>
  #include <linux/mqueue.h>
  #include <linux/fs.h>
 +#include <linux/reboot.h>
  
  #include <linux/uaccess.h>
  #include <asm/io.h>
   */
  struct thread_info *current_thread_info_set[NR_CPUS] = { &init_thread_info, };
  
 -void machine_restart(void)
 +void machine_restart(char *cmd)
  {
 -      printk(KERN_INFO "*** MACHINE RESTART ***\n");
 -      __asm__("l.nop 1");
 +      do_kernel_restart(cmd);
 +
 +      /* Give a grace period for failure to restart of 1s */
 +      mdelay(1000);
 +
 +      /* Whoops - the platform was unable to reboot. Tell the user! */
 +      pr_emerg("Reboot failed -- System halted\n");
 +      while (1);
  }
  
  /*
@@@ -174,7 -167,7 +174,7 @@@ copy_thread(unsigned long clone_flags, 
        sp -= sizeof(struct pt_regs);
        kregs = (struct pt_regs *)sp;
  
-       if (unlikely(p->flags & PF_KTHREAD)) {
+       if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
                memset(kregs, 0, sizeof(struct pt_regs));
                kregs->gpr[20] = usp; /* fn, kernel thread */
                kregs->gpr[22] = arg;
diff --combined arch/powerpc/kernel/process.c
index 924d023dad0a808cef9b052df702822604c23a0a,8c20a49ea630d5943f5a6e0c38c437bee42ad284..3231c2df9e261fba57c74792c177256e0dafd678
@@@ -41,7 -41,6 +41,7 @@@
  #include <linux/pkeys.h>
  #include <linux/seq_buf.h>
  
 +#include <asm/interrupt.h>
  #include <asm/io.h>
  #include <asm/processor.h>
  #include <asm/mmu.h>
@@@ -660,10 -659,11 +660,10 @@@ static void do_break_handler(struct pt_
        }
  }
  
 -void do_break (struct pt_regs *regs, unsigned long address,
 -                  unsigned long error_code)
 +DEFINE_INTERRUPT_HANDLER(do_break)
  {
        current->thread.trap_nr = TRAP_HWBKPT;
 -      if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
 +      if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, regs->dsisr,
                        11, SIGSEGV) == NOTIFY_STOP)
                return;
  
                do_break_handler(regs);
  
        /* Deliver the signal to userspace */
 -      force_sig_fault(SIGTRAP, TRAP_HWBKPT, (void __user *)address);
 +      force_sig_fault(SIGTRAP, TRAP_HWBKPT, (void __user *)regs->dar);
  }
  #endif        /* CONFIG_PPC_ADV_DEBUG_REGS */
  
@@@ -1670,7 -1670,7 +1670,7 @@@ int copy_thread(unsigned long clone_fla
        /* Copy registers */
        sp -= sizeof(struct pt_regs);
        childregs = (struct pt_regs *) sp;
-       if (unlikely(p->flags & PF_KTHREAD)) {
+       if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
                /* kernel thread */
                memset(childregs, 0, sizeof(struct pt_regs));
                childregs->gpr[1] = sp + sizeof(struct pt_regs);
@@@ -2047,9 -2047,6 +2047,9 @@@ static inline int valid_emergency_stack
        unsigned long stack_page;
        unsigned long cpu = task_cpu(p);
  
 +      if (!paca_ptrs)
 +              return 0;
 +
        stack_page = (unsigned long)paca_ptrs[cpu]->emergency_sp - THREAD_SIZE;
        if (sp >= stack_page && sp <= stack_page + THREAD_SIZE - nbytes)
                return 1;
@@@ -2179,7 -2176,7 +2179,7 @@@ void show_stack(struct task_struct *tsk
                 * See if this is an exception frame.
                 * We look for the "regshere" marker in the current frame.
                 */
 -              if (validate_sp(sp, tsk, STACK_INT_FRAME_SIZE)
 +              if (validate_sp(sp, tsk, STACK_FRAME_WITH_PT_REGS)
                    && stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) {
                        struct pt_regs *regs = (struct pt_regs *)
                                (sp + STACK_FRAME_OVERHEAD);
diff --combined arch/riscv/kernel/process.c
index 19f4688f2f36704701cc357b31d7886257effafe,06d326caa7d8aec3da53ac601ae2020eede76596..6f728e731bedf56020516cbdc0076e53ce518b85
  #include <asm/unistd.h>
  #include <asm/processor.h>
  #include <asm/csr.h>
 +#include <asm/stacktrace.h>
  #include <asm/string.h>
  #include <asm/switch_to.h>
  #include <asm/thread_info.h>
  
  register unsigned long gp_in_global __asm__("gp");
  
 -#ifdef CONFIG_STACKPROTECTOR
 +#if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK)
  #include <linux/stackprotector.h>
  unsigned long __stack_chk_guard __read_mostly;
  EXPORT_SYMBOL(__stack_chk_guard);
@@@ -40,16 -39,11 +40,16 @@@ void arch_cpu_idle(void
        raw_local_irq_enable();
  }
  
 -void show_regs(struct pt_regs *regs)
 +void __show_regs(struct pt_regs *regs)
  {
        show_regs_print_info(KERN_DEFAULT);
  
 -      pr_cont("epc: " REG_FMT " ra : " REG_FMT " sp : " REG_FMT "\n",
 +      if (!user_mode(regs)) {
 +              pr_cont("epc : %pS\n", (void *)regs->epc);
 +              pr_cont(" ra : %pS\n", (void *)regs->ra);
 +      }
 +
 +      pr_cont("epc : " REG_FMT " ra : " REG_FMT " sp : " REG_FMT "\n",
                regs->epc, regs->ra, regs->sp);
        pr_cont(" gp : " REG_FMT " tp : " REG_FMT " t0 : " REG_FMT "\n",
                regs->gp, regs->tp, regs->t0);
        pr_cont("status: " REG_FMT " badaddr: " REG_FMT " cause: " REG_FMT "\n",
                regs->status, regs->badaddr, regs->cause);
  }
 +void show_regs(struct pt_regs *regs)
 +{
 +      __show_regs(regs);
 +      if (!user_mode(regs))
 +              dump_backtrace(regs, NULL, KERN_DEFAULT);
 +}
  
  void start_thread(struct pt_regs *regs, unsigned long pc,
        unsigned long sp)
@@@ -124,7 -112,7 +124,7 @@@ int copy_thread(unsigned long clone_fla
        struct pt_regs *childregs = task_pt_regs(p);
  
        /* p->thread holds context to be restored by __switch_to() */
-       if (unlikely(p->flags & PF_KTHREAD)) {
+       if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
                /* Kernel thread */
                memset(childregs, 0, sizeof(struct pt_regs));
                childregs->gp = gp_in_global;
diff --combined arch/sparc/kernel/process_32.c
index 7649b14d69f8ad85bccce8db783ef06efea5d153,0f9c606e1e78edce6a3e1abedca7a9498a8e10b8..b91e88058e0ce39675eb095c9c4876e4ea44ad97
@@@ -183,7 -183,7 +183,7 @@@ void exit_thread(struct task_struct *ts
  #ifndef CONFIG_SMP
        if (last_task_used_math == tsk) {
  #else
 -      if (test_ti_thread_flag(task_thread_info(tsk), TIF_USEDFPU)) {
 +      if (test_tsk_thread_flag(tsk, TIF_USEDFPU)) {
  #endif
                /* Keep process from leaving FPU in a bogon state. */
                put_psr(get_psr() | PSR_EF);
@@@ -309,7 -309,7 +309,7 @@@ int copy_thread(unsigned long clone_fla
        ti->ksp = (unsigned long) new_stack;
        p->thread.kregs = childregs;
  
-       if (unlikely(p->flags & PF_KTHREAD)) {
+       if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
                extern int nwindows;
                unsigned long psr;
                memset(new_stack, 0, STACKFRAME_SZ + TRACEREG_SZ);
diff --combined fs/io_uring.c
index c9f5f295c2acfc9e0dc9d16fadeec1e5a8e5e983,4d79732d7d6b21435e32d455714bddecb8f39d91..4a088581b0f2dfca2845eb23e72125349d1c4eea
@@@ -57,7 -57,6 +57,6 @@@
  #include <linux/mman.h>
  #include <linux/percpu.h>
  #include <linux/slab.h>
- #include <linux/kthread.h>
  #include <linux/blkdev.h>
  #include <linux/bvec.h>
  #include <linux/net.h>
@@@ -254,6 -253,11 +253,11 @@@ struct io_restriction 
        bool registered;
  };
  
+ enum {
+       IO_SQ_THREAD_SHOULD_STOP = 0,
+       IO_SQ_THREAD_SHOULD_PARK,
+ };
  struct io_sq_data {
        refcount_t              refs;
        struct mutex            lock;
        struct wait_queue_head  wait;
  
        unsigned                sq_thread_idle;
+       int                     sq_cpu;
+       pid_t                   task_pid;
+       unsigned long           state;
+       struct completion       startup;
+       struct completion       completion;
+       struct completion       exited;
  };
  
  #define IO_IOPOLL_BATCH                       8
@@@ -323,12 -334,12 +334,12 @@@ struct io_ring_ctx 
        struct {
                unsigned int            flags;
                unsigned int            compat: 1;
-               unsigned int            limit_mem: 1;
                unsigned int            cq_overflow_flushed: 1;
                unsigned int            drain_next: 1;
                unsigned int            eventfd_async: 1;
                unsigned int            restricted: 1;
                unsigned int            sqo_dead: 1;
+               unsigned int            sqo_exec: 1;
  
                /*
                 * Ring buffer of indices into array of io_uring_sqe, which is
                unsigned                cached_cq_overflow;
                unsigned long           sq_check_overflow;
  
+               /* hashed buffered write serialization */
+               struct io_wq_hash       *hash_map;
                struct list_head        defer_list;
                struct list_head        timeout_list;
                struct list_head        cq_overflow_list;
  
        struct io_rings *rings;
  
-       /* IO offload */
-       struct io_wq            *io_wq;
        /*
-        * For SQPOLL usage - we hold a reference to the parent task, so we
-        * have access to the ->files
+        * For SQPOLL usage
         */
        struct task_struct      *sqo_task;
  
        /* Only used for accounting purposes */
        struct mm_struct        *mm_account;
  
- #ifdef CONFIG_BLK_CGROUP
-       struct cgroup_subsys_state      *sqo_blkcg_css;
- #endif
        struct io_sq_data       *sq_data;       /* if using sq thread polling */
  
        struct wait_queue_head  sqo_sq_wait;
  
        struct user_struct      *user;
  
-       const struct cred       *creds;
- #ifdef CONFIG_AUDIT
-       kuid_t                  loginuid;
-       unsigned int            sessionid;
- #endif
        struct completion       ref_comp;
        struct completion       sq_thread_comp;
  
  
        struct io_restriction           restrictions;
  
+       /* exit task_work */
+       struct callback_head            *exit_task_work;
+       struct wait_queue_head          hash_wait;
        /* Keep this last, we don't need it for the fast path */
        struct work_struct              exit_work;
  };
@@@ -838,7 -842,6 +842,6 @@@ struct io_op_def 
        unsigned                plug : 1;
        /* size of async data needed, if any */
        unsigned short          async_size;
-       unsigned                work_flags;
  };
  
  static const struct io_op_def io_op_defs[] = {
                .needs_async_data       = 1,
                .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
-               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
        },
        [IORING_OP_WRITEV] = {
                .needs_file             = 1,
                .needs_async_data       = 1,
                .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
-               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
-                                               IO_WQ_WORK_FSIZE,
        },
        [IORING_OP_FSYNC] = {
                .needs_file             = 1,
-               .work_flags             = IO_WQ_WORK_BLKCG,
        },
        [IORING_OP_READ_FIXED] = {
                .needs_file             = 1,
                .pollin                 = 1,
                .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
-               .work_flags             = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
        },
        [IORING_OP_WRITE_FIXED] = {
                .needs_file             = 1,
                .pollout                = 1,
                .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
-               .work_flags             = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE |
-                                               IO_WQ_WORK_MM,
        },
        [IORING_OP_POLL_ADD] = {
                .needs_file             = 1,
        [IORING_OP_POLL_REMOVE] = {},
        [IORING_OP_SYNC_FILE_RANGE] = {
                .needs_file             = 1,
-               .work_flags             = IO_WQ_WORK_BLKCG,
        },
        [IORING_OP_SENDMSG] = {
                .needs_file             = 1,
                .pollout                = 1,
                .needs_async_data       = 1,
                .async_size             = sizeof(struct io_async_msghdr),
-               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
-                                               IO_WQ_WORK_FS,
        },
        [IORING_OP_RECVMSG] = {
                .needs_file             = 1,
                .buffer_select          = 1,
                .needs_async_data       = 1,
                .async_size             = sizeof(struct io_async_msghdr),
-               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
-                                               IO_WQ_WORK_FS,
        },
        [IORING_OP_TIMEOUT] = {
                .needs_async_data       = 1,
                .async_size             = sizeof(struct io_timeout_data),
-               .work_flags             = IO_WQ_WORK_MM,
        },
        [IORING_OP_TIMEOUT_REMOVE] = {
                /* used by timeout updates' prep() */
-               .work_flags             = IO_WQ_WORK_MM,
        },
        [IORING_OP_ACCEPT] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
                .pollin                 = 1,
-               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_FILES,
        },
        [IORING_OP_ASYNC_CANCEL] = {},
        [IORING_OP_LINK_TIMEOUT] = {
                .needs_async_data       = 1,
                .async_size             = sizeof(struct io_timeout_data),
-               .work_flags             = IO_WQ_WORK_MM,
        },
        [IORING_OP_CONNECT] = {
                .needs_file             = 1,
                .pollout                = 1,
                .needs_async_data       = 1,
                .async_size             = sizeof(struct io_async_connect),
-               .work_flags             = IO_WQ_WORK_MM,
        },
        [IORING_OP_FALLOCATE] = {
                .needs_file             = 1,
-               .work_flags             = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE,
-       },
-       [IORING_OP_OPENAT] = {
-               .work_flags             = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG |
-                                               IO_WQ_WORK_FS | IO_WQ_WORK_MM,
-       },
-       [IORING_OP_CLOSE] = {
-               .work_flags             = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG,
-       },
-       [IORING_OP_FILES_UPDATE] = {
-               .work_flags             = IO_WQ_WORK_FILES | IO_WQ_WORK_MM,
-       },
-       [IORING_OP_STATX] = {
-               .work_flags             = IO_WQ_WORK_FILES | IO_WQ_WORK_MM |
-                                               IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
        },
+       [IORING_OP_OPENAT] = {},
+       [IORING_OP_CLOSE] = {},
+       [IORING_OP_FILES_UPDATE] = {},
+       [IORING_OP_STATX] = {},
        [IORING_OP_READ] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
                .buffer_select          = 1,
                .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
-               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
        },
        [IORING_OP_WRITE] = {
                .needs_file             = 1,
                .pollout                = 1,
                .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
-               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
-                                               IO_WQ_WORK_FSIZE,
        },
        [IORING_OP_FADVISE] = {
                .needs_file             = 1,
-               .work_flags             = IO_WQ_WORK_BLKCG,
-       },
-       [IORING_OP_MADVISE] = {
-               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
        },
+       [IORING_OP_MADVISE] = {},
        [IORING_OP_SEND] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
                .pollout                = 1,
-               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
        },
        [IORING_OP_RECV] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
                .pollin                 = 1,
                .buffer_select          = 1,
-               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
        },
        [IORING_OP_OPENAT2] = {
-               .work_flags             = IO_WQ_WORK_FILES | IO_WQ_WORK_FS |
-                                               IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
        },
        [IORING_OP_EPOLL_CTL] = {
                .unbound_nonreg_file    = 1,
-               .work_flags             = IO_WQ_WORK_FILES,
        },
        [IORING_OP_SPLICE] = {
                .needs_file             = 1,
                .hash_reg_file          = 1,
                .unbound_nonreg_file    = 1,
-               .work_flags             = IO_WQ_WORK_BLKCG,
        },
        [IORING_OP_PROVIDE_BUFFERS] = {},
        [IORING_OP_REMOVE_BUFFERS] = {},
        [IORING_OP_SHUTDOWN] = {
                .needs_file             = 1,
        },
-       [IORING_OP_RENAMEAT] = {
-               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
-                                               IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
-       },
-       [IORING_OP_UNLINKAT] = {
-               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
-                                               IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
-       },
+       [IORING_OP_RENAMEAT] = {},
+       [IORING_OP_UNLINKAT] = {},
  };
  
  static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
                                         struct task_struct *task,
                                         struct files_struct *files);
+ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx);
  static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node);
  static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
                        struct io_ring_ctx *ctx);
@@@ -1106,6 -1064,21 +1064,6 @@@ static inline void io_set_resource_node
        }
  }
  
 -static bool io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
 -{
 -      if (!percpu_ref_tryget(ref)) {
 -              /* already at zero, wait for ->release() */
 -              if (!try_wait_for_completion(compl))
 -                      synchronize_rcu();
 -              return false;
 -      }
 -
 -      percpu_ref_resurrect(ref);
 -      reinit_completion(compl);
 -      percpu_ref_put(ref);
 -      return true;
 -}
 -
  static bool io_match_task(struct io_kiocb *head,
                          struct task_struct *task,
                          struct files_struct *files)
                        continue;
                if (req->file && req->file->f_op == &io_uring_fops)
                        return true;
-               if ((req->work.flags & IO_WQ_WORK_FILES) &&
-                   req->work.identity->files == files)
+               if (req->task->files == files)
                        return true;
        }
        return false;
  }
  
- static void io_sq_thread_drop_mm_files(void)
- {
-       struct files_struct *files = current->files;
-       struct mm_struct *mm = current->mm;
-       if (mm) {
-               kthread_unuse_mm(mm);
-               mmput(mm);
-               current->mm = NULL;
-       }
-       if (files) {
-               struct nsproxy *nsproxy = current->nsproxy;
-               task_lock(current);
-               current->files = NULL;
-               current->nsproxy = NULL;
-               task_unlock(current);
-               put_files_struct(files);
-               put_nsproxy(nsproxy);
-       }
- }
- static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx)
- {
-       if (!current->files) {
-               struct files_struct *files;
-               struct nsproxy *nsproxy;
-               task_lock(ctx->sqo_task);
-               files = ctx->sqo_task->files;
-               if (!files) {
-                       task_unlock(ctx->sqo_task);
-                       return -EOWNERDEAD;
-               }
-               atomic_inc(&files->count);
-               get_nsproxy(ctx->sqo_task->nsproxy);
-               nsproxy = ctx->sqo_task->nsproxy;
-               task_unlock(ctx->sqo_task);
-               task_lock(current);
-               current->files = files;
-               current->nsproxy = nsproxy;
-               task_unlock(current);
-       }
-       return 0;
- }
- static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
- {
-       struct mm_struct *mm;
-       if (current->mm)
-               return 0;
-       task_lock(ctx->sqo_task);
-       mm = ctx->sqo_task->mm;
-       if (unlikely(!mm || !mmget_not_zero(mm)))
-               mm = NULL;
-       task_unlock(ctx->sqo_task);
-       if (mm) {
-               kthread_use_mm(mm);
-               return 0;
-       }
-       return -EFAULT;
- }
- static int __io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
-                                          struct io_kiocb *req)
- {
-       const struct io_op_def *def = &io_op_defs[req->opcode];
-       int ret;
-       if (def->work_flags & IO_WQ_WORK_MM) {
-               ret = __io_sq_thread_acquire_mm(ctx);
-               if (unlikely(ret))
-                       return ret;
-       }
-       if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) {
-               ret = __io_sq_thread_acquire_files(ctx);
-               if (unlikely(ret))
-                       return ret;
-       }
-       return 0;
- }
- static inline int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
-                                               struct io_kiocb *req)
- {
-       if (!(ctx->flags & IORING_SETUP_SQPOLL))
-               return 0;
-       return __io_sq_thread_acquire_mm_files(ctx, req);
- }
- static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx,
-                                        struct cgroup_subsys_state **cur_css)
- {
- #ifdef CONFIG_BLK_CGROUP
-       /* puts the old one when swapping */
-       if (*cur_css != ctx->sqo_blkcg_css) {
-               kthread_associate_blkcg(ctx->sqo_blkcg_css);
-               *cur_css = ctx->sqo_blkcg_css;
-       }
- #endif
- }
- static void io_sq_thread_unassociate_blkcg(void)
- {
- #ifdef CONFIG_BLK_CGROUP
-       kthread_associate_blkcg(NULL);
- #endif
- }
  static inline void req_set_fail_links(struct io_kiocb *req)
  {
        if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
                req->flags |= REQ_F_FAIL_LINK;
  }
  
- /*
-  * None of these are dereferenced, they are simply used to check if any of
-  * them have changed. If we're under current and check they are still the
-  * same, we're fine to grab references to them for actual out-of-line use.
-  */
- static void io_init_identity(struct io_identity *id)
- {
-       id->files = current->files;
-       id->mm = current->mm;
- #ifdef CONFIG_BLK_CGROUP
-       rcu_read_lock();
-       id->blkcg_css = blkcg_css();
-       rcu_read_unlock();
- #endif
-       id->creds = current_cred();
-       id->nsproxy = current->nsproxy;
-       id->fs = current->fs;
-       id->fsize = rlimit(RLIMIT_FSIZE);
- #ifdef CONFIG_AUDIT
-       id->loginuid = current->loginuid;
-       id->sessionid = current->sessionid;
- #endif
-       refcount_set(&id->count, 1);
- }
  static inline void __io_req_init_async(struct io_kiocb *req)
  {
        memset(&req->work, 0, sizeof(req->work));
   */
  static inline void io_req_init_async(struct io_kiocb *req)
  {
-       struct io_uring_task *tctx = current->io_uring;
        if (req->flags & REQ_F_WORK_INITIALIZED)
                return;
  
        __io_req_init_async(req);
-       /* Grab a ref if this isn't our static identity */
-       req->work.identity = tctx->identity;
-       if (tctx->identity != &tctx->__identity)
-               refcount_inc(&req->work.identity->count);
  }
  
  static void io_ring_ctx_ref_free(struct percpu_ref *ref)
@@@ -1388,40 -1211,14 +1196,14 @@@ static bool req_need_defer(struct io_ki
        return false;
  }
  
- static void io_put_identity(struct io_uring_task *tctx, struct io_kiocb *req)
- {
-       if (req->work.identity == &tctx->__identity)
-               return;
-       if (refcount_dec_and_test(&req->work.identity->count))
-               kfree(req->work.identity);
- }
  static void io_req_clean_work(struct io_kiocb *req)
  {
        if (!(req->flags & REQ_F_WORK_INITIALIZED))
                return;
  
-       if (req->work.flags & IO_WQ_WORK_MM)
-               mmdrop(req->work.identity->mm);
- #ifdef CONFIG_BLK_CGROUP
-       if (req->work.flags & IO_WQ_WORK_BLKCG)
-               css_put(req->work.identity->blkcg_css);
- #endif
-       if (req->work.flags & IO_WQ_WORK_CREDS)
-               put_cred(req->work.identity->creds);
-       if (req->work.flags & IO_WQ_WORK_FS) {
-               struct fs_struct *fs = req->work.identity->fs;
-               spin_lock(&req->work.identity->fs->lock);
-               if (--fs->users)
-                       fs = NULL;
-               spin_unlock(&req->work.identity->fs->lock);
-               if (fs)
-                       free_fs_struct(fs);
-       }
-       if (req->work.flags & IO_WQ_WORK_FILES) {
-               put_files_struct(req->work.identity->files);
-               put_nsproxy(req->work.identity->nsproxy);
+       if (req->work.creds) {
+               put_cred(req->work.creds);
+               req->work.creds = NULL;
        }
        if (req->flags & REQ_F_INFLIGHT) {
                struct io_ring_ctx *ctx = req->ctx;
        }
  
        req->flags &= ~REQ_F_WORK_INITIALIZED;
-       req->work.flags &= ~(IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | IO_WQ_WORK_FS |
-                            IO_WQ_WORK_CREDS | IO_WQ_WORK_FILES);
-       io_put_identity(req->task->io_uring, req);
- }
- /*
-  * Create a private copy of io_identity, since some fields don't match
-  * the current context.
-  */
- static bool io_identity_cow(struct io_kiocb *req)
- {
-       struct io_uring_task *tctx = current->io_uring;
-       const struct cred *creds = NULL;
-       struct io_identity *id;
-       if (req->work.flags & IO_WQ_WORK_CREDS)
-               creds = req->work.identity->creds;
-       id = kmemdup(req->work.identity, sizeof(*id), GFP_KERNEL);
-       if (unlikely(!id)) {
-               req->work.flags |= IO_WQ_WORK_CANCEL;
-               return false;
-       }
-       /*
-        * We can safely just re-init the creds we copied  Either the field
-        * matches the current one, or we haven't grabbed it yet. The only
-        * exception is ->creds, through registered personalities, so handle
-        * that one separately.
-        */
-       io_init_identity(id);
-       if (creds)
-               id->creds = creds;
-       /* add one for this request */
-       refcount_inc(&id->count);
-       /* drop tctx and req identity references, if needed */
-       if (tctx->identity != &tctx->__identity &&
-           refcount_dec_and_test(&tctx->identity->count))
-               kfree(tctx->identity);
-       if (req->work.identity != &tctx->__identity &&
-           refcount_dec_and_test(&req->work.identity->count))
-               kfree(req->work.identity);
-       req->work.identity = id;
-       tctx->identity = id;
-       return true;
  }
  
  static void io_req_track_inflight(struct io_kiocb *req)
        }
  }
  
- static bool io_grab_identity(struct io_kiocb *req)
- {
-       const struct io_op_def *def = &io_op_defs[req->opcode];
-       struct io_identity *id = req->work.identity;
-       if (def->work_flags & IO_WQ_WORK_FSIZE) {
-               if (id->fsize != rlimit(RLIMIT_FSIZE))
-                       return false;
-               req->work.flags |= IO_WQ_WORK_FSIZE;
-       }
- #ifdef CONFIG_BLK_CGROUP
-       if (!(req->work.flags & IO_WQ_WORK_BLKCG) &&
-           (def->work_flags & IO_WQ_WORK_BLKCG)) {
-               rcu_read_lock();
-               if (id->blkcg_css != blkcg_css()) {
-                       rcu_read_unlock();
-                       return false;
-               }
-               /*
-                * This should be rare, either the cgroup is dying or the task
-                * is moving cgroups. Just punt to root for the handful of ios.
-                */
-               if (css_tryget_online(id->blkcg_css))
-                       req->work.flags |= IO_WQ_WORK_BLKCG;
-               rcu_read_unlock();
-       }
- #endif
-       if (!(req->work.flags & IO_WQ_WORK_CREDS)) {
-               if (id->creds != current_cred())
-                       return false;
-               get_cred(id->creds);
-               req->work.flags |= IO_WQ_WORK_CREDS;
-       }
- #ifdef CONFIG_AUDIT
-       if (!uid_eq(current->loginuid, id->loginuid) ||
-           current->sessionid != id->sessionid)
-               return false;
- #endif
-       if (!(req->work.flags & IO_WQ_WORK_FS) &&
-           (def->work_flags & IO_WQ_WORK_FS)) {
-               if (current->fs != id->fs)
-                       return false;
-               spin_lock(&id->fs->lock);
-               if (!id->fs->in_exec) {
-                       id->fs->users++;
-                       req->work.flags |= IO_WQ_WORK_FS;
-               } else {
-                       req->work.flags |= IO_WQ_WORK_CANCEL;
-               }
-               spin_unlock(&current->fs->lock);
-       }
-       if (!(req->work.flags & IO_WQ_WORK_FILES) &&
-           (def->work_flags & IO_WQ_WORK_FILES) &&
-           !(req->flags & REQ_F_NO_FILE_TABLE)) {
-               if (id->files != current->files ||
-                   id->nsproxy != current->nsproxy)
-                       return false;
-               atomic_inc(&id->files->count);
-               get_nsproxy(id->nsproxy);
-               req->work.flags |= IO_WQ_WORK_FILES;
-               io_req_track_inflight(req);
-       }
-       if (!(req->work.flags & IO_WQ_WORK_MM) &&
-           (def->work_flags & IO_WQ_WORK_MM)) {
-               if (id->mm != current->mm)
-                       return false;
-               mmgrab(id->mm);
-               req->work.flags |= IO_WQ_WORK_MM;
-       }
-       return true;
- }
  static void io_prep_async_work(struct io_kiocb *req)
  {
        const struct io_op_def *def = &io_op_defs[req->opcode];
                if (def->unbound_nonreg_file)
                        req->work.flags |= IO_WQ_WORK_UNBOUND;
        }
-       /* if we fail grabbing identity, we must COW, regrab, and retry */
-       if (io_grab_identity(req))
-               return;
-       if (!io_identity_cow(req))
-               return;
-       /* can't fail at this point */
-       if (!io_grab_identity(req))
-               WARN_ON(1);
+       if (!req->work.creds)
+               req->work.creds = get_current_cred();
  }
  
  static void io_prep_async_link(struct io_kiocb *req)
@@@ -1616,10 -1283,14 +1268,14 @@@ static struct io_kiocb *__io_queue_asyn
  {
        struct io_ring_ctx *ctx = req->ctx;
        struct io_kiocb *link = io_prep_linked_timeout(req);
+       struct io_uring_task *tctx = req->task->io_uring;
+       BUG_ON(!tctx);
+       BUG_ON(!tctx->io_wq);
  
        trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
                                        &req->work, req->flags);
-       io_wq_enqueue(ctx->io_wq, &req->work);
+       io_wq_enqueue(tctx->io_wq, &req->work);
        return link;
  }
  
@@@ -2313,11 -1984,14 +1969,14 @@@ static int io_req_task_work_add(struct 
  static void io_req_task_work_add_fallback(struct io_kiocb *req,
                                          task_work_func_t cb)
  {
-       struct task_struct *tsk = io_wq_get_task(req->ctx->io_wq);
+       struct io_ring_ctx *ctx = req->ctx;
+       struct callback_head *head;
  
        init_task_work(&req->task_work, cb);
-       task_work_add(tsk, &req->task_work, TWA_NONE);
-       wake_up_process(tsk);
+       do {
+               head = READ_ONCE(ctx->exit_task_work);
+               req->task_work.next = head;
+       } while (cmpxchg(&ctx->exit_task_work, head, &req->task_work) != head);
  }
  
  static void __io_req_task_cancel(struct io_kiocb *req, int error)
@@@ -2351,15 -2025,11 +2010,11 @@@ static void __io_req_task_submit(struc
  
        /* ctx stays valid until unlock, even if we drop all ours ctx->refs */
        mutex_lock(&ctx->uring_lock);
-       if (!ctx->sqo_dead && !(current->flags & PF_EXITING) &&
-           !io_sq_thread_acquire_mm_files(ctx, req))
+       if (!ctx->sqo_dead && !(current->flags & PF_EXITING) && !current->in_execve)
                __io_queue_sqe(req);
        else
                __io_req_task_cancel(req, -EFAULT);
        mutex_unlock(&ctx->uring_lock);
-       if (ctx->flags & IORING_SETUP_SQPOLL)
-               io_sq_thread_drop_mm_files();
  }
  
  static void io_req_task_submit(struct callback_head *cb)
@@@ -2823,25 -2493,15 +2478,22 @@@ static bool io_rw_reissue(struct io_kio
  {
  #ifdef CONFIG_BLOCK
        umode_t mode = file_inode(req->file)->i_mode;
-       int ret;
  
        if (!S_ISBLK(mode) && !S_ISREG(mode))
                return false;
        if ((req->flags & REQ_F_NOWAIT) || io_wq_current_is_worker())
                return false;
 +      /*
 +       * If ref is dying, we might be running poll reap from the exit work.
 +       * Don't attempt to reissue from that path, just let it fail with
 +       * -EAGAIN.
 +       */
 +      if (percpu_ref_is_dying(&req->ctx->refs))
 +              return false;
  
        lockdep_assert_held(&req->ctx->uring_lock);
  
-       ret = io_sq_thread_acquire_mm_files(req->ctx, req);
-       if (!ret && io_resubmit_prep(req)) {
+       if (io_resubmit_prep(req)) {
                refcount_inc(&req->refs);
                io_queue_async_work(req);
                return true;
@@@ -5946,12 -5606,15 +5598,15 @@@ static bool io_cancel_cb(struct io_wq_w
        return req->user_data == (unsigned long) data;
  }
  
- static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
+ static int io_async_cancel_one(struct io_uring_task *tctx, void *sqe_addr)
  {
        enum io_wq_cancel cancel_ret;
        int ret = 0;
  
-       cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false);
+       if (!tctx->io_wq)
+               return -ENOENT;
+       cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, sqe_addr, false);
        switch (cancel_ret) {
        case IO_WQ_CANCEL_OK:
                ret = 0;
@@@ -5974,7 -5637,8 +5629,8 @@@ static void io_async_find_and_cancel(st
        unsigned long flags;
        int ret;
  
-       ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
+       ret = io_async_cancel_one(req->task->io_uring,
+                                       (void *) (unsigned long) sqe_addr);
        if (ret != -ENOENT) {
                spin_lock_irqsave(&ctx->completion_lock, flags);
                goto done;
@@@ -6563,10 -6227,9 +6219,9 @@@ static void __io_queue_sqe(struct io_ki
        const struct cred *old_creds = NULL;
        int ret;
  
-       if ((req->flags & REQ_F_WORK_INITIALIZED) &&
-           (req->work.flags & IO_WQ_WORK_CREDS) &&
-           req->work.identity->creds != current_cred())
-               old_creds = override_creds(req->work.identity->creds);
+       if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds &&
+           req->work.creds != current_cred())
+               old_creds = override_creds(req->work.creds);
  
        ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
  
@@@ -6684,9 -6347,6 +6339,6 @@@ static int io_init_req(struct io_ring_c
        if (unlikely(req->opcode >= IORING_OP_LAST))
                return -EINVAL;
  
-       if (unlikely(io_sq_thread_acquire_mm_files(ctx, req)))
-               return -EFAULT;
        if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
                return -EACCES;
  
  
        id = READ_ONCE(sqe->personality);
        if (id) {
-               struct io_identity *iod;
-               iod = idr_find(&ctx->personality_idr, id);
-               if (unlikely(!iod))
-                       return -EINVAL;
-               refcount_inc(&iod->count);
                __io_req_init_async(req);
-               get_cred(iod->creds);
-               req->work.identity = iod;
-               req->work.flags |= IO_WQ_WORK_CREDS;
+               req->work.creds = idr_find(&ctx->personality_idr, id);
+               if (unlikely(!req->work.creds))
+                       return -EINVAL;
+               get_cred(req->work.creds);
        }
  
        state = &ctx->submit_state;
@@@ -7008,71 -6662,97 +6654,97 @@@ static void io_sqd_init_new(struct io_s
        io_sqd_update_thread_idle(sqd);
  }
  
+ static bool io_sq_thread_should_stop(struct io_sq_data *sqd)
+ {
+       return test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+ }
+ static bool io_sq_thread_should_park(struct io_sq_data *sqd)
+ {
+       return test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
+ }
+ static void io_sq_thread_parkme(struct io_sq_data *sqd)
+ {
+       for (;;) {
+               /*
+                * TASK_PARKED is a special state; we must serialize against
+                * possible pending wakeups to avoid store-store collisions on
+                * task->state.
+                *
+                * Such a collision might possibly result in the task state
+                * changing from TASK_PARKED and us failing the
+                * wait_task_inactive() in kthread_park().
+                */
+               set_special_state(TASK_PARKED);
+               if (!test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state))
+                       break;
+               /*
+                * Thread is going to call schedule(), do not preempt it,
+                * or the caller of kthread_park() may spend more time in
+                * wait_task_inactive().
+                */
+               preempt_disable();
+               complete(&sqd->completion);
+               schedule_preempt_disabled();
+               preempt_enable();
+       }
+       __set_current_state(TASK_RUNNING);
+ }
  static int io_sq_thread(void *data)
  {
-       struct cgroup_subsys_state *cur_css = NULL;
-       struct files_struct *old_files = current->files;
-       struct nsproxy *old_nsproxy = current->nsproxy;
-       const struct cred *old_cred = NULL;
        struct io_sq_data *sqd = data;
        struct io_ring_ctx *ctx;
        unsigned long timeout = 0;
+       char buf[TASK_COMM_LEN];
        DEFINE_WAIT(wait);
  
-       task_lock(current);
-       current->files = NULL;
-       current->nsproxy = NULL;
-       task_unlock(current);
+       sprintf(buf, "iou-sqp-%d", sqd->task_pid);
+       set_task_comm(current, buf);
+       sqd->thread = current;
+       current->pf_io_worker = NULL;
+       if (sqd->sq_cpu != -1)
+               set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
+       else
+               set_cpus_allowed_ptr(current, cpu_online_mask);
+       current->flags |= PF_NO_SETAFFINITY;
+       complete(&sqd->completion);
  
-       while (!kthread_should_stop()) {
+       wait_for_completion(&sqd->startup);
+       while (!io_sq_thread_should_stop(sqd)) {
                int ret;
                bool cap_entries, sqt_spin, needs_sched;
  
                /*
                 * Any changes to the sqd lists are synchronized through the
-                * kthread parking. This synchronizes the thread vs users,
+                * thread parking. This synchronizes the thread vs users,
                 * the users are synchronized on the sqd->ctx_lock.
                 */
-               if (kthread_should_park()) {
-                       kthread_parkme();
-                       /*
-                        * When sq thread is unparked, in case the previous park operation
-                        * comes from io_put_sq_data(), which means that sq thread is going
-                        * to be stopped, so here needs to have a check.
-                        */
-                       if (kthread_should_stop())
-                               break;
+               if (io_sq_thread_should_park(sqd)) {
+                       io_sq_thread_parkme(sqd);
+                       continue;
                }
                if (unlikely(!list_empty(&sqd->ctx_new_list))) {
                        io_sqd_init_new(sqd);
                        timeout = jiffies + sqd->sq_thread_idle;
                }
+               if (fatal_signal_pending(current))
+                       break;
                sqt_spin = false;
                cap_entries = !list_is_singular(&sqd->ctx_list);
                list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
-                       if (current->cred != ctx->creds) {
-                               if (old_cred)
-                                       revert_creds(old_cred);
-                               old_cred = override_creds(ctx->creds);
-                       }
-                       io_sq_thread_associate_blkcg(ctx, &cur_css);
- #ifdef CONFIG_AUDIT
-                       current->loginuid = ctx->loginuid;
-                       current->sessionid = ctx->sessionid;
- #endif
                        ret = __io_sq_thread(ctx, cap_entries);
                        if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
                                sqt_spin = true;
-                       io_sq_thread_drop_mm_files();
                }
  
                if (sqt_spin || !time_after(jiffies, timeout)) {
                        io_run_task_work();
-                       io_sq_thread_drop_mm_files();
                        cond_resched();
                        if (sqt_spin)
                                timeout = jiffies + sqd->sq_thread_idle;
                        }
                }
  
-               if (needs_sched && !kthread_should_park()) {
+               if (needs_sched && !io_sq_thread_should_park(sqd)) {
                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                                io_ring_set_wakeup_flag(ctx);
  
                timeout = jiffies + sqd->sq_thread_idle;
        }
  
-       io_run_task_work();
-       io_sq_thread_drop_mm_files();
-       if (cur_css)
-               io_sq_thread_unassociate_blkcg();
-       if (old_cred)
-               revert_creds(old_cred);
+       list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
+               io_uring_cancel_sqpoll(ctx);
  
-       task_lock(current);
-       current->files = old_files;
-       current->nsproxy = old_nsproxy;
-       task_unlock(current);
+       io_run_task_work();
  
-       kthread_parkme();
+       /*
+        * Clear thread under lock so that concurrent parks work correctly
+        */
+       complete_all(&sqd->completion);
+       mutex_lock(&sqd->lock);
+       sqd->thread = NULL;
+       list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+               ctx->sqo_exec = 1;
+               io_ring_set_wakeup_flag(ctx);
+       }
+       mutex_unlock(&sqd->lock);
  
-       return 0;
+       complete(&sqd->exited);
+       do_exit(0);
  }
  
  struct io_wait_queue {
@@@ -7343,13 -7026,11 +7018,13 @@@ static int io_rsrc_ref_quiesce(struct f
                flush_delayed_work(&ctx->rsrc_put_work);
  
                ret = wait_for_completion_interruptible(&data->done);
 -              if (!ret || !io_refs_resurrect(&data->refs, &data->done))
 +              if (!ret)
                        break;
  
 +              percpu_ref_resurrect(&data->refs);
                io_sqe_rsrc_set_node(ctx, data, backup_node);
                backup_node = NULL;
 +              reinit_completion(&data->done);
                mutex_unlock(&ctx->uring_lock);
                ret = io_run_task_work_sig();
                mutex_lock(&ctx->uring_lock);
@@@ -7413,20 -7094,74 +7088,74 @@@ static int io_sqe_files_unregister(stru
        return 0;
  }
  
+ static void io_sq_thread_unpark(struct io_sq_data *sqd)
+       __releases(&sqd->lock)
+ {
+       if (!sqd->thread)
+               return;
+       if (sqd->thread == current)
+               return;
+       clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
+       wake_up_state(sqd->thread, TASK_PARKED);
+       mutex_unlock(&sqd->lock);
+ }
+ static bool io_sq_thread_park(struct io_sq_data *sqd)
+       __acquires(&sqd->lock)
+ {
+       if (sqd->thread == current)
+               return true;
+       mutex_lock(&sqd->lock);
+       if (!sqd->thread) {
+               mutex_unlock(&sqd->lock);
+               return false;
+       }
+       set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
+       wake_up_process(sqd->thread);
+       wait_for_completion(&sqd->completion);
+       return true;
+ }
+ static void io_sq_thread_stop(struct io_sq_data *sqd)
+ {
+       if (!sqd->thread)
+               return;
+       set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+       WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state));
+       wake_up_process(sqd->thread);
+       wait_for_completion(&sqd->exited);
+ }
  static void io_put_sq_data(struct io_sq_data *sqd)
  {
        if (refcount_dec_and_test(&sqd->refs)) {
-               /*
-                * The park is a bit of a work-around, without it we get
-                * warning spews on shutdown with SQPOLL set and affinity
-                * set to a single CPU.
-                */
+               io_sq_thread_stop(sqd);
+               kfree(sqd);
+       }
+ }
+ static void io_sq_thread_finish(struct io_ring_ctx *ctx)
+ {
+       struct io_sq_data *sqd = ctx->sq_data;
+       if (sqd) {
+               complete(&sqd->startup);
                if (sqd->thread) {
-                       kthread_park(sqd->thread);
-                       kthread_stop(sqd->thread);
+                       wait_for_completion(&ctx->sq_thread_comp);
+                       io_sq_thread_park(sqd);
                }
  
-               kfree(sqd);
+               mutex_lock(&sqd->ctx_lock);
+               list_del(&ctx->sqd_list);
+               io_sqd_update_thread_idle(sqd);
+               mutex_unlock(&sqd->ctx_lock);
+               if (sqd->thread)
+                       io_sq_thread_unpark(sqd);
+               io_put_sq_data(sqd);
+               ctx->sq_data = NULL;
        }
  }
  
@@@ -7473,68 -7208,12 +7202,12 @@@ static struct io_sq_data *io_get_sq_dat
        mutex_init(&sqd->ctx_lock);
        mutex_init(&sqd->lock);
        init_waitqueue_head(&sqd->wait);
+       init_completion(&sqd->startup);
+       init_completion(&sqd->completion);
+       init_completion(&sqd->exited);
        return sqd;
  }
  
- static void io_sq_thread_unpark(struct io_sq_data *sqd)
-       __releases(&sqd->lock)
- {
-       if (!sqd->thread)
-               return;
-       kthread_unpark(sqd->thread);
-       mutex_unlock(&sqd->lock);
- }
- static void io_sq_thread_park(struct io_sq_data *sqd)
-       __acquires(&sqd->lock)
- {
-       if (!sqd->thread)
-               return;
-       mutex_lock(&sqd->lock);
-       kthread_park(sqd->thread);
- }
- static void io_sq_thread_stop(struct io_ring_ctx *ctx)
- {
-       struct io_sq_data *sqd = ctx->sq_data;
-       if (sqd) {
-               if (sqd->thread) {
-                       /*
-                        * We may arrive here from the error branch in
-                        * io_sq_offload_create() where the kthread is created
-                        * without being waked up, thus wake it up now to make
-                        * sure the wait will complete.
-                        */
-                       wake_up_process(sqd->thread);
-                       wait_for_completion(&ctx->sq_thread_comp);
-                       io_sq_thread_park(sqd);
-               }
-               mutex_lock(&sqd->ctx_lock);
-               list_del(&ctx->sqd_list);
-               io_sqd_update_thread_idle(sqd);
-               mutex_unlock(&sqd->ctx_lock);
-               if (sqd->thread)
-                       io_sq_thread_unpark(sqd);
-               io_put_sq_data(sqd);
-               ctx->sq_data = NULL;
-       }
- }
- static void io_finish_async(struct io_ring_ctx *ctx)
- {
-       io_sq_thread_stop(ctx);
-       if (ctx->io_wq) {
-               io_wq_destroy(ctx->io_wq);
-               ctx->io_wq = NULL;
-       }
- }
  #if defined(CONFIG_UNIX)
  /*
   * Ensure the UNIX gc is aware of our file set, so we are certain that
@@@ -7561,7 -7240,7 +7234,7 @@@ static int __io_sqe_files_scm(struct io
        skb->sk = sk;
  
        nr_files = 0;
-       fpl->user = get_uid(ctx->user);
+       fpl->user = get_uid(current_user());
        for (i = 0; i < nr; i++) {
                struct file *file = io_file_from_index(ctx, i + offset);
  
@@@ -8093,54 -7772,34 +7766,34 @@@ static struct io_wq_work *io_free_work(
        return req ? &req->work : NULL;
  }
  
- static int io_init_wq_offload(struct io_ring_ctx *ctx,
-                             struct io_uring_params *p)
+ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx)
  {
+       struct io_wq_hash *hash;
        struct io_wq_data data;
-       struct fd f;
-       struct io_ring_ctx *ctx_attach;
        unsigned int concurrency;
-       int ret = 0;
-       data.user = ctx->user;
-       data.free_work = io_free_work;
-       data.do_work = io_wq_submit_work;
-       if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
-               /* Do QD, or 4 * CPUS, whatever is smallest */
-               concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
  
-               ctx->io_wq = io_wq_create(concurrency, &data);
-               if (IS_ERR(ctx->io_wq)) {
-                       ret = PTR_ERR(ctx->io_wq);
-                       ctx->io_wq = NULL;
-               }
-               return ret;
+       hash = ctx->hash_map;
+       if (!hash) {
+               hash = kzalloc(sizeof(*hash), GFP_KERNEL);
+               if (!hash)
+                       return ERR_PTR(-ENOMEM);
+               refcount_set(&hash->refs, 1);
+               init_waitqueue_head(&hash->wait);
+               ctx->hash_map = hash;
        }
  
-       f = fdget(p->wq_fd);
-       if (!f.file)
-               return -EBADF;
-       if (f.file->f_op != &io_uring_fops) {
-               ret = -EINVAL;
-               goto out_fput;
-       }
+       data.hash = hash;
+       data.free_work = io_free_work;
+       data.do_work = io_wq_submit_work;
  
-       ctx_attach = f.file->private_data;
-       /* @io_wq is protected by holding the fd */
-       if (!io_wq_get(ctx_attach->io_wq, &data)) {
-               ret = -EINVAL;
-               goto out_fput;
-       }
+       /* Do QD, or 4 * CPUS, whatever is smallest */
+       concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
  
-       ctx->io_wq = ctx_attach->io_wq;
- out_fput:
-       fdput(f);
-       return ret;
+       return io_wq_create(concurrency, &data);
  }
  
- static int io_uring_alloc_task_context(struct task_struct *task)
+ static int io_uring_alloc_task_context(struct task_struct *task,
+                                      struct io_ring_ctx *ctx)
  {
        struct io_uring_task *tctx;
        int ret;
                return ret;
        }
  
+       tctx->io_wq = io_init_wq_offload(ctx);
+       if (IS_ERR(tctx->io_wq)) {
+               ret = PTR_ERR(tctx->io_wq);
+               percpu_counter_destroy(&tctx->inflight);
+               kfree(tctx);
+               return ret;
+       }
        xa_init(&tctx->xa);
        init_waitqueue_head(&tctx->wait);
        tctx->last = NULL;
        atomic_set(&tctx->in_idle, 0);
        tctx->sqpoll = false;
-       io_init_identity(&tctx->__identity);
-       tctx->identity = &tctx->__identity;
        task->io_uring = tctx;
        spin_lock_init(&tctx->task_lock);
        INIT_WQ_LIST(&tctx->task_list);
@@@ -8175,19 -7840,49 +7834,49 @@@ void __io_uring_free(struct task_struc
        struct io_uring_task *tctx = tsk->io_uring;
  
        WARN_ON_ONCE(!xa_empty(&tctx->xa));
-       WARN_ON_ONCE(refcount_read(&tctx->identity->count) != 1);
-       if (tctx->identity != &tctx->__identity)
-               kfree(tctx->identity);
        percpu_counter_destroy(&tctx->inflight);
        kfree(tctx);
        tsk->io_uring = NULL;
  }
  
+ static int io_sq_thread_fork(struct io_sq_data *sqd, struct io_ring_ctx *ctx)
+ {
+       int ret;
+       clear_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+       reinit_completion(&sqd->completion);
+       ctx->sqo_dead = ctx->sqo_exec = 0;
+       sqd->task_pid = current->pid;
+       current->flags |= PF_IO_WORKER;
+       ret = io_wq_fork_thread(io_sq_thread, sqd);
+       current->flags &= ~PF_IO_WORKER;
+       if (ret < 0) {
+               sqd->thread = NULL;
+               return ret;
+       }
+       wait_for_completion(&sqd->completion);
+       return io_uring_alloc_task_context(sqd->thread, ctx);
+ }
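
The tag-fork-untag sequence above also appears verbatim in io_sq_offload_create() below; a minimal sketch of the helper it implies, assuming the child picks up the parent's ->flags when io_wq_fork_thread() clones it (io_fork_worker_sketch is a hypothetical name, not from the patch):

        static int io_fork_worker_sketch(int (*fn)(void *), void *data)
        {
                int ret;

                current->flags |= PF_IO_WORKER;         /* inherited by the new thread */
                ret = io_wq_fork_thread(fn, data);      /* clone-based, replaces kthread_create() */
                current->flags &= ~PF_IO_WORKER;        /* restore the caller's flags */
                return ret;
        }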
  static int io_sq_offload_create(struct io_ring_ctx *ctx,
                                struct io_uring_params *p)
  {
        int ret;
  
+       /* Retain compatibility with failing for an invalid attach attempt */
+       if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
+                               IORING_SETUP_ATTACH_WQ) {
+               struct fd f;
+               f = fdget(p->wq_fd);
+               if (!f.file)
+                       return -ENXIO;
+               if (f.file->f_op != &io_uring_fops) {
+                       fdput(f);
+                       return -EINVAL;
+               }
+               fdput(f);
+       }
        if (ctx->flags & IORING_SETUP_SQPOLL) {
                struct io_sq_data *sqd;
  
                        ctx->sq_thread_idle = HZ;
  
                if (sqd->thread)
-                       goto done;
+                       return 0;
  
                if (p->flags & IORING_SETUP_SQ_AFF) {
                        int cpu = p->sq_thread_cpu;
                        if (!cpu_online(cpu))
                                goto err;
  
-                       sqd->thread = kthread_create_on_cpu(io_sq_thread, sqd,
-                                                       cpu, "io_uring-sq");
+                       sqd->sq_cpu = cpu;
                } else {
-                       sqd->thread = kthread_create(io_sq_thread, sqd,
-                                                       "io_uring-sq");
+                       sqd->sq_cpu = -1;
                }
-               if (IS_ERR(sqd->thread)) {
-                       ret = PTR_ERR(sqd->thread);
+               sqd->task_pid = current->pid;
+               current->flags |= PF_IO_WORKER;
+               ret = io_wq_fork_thread(io_sq_thread, sqd);
+               current->flags &= ~PF_IO_WORKER;
+               if (ret < 0) {
                        sqd->thread = NULL;
                        goto err;
                }
-               ret = io_uring_alloc_task_context(sqd->thread);
+               wait_for_completion(&sqd->completion);
+               ret = io_uring_alloc_task_context(sqd->thread, ctx);
                if (ret)
                        goto err;
        } else if (p->flags & IORING_SETUP_SQ_AFF) {
                goto err;
        }
  
- done:
-       ret = io_init_wq_offload(ctx, p);
-       if (ret)
-               goto err;
        return 0;
  err:
-       io_finish_async(ctx);
+       io_sq_thread_finish(ctx);
        return ret;
  }
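
When IORING_SETUP_ATTACH_WQ is passed without IORING_SETUP_SQPOLL, the wq_fd is now only sanity-checked so existing callers keep their old failure modes, even though no io-wq is shared any more. A hedged userspace example of the call shape (raw syscall; existing_ring_fd is a placeholder):

        #include <linux/io_uring.h>
        #include <string.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        static int setup_attached_ring(int existing_ring_fd)
        {
                struct io_uring_params p;

                memset(&p, 0, sizeof(p));
                p.flags = IORING_SETUP_ATTACH_WQ;       /* wq_fd must still be an io_uring fd */
                p.wq_fd = existing_ring_fd;

                return syscall(__NR_io_uring_setup, 64, &p);
        }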
  
@@@ -8259,8 -7952,8 +7946,8 @@@ static void io_sq_offload_start(struct 
  {
        struct io_sq_data *sqd = ctx->sq_data;
  
-       if ((ctx->flags & IORING_SETUP_SQPOLL) && sqd->thread)
-               wake_up_process(sqd->thread);
+       if (ctx->flags & IORING_SETUP_SQPOLL)
+               complete(&sqd->startup);
  }
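
A minimal sketch of the handshake this complete() presumably pairs with on the thread side; io_sq_thread() itself is not part of this excerpt, so the shape below is an assumption based on the sqd->completion / sqd->startup waits visible in the creation paths above:

        static int io_sq_thread_startup_sketch(void *data)
        {
                struct io_sq_data *sqd = data;

                complete(&sqd->completion);             /* creator's wait_for_completion() returns */
                wait_for_completion(&sqd->startup);     /* parked until io_sq_offload_start() */

                /* submission loop assumed to run from here */
                return 0;
        }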
  
  static inline void __io_unaccount_mem(struct user_struct *user,
@@@ -8290,7 -7983,7 +7977,7 @@@ static inline int __io_account_mem(stru
  
  static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
  {
-       if (ctx->limit_mem)
+       if (ctx->user)
                __io_unaccount_mem(ctx->user, nr_pages);
  
        if (ctx->mm_account)
@@@ -8301,7 -7994,7 +7988,7 @@@ static int io_account_mem(struct io_rin
  {
        int ret;
  
-       if (ctx->limit_mem) {
+       if (ctx->user) {
                ret = __io_account_mem(ctx->user, nr_pages);
                if (ret)
                        return ret;
@@@ -8700,23 -8393,19 +8387,23 @@@ static void io_req_cache_free(struct li
  static void io_req_caches_free(struct io_ring_ctx *ctx, struct task_struct *tsk)
  {
        struct io_submit_state *submit_state = &ctx->submit_state;
 +      struct io_comp_state *cs = &ctx->submit_state.comp;
  
        mutex_lock(&ctx->uring_lock);
  
 -      if (submit_state->free_reqs)
 +      if (submit_state->free_reqs) {
                kmem_cache_free_bulk(req_cachep, submit_state->free_reqs,
                                     submit_state->reqs);
 -
 -      io_req_cache_free(&submit_state->comp.free_list, NULL);
 +              submit_state->free_reqs = 0;
 +      }
  
        spin_lock_irq(&ctx->completion_lock);
 -      io_req_cache_free(&submit_state->comp.locked_free_list, NULL);
 +      list_splice_init(&cs->locked_free_list, &cs->free_list);
 +      cs->locked_free_nr = 0;
        spin_unlock_irq(&ctx->completion_lock);
  
 +      io_req_cache_free(&cs->free_list, NULL);
 +
        mutex_unlock(&ctx->uring_lock);
  }
  
@@@ -8730,21 -8419,14 +8417,14 @@@ static void io_ring_ctx_free(struct io_
        mutex_lock(&ctx->uring_lock);
        mutex_unlock(&ctx->uring_lock);
  
-       io_finish_async(ctx);
+       io_sq_thread_finish(ctx);
        io_sqe_buffers_unregister(ctx);
  
-       if (ctx->sqo_task) {
-               put_task_struct(ctx->sqo_task);
-               ctx->sqo_task = NULL;
+       if (ctx->mm_account) {
                mmdrop(ctx->mm_account);
                ctx->mm_account = NULL;
        }
  
- #ifdef CONFIG_BLK_CGROUP
-       if (ctx->sqo_blkcg_css)
-               css_put(ctx->sqo_blkcg_css);
- #endif
        mutex_lock(&ctx->uring_lock);
        io_sqe_files_unregister(ctx);
        mutex_unlock(&ctx->uring_lock);
  
        percpu_ref_exit(&ctx->refs);
        free_uid(ctx->user);
-       put_cred(ctx->creds);
        io_req_caches_free(ctx, NULL);
+       if (ctx->hash_map)
+               io_wq_put_hash(ctx->hash_map);
        kfree(ctx->cancel_hash);
        kfree(ctx);
  }
@@@ -8812,13 -8495,11 +8493,11 @@@ static int io_uring_fasync(int fd, stru
  
  static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
  {
-       struct io_identity *iod;
+       const struct cred *creds;
  
-       iod = idr_remove(&ctx->personality_idr, id);
-       if (iod) {
-               put_cred(iod->creds);
-               if (refcount_dec_and_test(&iod->count))
-                       kfree(iod);
+       creds = idr_remove(&ctx->personality_idr, id);
+       if (creds) {
+               put_cred(creds);
                return 0;
        }
  
@@@ -8833,6 -8514,28 +8512,28 @@@ static int io_remove_personalities(int 
        return 0;
  }
  
+ static void io_run_ctx_fallback(struct io_ring_ctx *ctx)
+ {
+       struct callback_head *work, *head, *next;
+       do {
+               do {
+                       head = NULL;
+                       work = READ_ONCE(ctx->exit_task_work);
+               } while (cmpxchg(&ctx->exit_task_work, work, head) != work);
+               if (!work)
+                       break;
+               do {
+                       next = work->next;
+                       work->func(work);
+                       work = next;
+                       cond_resched();
+               } while (work);
+       } while (1);
+ }
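
The drain above uses the usual lock-free "steal the whole list" idiom; a minimal standalone sketch of that idiom, assuming entries are pushed task_work-style with a cmpxchg on the head pointer:

        /* Atomically detach the whole callback list; the caller then walks ->next
         * and invokes ->func() on each entry, as io_run_ctx_fallback() does above. */
        static struct callback_head *steal_cb_list_sketch(struct callback_head **headp)
        {
                struct callback_head *work;

                do {
                        work = READ_ONCE(*headp);
                } while (cmpxchg(headp, work, NULL) != work);

                return work;
        }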
  static void io_ring_exit_work(struct work_struct *work)
  {
        struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
         */
        do {
                io_uring_try_cancel_requests(ctx, NULL, NULL);
+               io_run_ctx_fallback(ctx);
        } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
        io_ring_ctx_free(ctx);
  }
  
- static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
- {
-       struct io_kiocb *req = container_of(work, struct io_kiocb, work);
-       return req->ctx == data;
- }
  static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
  {
        mutex_lock(&ctx->uring_lock);
        io_kill_timeouts(ctx, NULL, NULL);
        io_poll_remove_all(ctx, NULL, NULL);
  
-       if (ctx->io_wq)
-               io_wq_cancel_cb(ctx->io_wq, io_cancel_ctx_cb, ctx, true);
        /* if we failed setting up the ctx, we might not have any rings */
        io_iopoll_try_reap_events(ctx);
  
@@@ -8956,13 -8650,14 +8648,14 @@@ static void io_uring_try_cancel_request
                                         struct files_struct *files)
  {
        struct io_task_cancel cancel = { .task = task, .files = files, };
+       struct io_uring_task *tctx = current->io_uring;
  
        while (1) {
                enum io_wq_cancel cret;
                bool ret = false;
  
-               if (ctx->io_wq) {
-                       cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb,
+               if (tctx && tctx->io_wq) {
+                       cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
                                               &cancel, true);
                        ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
                }
@@@ -9045,12 -8740,15 +8738,15 @@@ static void io_uring_cancel_task_reques
                                          struct files_struct *files)
  {
        struct task_struct *task = current;
+       bool did_park = false;
  
        if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
                io_disable_sqo_submit(ctx);
-               task = ctx->sq_data->thread;
-               atomic_inc(&task->io_uring->in_idle);
-               io_sq_thread_park(ctx->sq_data);
+               did_park = io_sq_thread_park(ctx->sq_data);
+               if (did_park) {
+                       task = ctx->sq_data->thread;
+                       atomic_inc(&task->io_uring->in_idle);
+               }
        }
  
        io_cancel_defer_files(ctx, task, files);
        if (!files)
                io_uring_try_cancel_requests(ctx, task, NULL);
  
-       if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
+       if (did_park) {
                atomic_dec(&task->io_uring->in_idle);
                io_sq_thread_unpark(ctx->sq_data);
        }
@@@ -9074,7 -8772,7 +8770,7 @@@ static int io_uring_add_task_file(struc
        int ret;
  
        if (unlikely(!tctx)) {
-               ret = io_uring_alloc_task_context(current);
+               ret = io_uring_alloc_task_context(current, ctx);
                if (unlikely(ret))
                        return ret;
                tctx = current->io_uring;
@@@ -9144,8 -8842,13 +8840,13 @@@ void __io_uring_files_cancel(struct fil
                io_uring_cancel_task_requests(file->private_data, files);
        atomic_dec(&tctx->in_idle);
  
-       if (files)
+       if (files) {
                io_uring_remove_task_files(tctx);
+               if (tctx->io_wq) {
+                       io_wq_put(tctx->io_wq);
+                       tctx->io_wq = NULL;
+               }
+       }
  }
  
  static s64 tctx_inflight(struct io_uring_task *tctx)
  
  static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
  {
+       struct io_sq_data *sqd = ctx->sq_data;
        struct io_uring_task *tctx;
        s64 inflight;
        DEFINE_WAIT(wait);
  
-       if (!ctx->sq_data)
+       if (!sqd)
                return;
-       tctx = ctx->sq_data->thread->io_uring;
        io_disable_sqo_submit(ctx);
+       if (!io_sq_thread_park(sqd))
+               return;
+       tctx = ctx->sq_data->thread->io_uring;
  
        atomic_inc(&tctx->in_idle);
        do {
                finish_wait(&tctx->wait, &wait);
        } while (1);
        atomic_dec(&tctx->in_idle);
+       io_sq_thread_unpark(sqd);
  }
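
io_sq_thread_park() now reports whether the SQPOLL task could actually be parked, since with native workers it may already have exited; both cancellation paths above only touch sqd->thread and unpark on success. A compact sketch of the guarded pattern they share (the helper name is illustrative):

        static bool with_parked_sq_thread_sketch(struct io_sq_data *sqd,
                                                 void (*fn)(struct task_struct *))
        {
                if (!io_sq_thread_park(sqd))    /* thread may already be gone */
                        return false;
                fn(sqd->thread);                /* thread is held parked here */
                io_sq_thread_unpark(sqd);
                return true;
        }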
  
  /*
@@@ -9236,11 -8943,17 +8941,17 @@@ static int io_uring_flush(struct file *
        struct io_uring_task *tctx = current->io_uring;
        struct io_ring_ctx *ctx = file->private_data;
  
+       /* Ignore helper thread files exit */
+       if (current->flags & PF_IO_WORKER)
+               return 0;
        if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
                io_uring_cancel_task_requests(ctx, NULL);
                io_req_caches_free(ctx, current);
        }
  
+       io_run_ctx_fallback(ctx);
        if (!tctx)
                return 0;
  
@@@ -9439,6 -9152,12 +9150,12 @@@ SYSCALL_DEFINE6(io_uring_enter, unsigne
        if (ctx->flags & IORING_SETUP_SQPOLL) {
                io_cqring_overflow_flush(ctx, false, NULL, NULL);
  
+               if (unlikely(ctx->sqo_exec)) {
+                       ret = io_sq_thread_fork(ctx->sq_data, ctx);
+                       if (ret)
+                               goto out;
+                       ctx->sqo_exec = 0;
+               }
                ret = -EOWNERDEAD;
                if (unlikely(ctx->sqo_dead))
                        goto out;
@@@ -9495,8 -9214,7 +9212,7 @@@ out_fput
  #ifdef CONFIG_PROC_FS
  static int io_uring_show_cred(int id, void *p, void *data)
  {
-       struct io_identity *iod = p;
-       const struct cred *cred = iod->creds;
+       const struct cred *cred = p;
        struct seq_file *m = data;
        struct user_namespace *uns = seq_user_ns(m);
        struct group_info *gi;
@@@ -9541,8 -9259,11 +9257,11 @@@ static void __io_uring_show_fdinfo(stru
         */
        has_lock = mutex_trylock(&ctx->uring_lock);
  
-       if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL))
+       if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
                sq = ctx->sq_data;
+               if (!sq->thread)
+                       sq = NULL;
+       }
  
        seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
        seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
@@@ -9702,7 -9423,6 +9421,6 @@@ static struct file *io_uring_get_file(s
  static int io_uring_create(unsigned entries, struct io_uring_params *p,
                           struct io_uring_params __user *params)
  {
-       struct user_struct *user = NULL;
        struct io_ring_ctx *ctx;
        struct file *file;
        int ret;
                p->cq_entries = 2 * p->sq_entries;
        }
  
-       user = get_uid(current_user());
        ctx = io_ring_ctx_alloc(p);
-       if (!ctx) {
-               free_uid(user);
+       if (!ctx)
                return -ENOMEM;
-       }
        ctx->compat = in_compat_syscall();
-       ctx->limit_mem = !capable(CAP_IPC_LOCK);
-       ctx->user = user;
-       ctx->creds = get_current_cred();
- #ifdef CONFIG_AUDIT
-       ctx->loginuid = current->loginuid;
-       ctx->sessionid = current->sessionid;
- #endif
-       ctx->sqo_task = get_task_struct(current);
+       if (!capable(CAP_IPC_LOCK))
+               ctx->user = get_uid(current_user());
+       ctx->sqo_task = current;
  
        /*
         * This is just grabbed for accounting purposes. When a process exits,
        mmgrab(current->mm);
        ctx->mm_account = current->mm;
  
- #ifdef CONFIG_BLK_CGROUP
-       /*
-        * The sq thread will belong to the original cgroup it was inited in.
-        * If the cgroup goes offline (e.g. disabling the io controller), then
-        * issued bios will be associated with the closest cgroup later in the
-        * block layer.
-        */
-       rcu_read_lock();
-       ctx->sqo_blkcg_css = blkcg_css();
-       ret = css_tryget_online(ctx->sqo_blkcg_css);
-       rcu_read_unlock();
-       if (!ret) {
-               /* don't init against a dying cgroup, have the user try again */
-               ctx->sqo_blkcg_css = NULL;
-               ret = -ENODEV;
-               goto err;
-       }
- #endif
        ret = io_allocate_scq_urings(ctx, p);
        if (ret)
                goto err;
                        IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
                        IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
                        IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
-                       IORING_FEAT_EXT_ARG;
+                       IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS;
  
        if (copy_to_user(params, p, sizeof(*p))) {
                ret = -EFAULT;
  
  static int io_register_personality(struct io_ring_ctx *ctx)
  {
-       struct io_identity *id;
+       const struct cred *creds;
        int ret;
  
-       id = kmalloc(sizeof(*id), GFP_KERNEL);
-       if (unlikely(!id))
-               return -ENOMEM;
-       io_init_identity(id);
-       id->creds = get_current_cred();
+       creds = get_current_cred();
  
-       ret = idr_alloc_cyclic(&ctx->personality_idr, id, 1, USHRT_MAX, GFP_KERNEL);
-       if (ret < 0) {
-               put_cred(id->creds);
-               kfree(id);
-       }
+       ret = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
+                               USHRT_MAX, GFP_KERNEL);
+       if (ret < 0)
+               put_cred(creds);
        return ret;
  }
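
From userspace nothing changes except that the registered id now maps straight to a credential snapshot rather than a full io_identity; a hedged example of registering one (raw syscall, liburing not assumed), with the returned id later stored in sqe->personality:

        #include <linux/io_uring.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        /* Returns a personality id (> 0) usable in sqe->personality, or -1 on error. */
        static int register_personality(int ring_fd)
        {
                return syscall(__NR_io_uring_register, ring_fd,
                               IORING_REGISTER_PERSONALITY, NULL, 0);
        }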
  
@@@ -10082,10 -9769,8 +9767,10 @@@ static int __io_uring_register(struct i
  
                mutex_lock(&ctx->uring_lock);
  
 -              if (ret && io_refs_resurrect(&ctx->refs, &ctx->ref_comp))
 -                      return ret;
 +              if (ret) {
 +                      percpu_ref_resurrect(&ctx->refs);
 +                      goto out_quiesce;
 +              }
        }
  
        if (ctx->restricted) {
@@@ -10177,7 -9862,6 +9862,7 @@@ out
        if (io_register_op_must_quiesce(opcode)) {
                /* bring the ctx back to life */
                percpu_ref_reinit(&ctx->refs);
 +out_quiesce:
                reinit_completion(&ctx->ref_comp);
        }
        return ret;
diff --combined net/socket.c
index 23c7842389de38d58117a322bf4812ef22b943c2,90a60899aae5b64a3a19597d7585b8b84aaba5ef..84a8049c2b09960856c47a837941d054e0e1d7ea
@@@ -334,7 -334,6 +334,7 @@@ static const struct xattr_handler sockf
  };
  
  static int sockfs_security_xattr_set(const struct xattr_handler *handler,
 +                                   struct user_namespace *mnt_userns,
                                     struct dentry *dentry, struct inode *inode,
                                     const char *suffix, const void *value,
                                     size_t size, int flags)
@@@ -538,10 -537,9 +538,10 @@@ static ssize_t sockfs_listxattr(struct 
        return used;
  }
  
 -static int sockfs_setattr(struct dentry *dentry, struct iattr *iattr)
 +static int sockfs_setattr(struct user_namespace *mnt_userns,
 +                        struct dentry *dentry, struct iattr *iattr)
  {
 -      int err = simple_setattr(dentry, iattr);
 +      int err = simple_setattr(&init_user_ns, dentry, iattr);
  
        if (!err && (iattr->ia_valid & ATTR_UID)) {
                struct socket *sock = SOCKET_I(d_inode(dentry));
@@@ -2413,10 -2411,6 +2413,6 @@@ static int ___sys_sendmsg(struct socke
  long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg,
                        unsigned int flags)
  {
-       /* disallow ancillary data requests from this path */
-       if (msg->msg_control || msg->msg_controllen)
-               return -EINVAL;
        return ____sys_sendmsg(sock, msg, flags, NULL, 0);
  }
  
@@@ -2625,12 -2619,6 +2621,6 @@@ long __sys_recvmsg_sock(struct socket *
                        struct user_msghdr __user *umsg,
                        struct sockaddr __user *uaddr, unsigned int flags)
  {
-       if (msg->msg_control || msg->msg_controllen) {
-               /* disallow ancillary data reqs unless cmsg is plain data */
-               if (!(sock->ops->flags & PROTO_CMSG_DATA_ONLY))
-                       return -EINVAL;
-       }
        return ____sys_recvmsg(sock, msg, umsg, uaddr, flags, 0);
  }