Merge tag 'v6.4/kernel.user_worker' of git://git.kernel.org/pub/scm/linux/kernel...
authorLinus Torvalds <torvalds@linux-foundation.org>
Mon, 24 Apr 2023 19:52:35 +0000 (12:52 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Mon, 24 Apr 2023 19:52:35 +0000 (12:52 -0700)
Pull user work thread updates from Christian Brauner:
 "This contains the work generalizing the ability to create a kernel
  worker from a userspace process.

  Such user workers will run with the same credentials as the userspace
  process they were created from providing stronger security and
  accounting guarantees than the traditional override_creds() approach
  ever could've hoped for.

  The original work was heavily based on and optimized for the needs of
  io_uring which was the first user. However, as it quickly turned out
  the ability to create user workers inheriting properties from a
  userspace process is generally useful.

  The vhost subsystem currently creates workers using the kthread api.
  The consequences of using the kthread api are that RLIMITs don't work
  correctly as they are inherited from kthreadd. This leads to bugs
  where more workers are created than would be allowed by the RLIMITs of
  the userspace process in lieu of which workers are created.

  Problems like this disappear with user workers created from the
  userspace processes for which they perform the work. In addition,
  providing this api allows vhost to remove additional complexity. For
  example, cgroup and mm sharing will just work out of the box with user
  workers based on the relevant userspace process instead of manually
  ensuring the correct cgroup and mm contexts are used.

  So the vhost subsystem should simply be made to use the same mechanism
  as io_uring. To this end the original mechanism used for
  create_io_thread() is generalized into user workers:

   - Introduce PF_USER_WORKER as a generic indicator that a given task
     is a user worker, i.e., a kernel task that was created from a
     userspace process. Now a PF_IO_WORKER thread is just a specialized
     version of PF_USER_WORKER. So io_uring io workers raise both flags.

   - Make copy_process() available to core kernel code

   - Extend struct kernel_clone_args with the following bitfields
     allowing to indicate to copy_process():
       - to create a user worker (raise PF_USER_WORKER)
       - to not inherit any files from the userspace process
       - to ignore signals

  After all generic changes are in place the vhost subsystem implements
  a new dedicated vhost api based on user workers. Finally, vhost is
  switched to rely on the new api moving it off of kthreads.

  Thanks to Mike for sticking it out and making it through this rather
  arduous journey"

* tag 'v6.4/kernel.user_worker' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux:
  vhost: use vhost_tasks for worker threads
  vhost: move worker thread fields to new struct
  vhost_task: Allow vhost layer to use copy_process
  fork: allow kernel code to call copy_process
  fork: Add kernel_clone_args flag to ignore signals
  fork: add kernel_clone_args flag to not dup/clone files
  fork/vm: Move common PF_IO_WORKER behavior to new flag
  kernel: Make io_thread and kthread bit fields
  kthread: Pass in the thread's name during creation
  kernel: Allow a kernel thread's name to be set in copy_process
  csky: Remove kernel_thread declaration

14 files changed:
MAINTAINERS
arch/csky/include/asm/processor.h
drivers/vhost/Kconfig
drivers/vhost/vhost.c
drivers/vhost/vhost.h
include/linux/sched.h
include/linux/sched/task.h
include/linux/sched/vhost_task.h [new file with mode: 0644]
init/main.c
kernel/Makefile
kernel/fork.c
kernel/kthread.c
kernel/vhost_task.c [new file with mode: 0644]
mm/vmscan.c

index 4ae3bb24af1fb85f850fb1f0ebc8940ea02d0b84..14fadebf42fdd5f53d5f4e75deb1dd28ae45e340 100644 (file)
@@ -22177,7 +22177,9 @@ L:      virtualization@lists.linux-foundation.org
 L:     netdev@vger.kernel.org
 S:     Maintained
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git
+F:     kernel/vhost_task.c
 F:     drivers/vhost/
+F:     include/linux/sched/vhost_task.h
 F:     include/linux/vhost_iotlb.h
 F:     include/uapi/linux/vhost.h
 
index ea75d72dea86966232dd4b04ec4fa4ea419ef784..e487a46d1c37b47f1c9e46b4b341bff60a0f0152 100644 (file)
@@ -72,8 +72,6 @@ struct task_struct;
 /* Prepare to copy thread state - unlazy all lazy status */
 #define prepare_to_copy(tsk)    do { } while (0)
 
-extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
-
 unsigned long __get_wchan(struct task_struct *p);
 
 #define KSTK_EIP(tsk)          (task_pt_regs(tsk)->pc)
index 587fbae0618213ea5399a2cf0bf7f55cd8ed558b..b455d9ab6f3d9c989ca10f37efd1fbebbcae665b 100644 (file)
@@ -13,9 +13,14 @@ config VHOST_RING
          This option is selected by any driver which needs to access
          the host side of a virtio ring.
 
+config VHOST_TASK
+       bool
+       default n
+
 config VHOST
        tristate
        select VHOST_IOTLB
+       select VHOST_TASK
        help
          This option is selected by any driver which needs to access
          the core of vhost.
index b0dcf1f390504103b083654b5825cd96872414bd..6d07b42833be4093b7c6a6f4b738e1b4ef800db9 100644 (file)
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/kthread.h>
-#include <linux/cgroup.h>
 #include <linux/module.h>
 #include <linux/sort.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/vhost_task.h>
 #include <linux/interval_tree_generic.h>
 #include <linux/nospec.h>
 #include <linux/kcov.h>
@@ -255,8 +255,8 @@ void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
                 * sure it was not in the list.
                 * test_and_set_bit() implies a memory barrier.
                 */
-               llist_add(&work->node, &dev->work_list);
-               wake_up_process(dev->worker);
+               llist_add(&work->node, &dev->worker->work_list);
+               wake_up_process(dev->worker->vtsk->task);
        }
 }
 EXPORT_SYMBOL_GPL(vhost_work_queue);
@@ -264,7 +264,7 @@ EXPORT_SYMBOL_GPL(vhost_work_queue);
 /* A lockless hint for busy polling code to exit the loop */
 bool vhost_has_work(struct vhost_dev *dev)
 {
-       return !llist_empty(&dev->work_list);
+       return dev->worker && !llist_empty(&dev->worker->work_list);
 }
 EXPORT_SYMBOL_GPL(vhost_has_work);
 
@@ -335,22 +335,20 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 
 static int vhost_worker(void *data)
 {
-       struct vhost_dev *dev = data;
+       struct vhost_worker *worker = data;
        struct vhost_work *work, *work_next;
        struct llist_node *node;
 
-       kthread_use_mm(dev->mm);
-
        for (;;) {
                /* mb paired w/ kthread_stop */
                set_current_state(TASK_INTERRUPTIBLE);
 
-               if (kthread_should_stop()) {
+               if (vhost_task_should_stop(worker->vtsk)) {
                        __set_current_state(TASK_RUNNING);
                        break;
                }
 
-               node = llist_del_all(&dev->work_list);
+               node = llist_del_all(&worker->work_list);
                if (!node)
                        schedule();
 
@@ -360,14 +358,14 @@ static int vhost_worker(void *data)
                llist_for_each_entry_safe(work, work_next, node, node) {
                        clear_bit(VHOST_WORK_QUEUED, &work->flags);
                        __set_current_state(TASK_RUNNING);
-                       kcov_remote_start_common(dev->kcov_handle);
+                       kcov_remote_start_common(worker->kcov_handle);
                        work->fn(work);
                        kcov_remote_stop();
                        if (need_resched())
                                schedule();
                }
        }
-       kthread_unuse_mm(dev->mm);
+
        return 0;
 }
 
@@ -479,7 +477,6 @@ void vhost_dev_init(struct vhost_dev *dev,
        dev->byte_weight = byte_weight;
        dev->use_worker = use_worker;
        dev->msg_handler = msg_handler;
-       init_llist_head(&dev->work_list);
        init_waitqueue_head(&dev->wait);
        INIT_LIST_HEAD(&dev->read_list);
        INIT_LIST_HEAD(&dev->pending_list);
@@ -509,31 +506,6 @@ long vhost_dev_check_owner(struct vhost_dev *dev)
 }
 EXPORT_SYMBOL_GPL(vhost_dev_check_owner);
 
-struct vhost_attach_cgroups_struct {
-       struct vhost_work work;
-       struct task_struct *owner;
-       int ret;
-};
-
-static void vhost_attach_cgroups_work(struct vhost_work *work)
-{
-       struct vhost_attach_cgroups_struct *s;
-
-       s = container_of(work, struct vhost_attach_cgroups_struct, work);
-       s->ret = cgroup_attach_task_all(s->owner, current);
-}
-
-static int vhost_attach_cgroups(struct vhost_dev *dev)
-{
-       struct vhost_attach_cgroups_struct attach;
-
-       attach.owner = current;
-       vhost_work_init(&attach.work, vhost_attach_cgroups_work);
-       vhost_work_queue(dev, &attach.work);
-       vhost_dev_flush(dev);
-       return attach.ret;
-}
-
 /* Caller should have device mutex */
 bool vhost_dev_has_owner(struct vhost_dev *dev)
 {
@@ -571,10 +543,54 @@ static void vhost_detach_mm(struct vhost_dev *dev)
        dev->mm = NULL;
 }
 
+static void vhost_worker_free(struct vhost_dev *dev)
+{
+       struct vhost_worker *worker = dev->worker;
+
+       if (!worker)
+               return;
+
+       dev->worker = NULL;
+       WARN_ON(!llist_empty(&worker->work_list));
+       vhost_task_stop(worker->vtsk);
+       kfree(worker);
+}
+
+static int vhost_worker_create(struct vhost_dev *dev)
+{
+       struct vhost_worker *worker;
+       struct vhost_task *vtsk;
+       char name[TASK_COMM_LEN];
+       int ret;
+
+       worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
+       if (!worker)
+               return -ENOMEM;
+
+       dev->worker = worker;
+       worker->kcov_handle = kcov_common_handle();
+       init_llist_head(&worker->work_list);
+       snprintf(name, sizeof(name), "vhost-%d", current->pid);
+
+       vtsk = vhost_task_create(vhost_worker, worker, name);
+       if (!vtsk) {
+               ret = -ENOMEM;
+               goto free_worker;
+       }
+
+       worker->vtsk = vtsk;
+       vhost_task_start(vtsk);
+       return 0;
+
+free_worker:
+       kfree(worker);
+       dev->worker = NULL;
+       return ret;
+}
+
 /* Caller should have device mutex */
 long vhost_dev_set_owner(struct vhost_dev *dev)
 {
-       struct task_struct *worker;
        int err;
 
        /* Is there an owner already? */
@@ -585,36 +601,21 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
 
        vhost_attach_mm(dev);
 
-       dev->kcov_handle = kcov_common_handle();
        if (dev->use_worker) {
-               worker = kthread_create(vhost_worker, dev,
-                                       "vhost-%d", current->pid);
-               if (IS_ERR(worker)) {
-                       err = PTR_ERR(worker);
-                       goto err_worker;
-               }
-
-               dev->worker = worker;
-               wake_up_process(worker); /* avoid contributing to loadavg */
-
-               err = vhost_attach_cgroups(dev);
+               err = vhost_worker_create(dev);
                if (err)
-                       goto err_cgroup;
+                       goto err_worker;
        }
 
        err = vhost_dev_alloc_iovecs(dev);
        if (err)
-               goto err_cgroup;
+               goto err_iovecs;
 
        return 0;
-err_cgroup:
-       if (dev->worker) {
-               kthread_stop(dev->worker);
-               dev->worker = NULL;
-       }
+err_iovecs:
+       vhost_worker_free(dev);
 err_worker:
        vhost_detach_mm(dev);
-       dev->kcov_handle = 0;
 err_mm:
        return err;
 }
@@ -705,12 +706,7 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
        dev->iotlb = NULL;
        vhost_clear_msg(dev);
        wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
-       WARN_ON(!llist_empty(&dev->work_list));
-       if (dev->worker) {
-               kthread_stop(dev->worker);
-               dev->worker = NULL;
-               dev->kcov_handle = 0;
-       }
+       vhost_worker_free(dev);
        vhost_detach_mm(dev);
 }
 EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
index 1647b750169c79363b8e9ea3ac9c667d0af6af69..0308638cdeeebef46ca0ad98994eb8c6edd803f4 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/irqbypass.h>
 
 struct vhost_work;
+struct vhost_task;
 typedef void (*vhost_work_fn_t)(struct vhost_work *work);
 
 #define VHOST_WORK_QUEUED 1
@@ -25,6 +26,12 @@ struct vhost_work {
        unsigned long           flags;
 };
 
+struct vhost_worker {
+       struct vhost_task       *vtsk;
+       struct llist_head       work_list;
+       u64                     kcov_handle;
+};
+
 /* Poll a file (eventfd or socket) */
 /* Note: there's nothing vhost specific about this structure. */
 struct vhost_poll {
@@ -147,8 +154,7 @@ struct vhost_dev {
        struct vhost_virtqueue **vqs;
        int nvqs;
        struct eventfd_ctx *log_ctx;
-       struct llist_head work_list;
-       struct task_struct *worker;
+       struct vhost_worker *worker;
        struct vhost_iotlb *umem;
        struct vhost_iotlb *iotlb;
        spinlock_t iotlb_lock;
@@ -158,7 +164,6 @@ struct vhost_dev {
        int iov_limit;
        int weight;
        int byte_weight;
-       u64 kcov_handle;
        bool use_worker;
        int (*msg_handler)(struct vhost_dev *dev, u32 asid,
                           struct vhost_iotlb_msg *msg);
index 63d242164b1a91af1fcb89c3cf6e146b32cda391..e1e605b1255b8c2ed1a78bdc517b8c51f1a344c2 100644 (file)
@@ -1729,7 +1729,7 @@ extern struct pid *cad_pid;
 #define PF_MEMALLOC            0x00000800      /* Allocating memory */
 #define PF_NPROC_EXCEEDED      0x00001000      /* set_user() noticed that RLIMIT_NPROC was exceeded */
 #define PF_USED_MATH           0x00002000      /* If unset the fpu must be initialized before use */
-#define PF__HOLE__00004000     0x00004000
+#define PF_USER_WORKER         0x00004000      /* Kernel thread cloned from userspace thread */
 #define PF_NOFREEZE            0x00008000      /* This thread should not be frozen */
 #define PF__HOLE__00010000     0x00010000
 #define PF_KSWAPD              0x00020000      /* I am kswapd */
index 357e0068497c162b097a18099922ae23a20c7015..537cbf9a2adeab8ef6bc372474ab63afe292d74d 100644 (file)
@@ -23,7 +23,13 @@ struct kernel_clone_args {
        int __user *pidfd;
        int __user *child_tid;
        int __user *parent_tid;
+       const char *name;
        int exit_signal;
+       u32 kthread:1;
+       u32 io_thread:1;
+       u32 user_worker:1;
+       u32 no_files:1;
+       u32 ignore_signals:1;
        unsigned long stack;
        unsigned long stack_size;
        unsigned long tls;
@@ -31,8 +37,6 @@ struct kernel_clone_args {
        /* Number of elements in *set_tid */
        size_t set_tid_size;
        int cgroup;
-       int io_thread;
-       int kthread;
        int idle;
        int (*fn)(void *);
        void *fn_arg;
@@ -89,9 +93,12 @@ extern void exit_files(struct task_struct *);
 extern void exit_itimers(struct task_struct *);
 
 extern pid_t kernel_clone(struct kernel_clone_args *kargs);
+struct task_struct *copy_process(struct pid *pid, int trace, int node,
+                                struct kernel_clone_args *args);
 struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node);
 struct task_struct *fork_idle(int);
-extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
+extern pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
+                           unsigned long flags);
 extern pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags);
 extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);
 int kernel_wait(pid_t pid, int *stat);
diff --git a/include/linux/sched/vhost_task.h b/include/linux/sched/vhost_task.h
new file mode 100644 (file)
index 0000000..6123c10
--- /dev/null
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_VHOST_TASK_H
+#define _LINUX_VHOST_TASK_H
+
+#include <linux/completion.h>
+
+struct task_struct;
+
+struct vhost_task {
+       int (*fn)(void *data);
+       void *data;
+       struct completion exited;
+       unsigned long flags;
+       struct task_struct *task;
+};
+
+struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg,
+                                    const char *name);
+void vhost_task_start(struct vhost_task *vtsk);
+void vhost_task_stop(struct vhost_task *vtsk);
+bool vhost_task_should_stop(struct vhost_task *vtsk);
+
+#endif
index bb87b789c54396920870b99167735fb4df8ac0cf..123f50eb362b43806eb4e71297b76ff4e2e82f43 100644 (file)
@@ -711,7 +711,7 @@ noinline void __ref rest_init(void)
        rcu_read_unlock();
 
        numa_default_policy();
-       pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
+       pid = kernel_thread(kthreadd, NULL, NULL, CLONE_FS | CLONE_FILES);
        rcu_read_lock();
        kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
        rcu_read_unlock();
index 10ef068f598d5db37a00f951f057c90b61e04326..6fc72b3afbde10c420317764fd12e0ec77446711 100644 (file)
@@ -15,6 +15,7 @@ obj-y     = fork.o exec_domain.o panic.o \
 obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o
 obj-$(CONFIG_MODULES) += kmod.o
 obj-$(CONFIG_MULTIUSER) += groups.o
+obj-$(CONFIG_VHOST_TASK) += vhost_task.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace internal ftrace files
index ea332319dffea33f7501517b2d00ff150614bea9..d6cd5849eb51c5410a18a40a8ea89bcd7a195363 100644 (file)
@@ -1626,7 +1626,8 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
        return 0;
 }
 
-static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
+                     int no_files)
 {
        struct files_struct *oldf, *newf;
        int error = 0;
@@ -1638,6 +1639,11 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
        if (!oldf)
                goto out;
 
+       if (no_files) {
+               tsk->files = NULL;
+               goto out;
+       }
+
        if (clone_flags & CLONE_FILES) {
                atomic_inc(&oldf->count);
                goto out;
@@ -2009,7 +2015,7 @@ static void rv_task_fork(struct task_struct *p)
  * parts of the process environment (as per the clone
  * flags). The actual kick-off is left to the caller.
  */
-static __latent_entropy struct task_struct *copy_process(
+__latent_entropy struct task_struct *copy_process(
                                        struct pid *pid,
                                        int trace,
                                        int node,
@@ -2102,6 +2108,8 @@ static __latent_entropy struct task_struct *copy_process(
        p->flags &= ~PF_KTHREAD;
        if (args->kthread)
                p->flags |= PF_KTHREAD;
+       if (args->user_worker)
+               p->flags |= PF_USER_WORKER;
        if (args->io_thread) {
                /*
                 * Mark us an IO worker, and block any signal that isn't
@@ -2111,6 +2119,9 @@ static __latent_entropy struct task_struct *copy_process(
                siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
        }
 
+       if (args->name)
+               strscpy_pad(p->comm, args->name, sizeof(p->comm));
+
        p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
        /*
         * Clear TID on mm_release()?
@@ -2253,7 +2264,7 @@ static __latent_entropy struct task_struct *copy_process(
        retval = copy_semundo(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_security;
-       retval = copy_files(clone_flags, p);
+       retval = copy_files(clone_flags, p, args->no_files);
        if (retval)
                goto bad_fork_cleanup_semundo;
        retval = copy_fs(clone_flags, p);
@@ -2278,6 +2289,9 @@ static __latent_entropy struct task_struct *copy_process(
        if (retval)
                goto bad_fork_cleanup_io;
 
+       if (args->ignore_signals)
+               ignore_signals(p);
+
        stackleak_task_init(p);
 
        if (pid != &init_struct_pid) {
@@ -2626,6 +2640,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
                .fn             = fn,
                .fn_arg         = arg,
                .io_thread      = 1,
+               .user_worker    = 1,
        };
 
        return copy_process(NULL, 0, node, &args);
@@ -2729,7 +2744,8 @@ pid_t kernel_clone(struct kernel_clone_args *args)
 /*
  * Create a kernel thread.
  */
-pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
+                   unsigned long flags)
 {
        struct kernel_clone_args args = {
                .flags          = ((lower_32_bits(flags) | CLONE_VM |
@@ -2737,6 +2753,7 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
                .exit_signal    = (lower_32_bits(flags) & CSIGNAL),
                .fn             = fn,
                .fn_arg         = arg,
+               .name           = name,
                .kthread        = 1,
        };
 
index 7e6751b29101e5ca4050aa512b922ab05ddfc5e2..4bc6e0971ec93ce9322f5743b3d7c8b43a76b9b0 100644 (file)
@@ -38,6 +38,7 @@ struct task_struct *kthreadd_task;
 struct kthread_create_info
 {
        /* Information passed to kthread() from kthreadd. */
+       char *full_name;
        int (*threadfn)(void *data);
        void *data;
        int node;
@@ -343,10 +344,12 @@ static int kthread(void *_create)
        /* Release the structure when caller killed by a fatal signal. */
        done = xchg(&create->done, NULL);
        if (!done) {
+               kfree(create->full_name);
                kfree(create);
                kthread_exit(-EINTR);
        }
 
+       self->full_name = create->full_name;
        self->threadfn = threadfn;
        self->data = data;
 
@@ -396,11 +399,13 @@ static void create_kthread(struct kthread_create_info *create)
        current->pref_node_fork = create->node;
 #endif
        /* We want our own signal handler (we take no signals by default). */
-       pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
+       pid = kernel_thread(kthread, create, create->full_name,
+                           CLONE_FS | CLONE_FILES | SIGCHLD);
        if (pid < 0) {
                /* Release the structure when caller killed by a fatal signal. */
                struct completion *done = xchg(&create->done, NULL);
 
+               kfree(create->full_name);
                if (!done) {
                        kfree(create);
                        return;
@@ -427,6 +432,11 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
        create->data = data;
        create->node = node;
        create->done = &done;
+       create->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
+       if (!create->full_name) {
+               task = ERR_PTR(-ENOMEM);
+               goto free_create;
+       }
 
        spin_lock(&kthread_create_lock);
        list_add_tail(&create->list, &kthread_create_list);
@@ -453,26 +463,7 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
                wait_for_completion(&done);
        }
        task = create->result;
-       if (!IS_ERR(task)) {
-               char name[TASK_COMM_LEN];
-               va_list aq;
-               int len;
-
-               /*
-                * task is already visible to other tasks, so updating
-                * COMM must be protected.
-                */
-               va_copy(aq, args);
-               len = vsnprintf(name, sizeof(name), namefmt, aq);
-               va_end(aq);
-               if (len >= TASK_COMM_LEN) {
-                       struct kthread *kthread = to_kthread(task);
-
-                       /* leave it truncated when out of memory. */
-                       kthread->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
-               }
-               set_task_comm(task, name);
-       }
+free_create:
        kfree(create);
        return task;
 }
diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
new file mode 100644 (file)
index 0000000..b7cbd66
--- /dev/null
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021 Oracle Corporation
+ */
+#include <linux/slab.h>
+#include <linux/completion.h>
+#include <linux/sched/task.h>
+#include <linux/sched/vhost_task.h>
+#include <linux/sched/signal.h>
+
+enum vhost_task_flags {
+       VHOST_TASK_FLAGS_STOP,
+};
+
+static int vhost_task_fn(void *data)
+{
+       struct vhost_task *vtsk = data;
+       int ret;
+
+       ret = vtsk->fn(vtsk->data);
+       complete(&vtsk->exited);
+       do_exit(ret);
+}
+
+/**
+ * vhost_task_stop - stop a vhost_task
+ * @vtsk: vhost_task to stop
+ *
+ * Callers must call vhost_task_should_stop and return from their worker
+ * function when it returns true;
+ */
+void vhost_task_stop(struct vhost_task *vtsk)
+{
+       pid_t pid = vtsk->task->pid;
+
+       set_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags);
+       wake_up_process(vtsk->task);
+       /*
+        * Make sure vhost_task_fn is no longer accessing the vhost_task before
+        * freeing it below. If userspace crashed or exited without closing,
+        * then the vhost_task->task could already be marked dead so
+        * kernel_wait will return early.
+        */
+       wait_for_completion(&vtsk->exited);
+       /*
+        * If we are just closing/removing a device and the parent process is
+        * not exiting then reap the task.
+        */
+       kernel_wait4(pid, NULL, __WCLONE, NULL);
+       kfree(vtsk);
+}
+EXPORT_SYMBOL_GPL(vhost_task_stop);
+
+/**
+ * vhost_task_should_stop - should the vhost task return from the work function
+ * @vtsk: vhost_task to stop
+ */
+bool vhost_task_should_stop(struct vhost_task *vtsk)
+{
+       return test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags);
+}
+EXPORT_SYMBOL_GPL(vhost_task_should_stop);
+
+/**
+ * vhost_task_create - create a copy of a process to be used by the kernel
+ * @fn: thread stack
+ * @arg: data to be passed to fn
+ * @name: the thread's name
+ *
+ * This returns a specialized task for use by the vhost layer or NULL on
+ * failure. The returned task is inactive, and the caller must fire it up
+ * through vhost_task_start().
+ */
+struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg,
+                                    const char *name)
+{
+       struct kernel_clone_args args = {
+               .flags          = CLONE_FS | CLONE_UNTRACED | CLONE_VM,
+               .exit_signal    = 0,
+               .fn             = vhost_task_fn,
+               .name           = name,
+               .user_worker    = 1,
+               .no_files       = 1,
+               .ignore_signals = 1,
+       };
+       struct vhost_task *vtsk;
+       struct task_struct *tsk;
+
+       vtsk = kzalloc(sizeof(*vtsk), GFP_KERNEL);
+       if (!vtsk)
+               return NULL;
+       init_completion(&vtsk->exited);
+       vtsk->data = arg;
+       vtsk->fn = fn;
+
+       args.fn_arg = vtsk;
+
+       tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args);
+       if (IS_ERR(tsk)) {
+               kfree(vtsk);
+               return NULL;
+       }
+
+       vtsk->task = tsk;
+       return vtsk;
+}
+EXPORT_SYMBOL_GPL(vhost_task_create);
+
+/**
+ * vhost_task_start - start a vhost_task created with vhost_task_create
+ * @vtsk: vhost_task to wake up
+ */
+void vhost_task_start(struct vhost_task *vtsk)
+{
+       wake_up_new_task(vtsk->task);
+}
+EXPORT_SYMBOL_GPL(vhost_task_start);
index 9c1c5e8b24b8f56fe05d8d3883ce17ffaa2eb7f3..7ba6bfdd9a5f7bf3ef357947bb98c25b87da2c07 100644 (file)
@@ -1151,12 +1151,12 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
        DEFINE_WAIT(wait);
 
        /*
-        * Do not throttle IO workers, kthreads other than kswapd or
+        * Do not throttle user workers, kthreads other than kswapd or
         * workqueues. They may be required for reclaim to make
         * forward progress (e.g. journalling workqueues or kthreads).
         */
        if (!current_is_kswapd() &&
-           current->flags & (PF_IO_WORKER|PF_KTHREAD)) {
+           current->flags & (PF_USER_WORKER|PF_KTHREAD)) {
                cond_resched();
                return;
        }