Merge tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel...
[linux-block.git] / kernel/fork.c
index bfe73db1c26c21c0ea8fe775fe90a3d3c475bb36..4342200d5e2b1982cc73d359ee1ae4754538734a 100644
@@ -451,13 +451,49 @@ static struct kmem_cache *vm_area_cachep;
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
+#ifdef CONFIG_PER_VMA_LOCK
+
+/* SLAB cache for vm_area_struct.lock */
+static struct kmem_cache *vma_lock_cachep;
+
+static bool vma_lock_alloc(struct vm_area_struct *vma)
+{
+       vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
+       if (!vma->vm_lock)
+               return false;
+
+       init_rwsem(&vma->vm_lock->lock);
+       vma->vm_lock_seq = -1;
+
+       return true;
+}
+
+static inline void vma_lock_free(struct vm_area_struct *vma)
+{
+       kmem_cache_free(vma_lock_cachep, vma->vm_lock);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
+static inline void vma_lock_free(struct vm_area_struct *vma) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
 struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
 {
        struct vm_area_struct *vma;
 
        vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
-       if (vma)
-               vma_init(vma, mm);
+       if (!vma)
+               return NULL;
+
+       vma_init(vma, mm);
+       if (!vma_lock_alloc(vma)) {
+               kmem_cache_free(vm_area_cachep, vma);
+               return NULL;
+       }
+
        return vma;
 }
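/*
 * Editorial sketch, not part of this hunk: how the lock allocated by
 * vma_lock_alloc() above is meant to be taken on the read side. This
 * paraphrases the companion include/linux/mm.h changes in the per-VMA lock
 * series; the "_sketch" name and exact details are illustrative only.
 */
static inline bool vma_start_read_sketch(struct vm_area_struct *vma)
{
	/*
	 * A VMA whose vm_lock_seq matches its mm's mm_lock_seq is considered
	 * write-locked by the mmap_lock writer; fall back to mmap_lock then.
	 */
	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
		return false;

	if (!down_read_trylock(&vma->vm_lock->lock))
		return false;

	/* Re-check: a writer may have marked the VMA after the first test. */
	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq)) {
		up_read(&vma->vm_lock->lock);
		return false;
	}
	return true;
}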
 
@@ -465,26 +501,56 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 {
        struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 
-       if (new) {
-               ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
-               ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
-               /*
-                * orig->shared.rb may be modified concurrently, but the clone
-                * will be reinitialized.
-                */
-               data_race(memcpy(new, orig, sizeof(*new)));
-               INIT_LIST_HEAD(&new->anon_vma_chain);
-               dup_anon_vma_name(orig, new);
+       if (!new)
+               return NULL;
+
+       ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+       ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+       /*
+        * orig->shared.rb may be modified concurrently, but the clone
+        * will be reinitialized.
+        */
+       data_race(memcpy(new, orig, sizeof(*new)));
+       if (!vma_lock_alloc(new)) {
+               kmem_cache_free(vm_area_cachep, new);
+               return NULL;
        }
+       INIT_LIST_HEAD(&new->anon_vma_chain);
+       vma_numab_state_init(new);
+       dup_anon_vma_name(orig, new);
+
        return new;
 }
 
-void vm_area_free(struct vm_area_struct *vma)
+void __vm_area_free(struct vm_area_struct *vma)
 {
+       vma_numab_state_free(vma);
        free_anon_vma_name(vma);
+       vma_lock_free(vma);
        kmem_cache_free(vm_area_cachep, vma);
 }
 
+#ifdef CONFIG_PER_VMA_LOCK
+static void vm_area_free_rcu_cb(struct rcu_head *head)
+{
+       struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
+                                                 vm_rcu);
+
+       /* The vma should not be locked while being destroyed. */
+       VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
+       __vm_area_free(vma);
+}
+#endif
+
+void vm_area_free(struct vm_area_struct *vma)
+{
+#ifdef CONFIG_PER_VMA_LOCK
+       call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb);
+#else
+       __vm_area_free(vma);
+#endif
+}
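/*
 * Editorial sketch, not part of this hunk: why vm_area_free() must defer the
 * actual free through call_rcu() when CONFIG_PER_VMA_LOCK is enabled. A
 * page-fault handler may look the VMA up with no mmap_lock held, loosely
 * modeled on lock_vma_under_rcu() from the same series (simplified; the real
 * helper also re-validates the address range and a "detached" flag).
 */
static struct vm_area_struct *find_and_lock_vma_sketch(struct mm_struct *mm,
							unsigned long addr)
{
	MA_STATE(mas, &mm->mm_mt, addr, addr);
	struct vm_area_struct *vma;

	rcu_read_lock();
	vma = mas_walk(&mas);			/* may race with an ongoing munmap() */
	if (vma && !vma_start_read_sketch(vma))	/* see the earlier sketch */
		vma = NULL;
	rcu_read_unlock();

	/*
	 * Dereferencing the entry inside the RCU section is safe only because
	 * the final kmem_cache_free() is deferred past a grace period by the
	 * call_rcu() above. On success the caller holds the per-VMA read lock
	 * and drops it with up_read(&vma->vm_lock->lock).
	 */
	return vma;
}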
+
 static void account_kernel_stack(struct task_struct *tsk, int account)
 {
        if (IS_ENABLED(CONFIG_VMAP_STACK)) {
@@ -775,6 +841,67 @@ static void check_mm(struct mm_struct *mm)
 #define allocate_mm()  (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
 #define free_mm(mm)    (kmem_cache_free(mm_cachep, (mm)))
 
+static void do_check_lazy_tlb(void *arg)
+{
+       struct mm_struct *mm = arg;
+
+       WARN_ON_ONCE(current->active_mm == mm);
+}
+
+static void do_shoot_lazy_tlb(void *arg)
+{
+       struct mm_struct *mm = arg;
+
+       if (current->active_mm == mm) {
+               WARN_ON_ONCE(current->mm);
+               current->active_mm = &init_mm;
+               switch_mm(mm, &init_mm, current);
+       }
+}
+
+static void cleanup_lazy_tlbs(struct mm_struct *mm)
+{
+       if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+               /*
+                * In this case, lazy tlb mms are refcounted and would not reach
+                * __mmdrop until all CPUs have switched away and mmdrop()ed.
+                */
+               return;
+       }
+
+       /*
+        * Lazy mm shootdown does not refcount "lazy tlb mm" usage; rather, it
+        * requires lazy mm users to switch to another mm when the refcount
+        * drops to zero, before the mm is freed. This requires IPIs here to
+        * switch kernel threads to init_mm.
+        *
+        * archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm
+        * switch with the final userspace teardown TLB flush which leaves the
+        * mm lazy on this CPU but no others, reducing the need for additional
+        * IPIs here. There are cases where a final IPI is still required here,
+        * such as the final mmdrop being performed on a different CPU than the
+        * one exiting, or kernel threads using the mm when userspace exits.
+        *
+        * IPI overheads have not been found to be expensive, but they could
+        * be reduced in a number of possible ways, for example (roughly
+        * increasing order of complexity):
+        * - The last lazy reference created by exit_mm() could instead switch
+        *   to init_mm, however it's probable this will run on the same CPU
+        *   immediately afterwards, so this may not reduce IPIs much.
+        * - A batch of mms requiring IPIs could be gathered and freed at once.
+        * - CPUs store active_mm where it can be remotely checked without a
+        *   lock, to filter out false-positives in the cpumask.
+        * - After mm_users or mm_count reaches zero, switching away from the
+        *   mm could clear mm_cpumask to reduce some IPIs, perhaps together
+        *   with some batching or delaying of the final IPIs.
+        * - A delayed freeing and RCU-like quiescing sequence based on mm
+        *   switching to avoid IPIs completely.
+        */
+       on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
+       if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES))
+               on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
+}
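/*
 * Editorial sketch, not part of this hunk: the refcounting the comment above
 * refers to. Roughly, the companion include/linux/sched/mm.h changes in this
 * series add lazy-tlb grab/drop helpers along these lines (paraphrased; the
 * exact names and memory-ordering details live in that file).
 */
static inline void mmgrab_lazy_tlb_sketch(struct mm_struct *mm)
{
	/* Refcounted mode: a lazy user pins the mm like any other reference. */
	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
		mmgrab(mm);
}

static inline void mmdrop_lazy_tlb_sketch(struct mm_struct *mm)
{
	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
		mmdrop(mm);
	/*
	 * Shootdown mode takes no reference at all; the IPIs issued by
	 * cleanup_lazy_tlbs() above chase any remaining lazy users off the
	 * mm before __mmdrop() frees it.
	 */
}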
+
 /*
  * Called when the last reference to the mm
  * is dropped: either by a lazy thread or by
@@ -786,6 +913,10 @@ void __mmdrop(struct mm_struct *mm)
 
        BUG_ON(mm == &init_mm);
        WARN_ON_ONCE(mm == current->mm);
+
+       /* Ensure no CPUs are using this as their lazy tlb mm */
+       cleanup_lazy_tlbs(mm);
+
        WARN_ON_ONCE(mm == current->active_mm);
        mm_free_pgd(mm);
        destroy_context(mm);
@@ -1128,6 +1259,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
        seqcount_init(&mm->write_protect_seq);
        mmap_init_lock(mm);
        INIT_LIST_HEAD(&mm->mmlist);
+#ifdef CONFIG_PER_VMA_LOCK
+       mm->mm_lock_seq = 0;
+#endif
        mm_pgtables_bytes_init(mm);
        mm->map_count = 0;
        mm->locked_vm = 0;
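/*
 * Editorial sketch, not part of this hunk: mm_lock_seq initialized above is
 * the write-side sequence counter for per-VMA locks. Paraphrasing the
 * companion include/linux/mm.h changes (names are illustrative): a writer,
 * already holding mmap_lock for write, marks a VMA locked by copying the
 * current sequence into it, and dropping mmap_lock advances the sequence,
 * which unlocks every marked VMA at once.
 */
static inline void vma_start_write_sketch(struct vm_area_struct *vma)
{
	mmap_assert_write_locked(vma->vm_mm);

	/* Wait out current readers, then mark the VMA for this write cycle. */
	down_write(&vma->vm_lock->lock);
	WRITE_ONCE(vma->vm_lock_seq, vma->vm_mm->mm_lock_seq);
	up_write(&vma->vm_lock->lock);
}

static inline void vma_end_write_all_sketch(struct mm_struct *mm)
{
	mmap_assert_write_locked(mm);
	/* Exclusive mmap_lock means no concurrent updaters of mm_lock_seq. */
	WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1);
}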
@@ -3159,6 +3293,9 @@ void __init proc_caches_init(void)
                        NULL);
 
        vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
+#ifdef CONFIG_PER_VMA_LOCK
+       vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
+#endif
        mmap_init();
        nsproxy_cache_init();
 }
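/*
 * For reference: KMEM_CACHE() is the slab.h convenience macro, so the new
 * vma_lock cache above is roughly equivalent to the call below. struct
 * vma_lock comes from the companion mm_types.h change and wraps a single
 * rw_semaphore.
 */
vma_lock_cachep = kmem_cache_create("vma_lock", sizeof(struct vma_lock),
				    __alignof__(struct vma_lock),
				    SLAB_PANIC|SLAB_ACCOUNT, NULL);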