Merge tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel...
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d0a6c99aba0988fc5eca123ca6b2859f39350f61..3731999cd9f09a0e873d5987e030f8b3cde7987a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -29,6 +29,7 @@
 #include <linux/pgtable.h>
 #include <linux/kasan.h>
 #include <linux/memremap.h>
+#include <linux/slab.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -257,6 +258,8 @@ void setup_initial_init_mm(void *start_code, void *end_code,
 struct vm_area_struct *vm_area_alloc(struct mm_struct *);
 struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
 void vm_area_free(struct vm_area_struct *);
+/* Use only if VMA has no other users */
+void __vm_area_free(struct vm_area_struct *vma);
 
 #ifndef CONFIG_MMU
 extern struct rb_root nommu_region_tree;
@@ -479,7 +482,8 @@ static inline bool fault_flag_allow_retry_first(enum fault_flag flags)
        { FAULT_FLAG_USER,              "USER" }, \
        { FAULT_FLAG_REMOTE,            "REMOTE" }, \
        { FAULT_FLAG_INSTRUCTION,       "INSTRUCTION" }, \
-       { FAULT_FLAG_INTERRUPTIBLE,     "INTERRUPTIBLE" }
+       { FAULT_FLAG_INTERRUPTIBLE,     "INTERRUPTIBLE" }, \
+       { FAULT_FLAG_VMA_LOCK,          "VMA_LOCK" }
 
 /*
  * vm_fault is filled by the pagefault handler and passed to the vma's
@@ -624,13 +628,21 @@ struct vm_operations_struct {
                                          unsigned long addr);
 };
 
-#ifdef CONFIG_PER_VMA_LOCK
-static inline void vma_init_lock(struct vm_area_struct *vma)
+#ifdef CONFIG_NUMA_BALANCING
+static inline void vma_numab_state_init(struct vm_area_struct *vma)
+{
+       vma->numab_state = NULL;
+}
+static inline void vma_numab_state_free(struct vm_area_struct *vma)
 {
-       init_rwsem(&vma->lock);
-       vma->vm_lock_seq = -1;
+       kfree(vma->numab_state);
 }
+#else
+static inline void vma_numab_state_init(struct vm_area_struct *vma) {}
+static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
+#endif /* CONFIG_NUMA_BALANCING */
 
+#ifdef CONFIG_PER_VMA_LOCK
 /*
  * Try to read-lock a vma. The function is allowed to occasionally yield false
  * locked result to avoid performance overhead, in which case we fall back to
@@ -642,17 +654,17 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
        if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
                return false;
 
-       if (unlikely(down_read_trylock(&vma->lock) == 0))
+       if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
                return false;
 
        /*
         * Overflow might produce false locked result.
         * False unlocked result is impossible because we modify and check
-        * vma->vm_lock_seq under vma->lock protection and mm->mm_lock_seq
+        * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
         * modification invalidates all existing locks.
         */
        if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
-               up_read(&vma->lock);
+               up_read(&vma->vm_lock->lock);
                return false;
        }
        return true;
@@ -661,39 +673,67 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
 static inline void vma_end_read(struct vm_area_struct *vma)
 {
        rcu_read_lock(); /* keeps vma alive till the end of up_read */
-       up_read(&vma->lock);
+       up_read(&vma->vm_lock->lock);
        rcu_read_unlock();
 }
 
-static inline void vma_start_write(struct vm_area_struct *vma)
+static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
 {
-       int mm_lock_seq;
-
        mmap_assert_write_locked(vma->vm_mm);
 
        /*
         * current task is holding mmap_write_lock, both vma->vm_lock_seq and
         * mm->mm_lock_seq can't be concurrently modified.
         */
-       mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
-       if (vma->vm_lock_seq == mm_lock_seq)
+       *mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
+       return (vma->vm_lock_seq == *mm_lock_seq);
+}
+
+static inline void vma_start_write(struct vm_area_struct *vma)
+{
+       int mm_lock_seq;
+
+       if (__is_vma_write_locked(vma, &mm_lock_seq))
                return;
 
-       down_write(&vma->lock);
+       down_write(&vma->vm_lock->lock);
        vma->vm_lock_seq = mm_lock_seq;
-       up_write(&vma->lock);
+       up_write(&vma->vm_lock->lock);
+}
+
+static inline bool vma_try_start_write(struct vm_area_struct *vma)
+{
+       int mm_lock_seq;
+
+       if (__is_vma_write_locked(vma, &mm_lock_seq))
+               return true;
+
+       if (!down_write_trylock(&vma->vm_lock->lock))
+               return false;
+
+       vma->vm_lock_seq = mm_lock_seq;
+       up_write(&vma->vm_lock->lock);
+       return true;
 }
 
 static inline void vma_assert_write_locked(struct vm_area_struct *vma)
 {
-       mmap_assert_write_locked(vma->vm_mm);
-       /*
-        * current task is holding mmap_write_lock, both vma->vm_lock_seq and
-        * mm->mm_lock_seq can't be concurrently modified.
-        */
-       VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma);
+       int mm_lock_seq;
+
+       VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
 }
 
+static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
+{
+       /* When detaching vma should be write-locked */
+       if (detached)
+               vma_assert_write_locked(vma);
+       vma->detached = detached;
+}
+
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+                                         unsigned long address);
+
 #else /* CONFIG_PER_VMA_LOCK */
 
 static inline void vma_init_lock(struct vm_area_struct *vma) {}
@@ -701,10 +741,18 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
                { return false; }
 static inline void vma_end_read(struct vm_area_struct *vma) {}
 static inline void vma_start_write(struct vm_area_struct *vma) {}
+static inline bool vma_try_start_write(struct vm_area_struct *vma)
+               { return true; }
 static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
+static inline void vma_mark_detached(struct vm_area_struct *vma,
+                                    bool detached) {}
 
 #endif /* CONFIG_PER_VMA_LOCK */
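Note: the per-VMA lock API added above is consumed roughly as sketched below. This is an illustrative sketch only, assuming <linux/mm.h>; fault_example() and remove_vma_example() are made-up names, and signal handling, accounting and stack-expansion checks are omitted. lock_vma_under_rcu(), vma_end_read(), vma_start_write(), vma_try_start_write(), vma_mark_detached(), vm_area_free() and handle_mm_fault() are the interfaces being exercised.

/*
 * Illustration only: a page-fault path first tries the per-VMA read lock
 * and falls back to mmap_lock when the lookup fails (including spurious
 * vma_start_read() failures) or when the handler asks for a retry.
 */
static vm_fault_t fault_example(struct mm_struct *mm, unsigned long addr,
                                unsigned int flags, struct pt_regs *regs)
{
        struct vm_area_struct *vma;
        vm_fault_t fault;

        vma = lock_vma_under_rcu(mm, addr);
        if (vma) {
                fault = handle_mm_fault(vma, addr, flags | FAULT_FLAG_VMA_LOCK, regs);
                vma_end_read(vma);
                if (!(fault & VM_FAULT_RETRY))
                        return fault;
        }

        /* Fallback: the classic path under mmap_lock (checks omitted). */
        mmap_read_lock(mm);
        vma = find_vma(mm, addr);
        fault = vma ? handle_mm_fault(vma, addr, flags, regs) : VM_FAULT_SIGSEGV;
        mmap_read_unlock(mm);
        return fault;
}

/*
 * Illustration only: the write side.  A path modifying a VMA under
 * mmap_write_lock calls vma_start_write() first, which records mm_lock_seq
 * in vm_lock_seq so concurrent vma_start_read() callers bail out; a VMA is
 * marked detached (which asserts the write lock) before it is freed.
 */
static void remove_vma_example(struct vm_area_struct *vma)
{
        vma_start_write(vma);   /* or vma_try_start_write() where blocking is not an option */
        /* ... unlink the VMA from the mm's VMA tree ... */
        vma_mark_detached(vma, true);
        vm_area_free(vma);
}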
 
+/*
+ * WARNING: vma_init does not initialize vma->vm_lock.
+ * Use vm_area_alloc()/vm_area_free() if vma needs locking.
+ */
 static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 {
        static const struct vm_operations_struct dummy_vm_ops = {};
@@ -713,7 +761,8 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
        vma->vm_mm = mm;
        vma->vm_ops = &dummy_vm_ops;
        INIT_LIST_HEAD(&vma->anon_vma_chain);
-       vma_init_lock(vma);
+       vma_mark_detached(vma, false);
+       vma_numab_state_init(vma);
 }
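To make the WARNING above concrete, a hedged sketch of the intended split (pseudo_vma_example() is a made-up name; the stack-allocated pseudo-VMA pattern is illustrative, not a specific call site):

/*
 * Illustration only: vma_init() is fine for a short-lived VMA that never
 * enters the mm's VMA tree and can therefore never be reached by
 * lock_vma_under_rcu().  Anything inserted into the tree must come from
 * vm_area_alloc()/vm_area_dup(), which set up vm_lock, and must be
 * released with vm_area_free().
 */
static void pseudo_vma_example(struct mm_struct *mm)
{
        struct vm_area_struct pseudo_vma;

        vma_init(&pseudo_vma, mm);      /* no vm_lock: keep it private to this context */
        /* ... use pseudo_vma locally; nothing to free, it lives on the stack ... */
}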
 
 /* Use when VMA is not part of the VMA tree and needs no locking */
@@ -1637,6 +1686,16 @@ static inline int xchg_page_access_time(struct page *page, int time)
        last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS);
        return last_time << PAGE_ACCESS_TIME_BUCKETS;
 }
+
+static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
+{
+       unsigned int pid_bit;
+
+       pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
+       if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids[1])) {
+               __set_bit(pid_bit, &vma->numab_state->access_pids[1]);
+       }
+}
 #else /* !CONFIG_NUMA_BALANCING */
 static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
 {
@@ -1686,6 +1745,10 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
 {
        return false;
 }
+
+static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
+{
+}
 #endif /* CONFIG_NUMA_BALANCING */
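vma_set_access_pid_bit() above records the faulting task in a small per-VMA hash bitmap; a rough sketch of how the scan side is meant to read that filter follows (vma_recently_accessed_example() is a made-up name; the actual policy, window rotation and the grace period for new VMAs, lives in the NUMA-balancing scan code and is omitted here):

/*
 * Illustration only: access_pids[] is a pair of BITS_PER_LONG-wide hash
 * bitmaps.  hash_32(pid, ilog2(BITS_PER_LONG)) maps a PID to one bit
 * position; faults set that bit in access_pids[1], and a reader checks
 * both windows so recently rotated history still counts.
 */
static inline bool vma_recently_accessed_example(struct vm_area_struct *vma)
{
        unsigned long pids;

        if (!vma->numab_state)
                return false;

        pids = vma->numab_state->access_pids[0] |
               vma->numab_state->access_pids[1];
        return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids);
}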
 
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
@@ -2972,7 +3035,6 @@ extern void setup_per_cpu_pageset(void);
 extern int min_free_kbytes;
 extern int watermark_boost_factor;
 extern int watermark_scale_factor;
-extern bool arch_has_descending_max_zone_pfns(void);
 
 /* nommu.c */
 extern atomic_long_t mmap_pages_allocated;
@@ -3498,6 +3560,22 @@ void vmemmap_populate_print_last(void);
 void vmemmap_free(unsigned long start, unsigned long end,
                struct vmem_altmap *altmap);
 #endif
+
+#ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP
+static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
+                                          struct dev_pagemap *pgmap)
+{
+       return is_power_of_2(sizeof(struct page)) &&
+               pgmap && (pgmap_vmemmap_nr(pgmap) > 1) && !altmap;
+}
+#else
+static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
+                                          struct dev_pagemap *pgmap)
+{
+       return false;
+}
+#endif
+
 void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
                                  unsigned long nr_pages);
 
@@ -3515,8 +3593,6 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
 extern int memory_failure(unsigned long pfn, int flags);
 extern void memory_failure_queue_kick(int cpu);
 extern int unpoison_memory(unsigned long pfn);
-extern int sysctl_memory_failure_early_kill;
-extern int sysctl_memory_failure_recovery;
 extern void shake_page(struct page *p);
 extern atomic_long_t num_poisoned_pages __read_mostly;
 extern int soft_offline_page(unsigned long pfn, int flags);
@@ -3526,6 +3602,7 @@ extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
                                        bool *migratable_cleared);
 void num_poisoned_pages_inc(unsigned long pfn);
 void num_poisoned_pages_sub(unsigned long pfn, long i);
+struct task_struct *task_early_kill(struct task_struct *tsk, int force_early);
 #else
 static inline void memory_failure_queue(unsigned long pfn, int flags)
 {
@@ -3546,6 +3623,12 @@ static inline void num_poisoned_pages_sub(unsigned long pfn, long i)
 }
 #endif
 
+#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_KSM)
+void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
+                    struct vm_area_struct *vma, struct list_head *to_kill,
+                    unsigned long ksm_addr);
+#endif
+
 #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
 extern void memblk_nr_poison_inc(unsigned long pfn);
 extern void memblk_nr_poison_sub(unsigned long pfn, long i);
@@ -3615,14 +3698,12 @@ extern const struct attribute_group memory_failure_attr_group;
 extern void clear_huge_page(struct page *page,
                            unsigned long addr_hint,
                            unsigned int pages_per_huge_page);
-extern void copy_user_huge_page(struct page *dst, struct page *src,
-                               unsigned long addr_hint,
-                               struct vm_area_struct *vma,
-                               unsigned int pages_per_huge_page);
-extern long copy_huge_page_from_user(struct page *dst_page,
-                               const void __user *usr_src,
-                               unsigned int pages_per_huge_page,
-                               bool allow_pagefault);
+int copy_user_large_folio(struct folio *dst, struct folio *src,
+                         unsigned long addr_hint,
+                         struct vm_area_struct *vma);
+long copy_folio_from_user(struct folio *dst_folio,
+                          const void __user *usr_src,
+                          bool allow_pagefault);
 
 /**
  * vma_is_special_huge - Are transhuge page-table entries considered special?