Merge tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel...
[linux-block.git] / include/linux/mm.h
index 98da268b834a37c1eac55113327bbbf1a38a6049..3731999cd9f09a0e873d5987e030f8b3cde7987a 100644 (file)
@@ -29,6 +29,7 @@
 #include <linux/pgtable.h>
 #include <linux/kasan.h>
 #include <linux/memremap.h>
+#include <linux/slab.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -38,6 +39,7 @@ struct pt_regs;
 
 extern int sysctl_page_lock_unfairness;
 
+void mm_core_init(void);
 void init_mm_internals(void);
 
 #ifndef CONFIG_NUMA            /* Don't use mapnrs, do it properly */
@@ -256,6 +258,8 @@ void setup_initial_init_mm(void *start_code, void *end_code,
 struct vm_area_struct *vm_area_alloc(struct mm_struct *);
 struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
 void vm_area_free(struct vm_area_struct *);
+/* Use only if VMA has no other users */
+void __vm_area_free(struct vm_area_struct *vma);
 
 #ifndef CONFIG_MMU
 extern struct rb_root nommu_region_tree;
@@ -478,7 +482,8 @@ static inline bool fault_flag_allow_retry_first(enum fault_flag flags)
        { FAULT_FLAG_USER,              "USER" }, \
        { FAULT_FLAG_REMOTE,            "REMOTE" }, \
        { FAULT_FLAG_INSTRUCTION,       "INSTRUCTION" }, \
-       { FAULT_FLAG_INTERRUPTIBLE,     "INTERRUPTIBLE" }
+       { FAULT_FLAG_INTERRUPTIBLE,     "INTERRUPTIBLE" }, \
+       { FAULT_FLAG_VMA_LOCK,          "VMA_LOCK" }
 
 /*
  * vm_fault is filled by the pagefault handler and passed to the vma's
@@ -623,6 +628,131 @@ struct vm_operations_struct {
                                          unsigned long addr);
 };
 
+#ifdef CONFIG_NUMA_BALANCING
+static inline void vma_numab_state_init(struct vm_area_struct *vma)
+{
+       vma->numab_state = NULL;
+}
+static inline void vma_numab_state_free(struct vm_area_struct *vma)
+{
+       kfree(vma->numab_state);
+}
+#else
+static inline void vma_numab_state_init(struct vm_area_struct *vma) {}
+static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
+#endif /* CONFIG_NUMA_BALANCING */
+
+#ifdef CONFIG_PER_VMA_LOCK
+/*
+ * Try to read-lock a vma. The function is allowed to occasionally yield a
+ * false locked result to avoid performance overhead, in which case we fall
+ * back to using mmap_lock. The function should never yield a false unlocked
+ * result.
+ */
+static inline bool vma_start_read(struct vm_area_struct *vma)
+{
+       /* Check before locking. A race might cause a false locked result. */
+       if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
+               return false;
+
+       if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
+               return false;
+
+       /*
+        * Overflow might produce a false locked result.
+        * A false unlocked result is impossible because we modify and check
+        * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
+        * modification invalidates all existing locks.
+        */
+       if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
+               up_read(&vma->vm_lock->lock);
+               return false;
+       }
+       return true;
+}
+
+static inline void vma_end_read(struct vm_area_struct *vma)
+{
+       rcu_read_lock(); /* keeps vma alive till the end of up_read */
+       up_read(&vma->vm_lock->lock);
+       rcu_read_unlock();
+}
+
+static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
+{
+       mmap_assert_write_locked(vma->vm_mm);
+
+       /*
+        * The current task is holding mmap_write_lock, so neither
+        * vma->vm_lock_seq nor mm->mm_lock_seq can be concurrently modified.
+        */
+       *mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
+       return (vma->vm_lock_seq == *mm_lock_seq);
+}
+
+static inline void vma_start_write(struct vm_area_struct *vma)
+{
+       int mm_lock_seq;
+
+       if (__is_vma_write_locked(vma, &mm_lock_seq))
+               return;
+
+       down_write(&vma->vm_lock->lock);
+       vma->vm_lock_seq = mm_lock_seq;
+       up_write(&vma->vm_lock->lock);
+}
+
+static inline bool vma_try_start_write(struct vm_area_struct *vma)
+{
+       int mm_lock_seq;
+
+       if (__is_vma_write_locked(vma, &mm_lock_seq))
+               return true;
+
+       if (!down_write_trylock(&vma->vm_lock->lock))
+               return false;
+
+       vma->vm_lock_seq = mm_lock_seq;
+       up_write(&vma->vm_lock->lock);
+       return true;
+}
+
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+{
+       int mm_lock_seq;
+
+       VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
+}
+
+static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
+{
+       /* When detaching, the vma should be write-locked */
+       if (detached)
+               vma_assert_write_locked(vma);
+       vma->detached = detached;
+}
+
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+                                         unsigned long address);
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline void vma_init_lock(struct vm_area_struct *vma) {}
+static inline bool vma_start_read(struct vm_area_struct *vma)
+               { return false; }
+static inline void vma_end_read(struct vm_area_struct *vma) {}
+static inline void vma_start_write(struct vm_area_struct *vma) {}
+static inline bool vma_try_start_write(struct vm_area_struct *vma)
+               { return true; }
+static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
+static inline void vma_mark_detached(struct vm_area_struct *vma,
+                                    bool detached) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
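For context, the sketch below shows roughly how an architecture fault handler is expected to consume the API above; it is not part of this patch, and fallback_to_mmap_lock_fault() is a stand-in for the arch's conventional mmap_lock-protected slow path. lock_vma_under_rcu() combines the RCU VMA lookup with vma_start_read(), and a VM_FAULT_RETRY result means the fault could not be completed under the VMA lock alone. Note also that vma_start_write() releases vma->vm_lock immediately: the write-locked state is carried by vm_lock_seq matching mm_lock_seq and only ends when mmap_write_unlock() advances mm_lock_seq, which is what "invalidates all existing locks" above refers to.

        /*
         * Hedged sketch of a per-VMA-lock fault-path consumer; not from this
         * patch.  fallback_to_mmap_lock_fault() stands in for the arch's
         * usual mmap_lock-protected slow path.
         */
        static vm_fault_t fault_with_vma_lock(struct mm_struct *mm, unsigned long address,
                                              unsigned int flags, struct pt_regs *regs)
        {
                struct vm_area_struct *vma;
                vm_fault_t fault;

                vma = lock_vma_under_rcu(mm, address);  /* lookup + vma_start_read() */
                if (!vma)
                        return fallback_to_mmap_lock_fault(mm, address, flags, regs);

                fault = handle_mm_fault(vma, address,
                                        flags | FAULT_FLAG_VMA_LOCK, regs);
                vma_end_read(vma);

                if (fault & VM_FAULT_RETRY)     /* could not finish under the VMA lock */
                        return fallback_to_mmap_lock_fault(mm, address, flags, regs);

                return fault;
        }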
+/*
+ * WARNING: vma_init does not initialize vma->vm_lock.
+ * Use vm_area_alloc()/vm_area_free() if vma needs locking.
+ */
 static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 {
        static const struct vm_operations_struct dummy_vm_ops = {};
@@ -631,6 +761,8 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
        vma->vm_mm = mm;
        vma->vm_ops = &dummy_vm_ops;
        INIT_LIST_HEAD(&vma->anon_vma_chain);
+       vma_mark_detached(vma, false);
+       vma_numab_state_init(vma);
 }
 
 /* Use when VMA is not part of the VMA tree and needs no locking */
@@ -644,28 +776,28 @@ static inline void vm_flags_init(struct vm_area_struct *vma,
 static inline void vm_flags_reset(struct vm_area_struct *vma,
                                  vm_flags_t flags)
 {
-       mmap_assert_write_locked(vma->vm_mm);
+       vma_start_write(vma);
        vm_flags_init(vma, flags);
 }
 
 static inline void vm_flags_reset_once(struct vm_area_struct *vma,
                                       vm_flags_t flags)
 {
-       mmap_assert_write_locked(vma->vm_mm);
+       vma_start_write(vma);
        WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags);
 }
 
 static inline void vm_flags_set(struct vm_area_struct *vma,
                                vm_flags_t flags)
 {
-       mmap_assert_write_locked(vma->vm_mm);
+       vma_start_write(vma);
        ACCESS_PRIVATE(vma, __vm_flags) |= flags;
 }
 
 static inline void vm_flags_clear(struct vm_area_struct *vma,
                                  vm_flags_t flags)
 {
-       mmap_assert_write_locked(vma->vm_mm);
+       vma_start_write(vma);
        ACCESS_PRIVATE(vma, __vm_flags) &= ~flags;
 }
 
@@ -686,7 +818,7 @@ static inline void __vm_flags_mod(struct vm_area_struct *vma,
 static inline void vm_flags_mod(struct vm_area_struct *vma,
                                vm_flags_t set, vm_flags_t clear)
 {
-       mmap_assert_write_locked(vma->vm_mm);
+       vma_start_write(vma);
        __vm_flags_mod(vma, set, clear);
 }
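With this hunk every vm_flags modifier write-locks the VMA instead of merely asserting that mmap_lock is held for write, so flag updates are serialized against the lock-free readers added above. A small illustration of the interplay (illustrative only; VM_LOCKED is just an example flag, and the real callers of vma_start_read() are the fault paths rather than open-coded checks like this):

        /* Writer, with mmap_lock held for write: */
        vm_flags_set(vma, VM_LOCKED);           /* vma_start_write() + flag update */

        /* Concurrent lock-free reader: */
        if (vma_start_read(vma)) {
                /*
                 * The reader holds vma->vm_lock, so a later vma_start_write()
                 * blocks in down_write() until vma_end_read(); the flags are
                 * observed in a consistent state.
                 */
                vma_end_read(vma);
        } else {
                /* treated as locked (or contended): fall back to mmap_lock */
        }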
 
@@ -1554,6 +1686,16 @@ static inline int xchg_page_access_time(struct page *page, int time)
        last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS);
        return last_time << PAGE_ACCESS_TIME_BUCKETS;
 }
+
+static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
+{
+       unsigned int pid_bit;
+
+       pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
+       if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids[1])) {
+               __set_bit(pid_bit, &vma->numab_state->access_pids[1]);
+       }
+}
 #else /* !CONFIG_NUMA_BALANCING */
 static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
 {
@@ -1603,6 +1745,10 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
 {
        return false;
 }
+
+static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
+{
+}
 #endif /* CONFIG_NUMA_BALANCING */
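vma_set_access_pid_bit() above hashes the faulting task's pid into one of BITS_PER_LONG bits and records it in the VMA's current access window, so the per-VMA pid history acts as a small Bloom-style filter of recently faulting tasks. The consumer lives in the NUMA-balancing scan code; the sketch below approximates that check (the helper name and the two-window union are assumptions based on the rest of the series, not part of this diff):

        /* Hedged sketch of the scan-side check; not part of this patch. */
        static bool vma_accessed_by_current(struct vm_area_struct *vma)
        {
                unsigned int bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
                unsigned long pids;

                if (!vma->numab_state)
                        return false;

                /* union of the previous and the current access window */
                pids = READ_ONCE(vma->numab_state->access_pids[0]) |
                       READ_ONCE(vma->numab_state->access_pids[1]);

                return test_bit(bit, &pids);
        }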
 
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
@@ -2636,12 +2782,6 @@ static inline bool ptlock_init(struct page *page) { return true; }
 static inline void ptlock_free(struct page *page) {}
 #endif /* USE_SPLIT_PTE_PTLOCKS */
 
-static inline void pgtable_init(void)
-{
-       ptlock_cache_init();
-       pgtable_cache_init();
-}
-
 static inline bool pgtable_pte_page_ctor(struct page *page)
 {
        if (!ptlock_init(page))
@@ -2785,7 +2925,6 @@ extern unsigned long free_reserved_area(void *start, void *end,
                                        int poison, const char *s);
 
 extern void adjust_managed_page_count(struct page *page, long count);
-extern void mem_init_print_info(void);
 
 extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end);
 
@@ -2896,7 +3035,6 @@ extern void setup_per_cpu_pageset(void);
 extern int min_free_kbytes;
 extern int watermark_boost_factor;
 extern int watermark_scale_factor;
-extern bool arch_has_descending_max_zone_pfns(void);
 
 /* nommu.c */
 extern atomic_long_t mmap_pages_allocated;
@@ -3185,8 +3323,6 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn, pgprot_t pgprot);
 vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                        pfn_t pfn);
-vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
-                       pfn_t pfn, pgprot_t pgprot);
 vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
                unsigned long addr, pfn_t pfn);
 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
@@ -3256,7 +3392,6 @@ extern int apply_to_existing_page_range(struct mm_struct *mm,
                                   unsigned long address, unsigned long size,
                                   pte_fn_t fn, void *data);
 
-extern void __init init_mem_debugging_and_hardening(void);
 #ifdef CONFIG_PAGE_POISONING
 extern void __kernel_poison_pages(struct page *page, int numpages);
 extern void __kernel_unpoison_pages(struct page *page, int numpages);
@@ -3425,6 +3560,22 @@ void vmemmap_populate_print_last(void);
 void vmemmap_free(unsigned long start, unsigned long end,
                struct vmem_altmap *altmap);
 #endif
+
+#ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP
+static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
+                                          struct dev_pagemap *pgmap)
+{
+       return is_power_of_2(sizeof(struct page)) &&
+               pgmap && (pgmap_vmemmap_nr(pgmap) > 1) && !altmap;
+}
+#else
+static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
+                                          struct dev_pagemap *pgmap)
+{
+       return false;
+}
+#endif
+
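vmemmap_can_optimize() gates the vmemmap deduplication for compound device pages: struct page must be a power-of-two size so that struct pages tile their backing pages exactly, the pgmap must actually create compound pages (pgmap_vmemmap_nr() > 1), and the struct pages must not be carved out of device memory via an altmap. The sketch below shows roughly how the sparse-vmemmap populate path is expected to use it (based on the shape of the rest of this series; exact parameter names may differ):

        /* Hedged sketch of the __populate_section_memmap() decision point. */
        if (vmemmap_can_optimize(altmap, pgmap))
                /* compound pages: tail struct pages can share backing pages */
                r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
        else
                /* plain layout: one struct page per base page, honouring altmap */
                r = vmemmap_populate(start, end, nid, altmap);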
 void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
                                  unsigned long nr_pages);
 
@@ -3451,6 +3602,7 @@ extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
                                        bool *migratable_cleared);
 void num_poisoned_pages_inc(unsigned long pfn);
 void num_poisoned_pages_sub(unsigned long pfn, long i);
+struct task_struct *task_early_kill(struct task_struct *tsk, int force_early);
 #else
 static inline void memory_failure_queue(unsigned long pfn, int flags)
 {
@@ -3471,6 +3623,12 @@ static inline void num_poisoned_pages_sub(unsigned long pfn, long i)
 }
 #endif
 
+#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_KSM)
+void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
+                    struct vm_area_struct *vma, struct list_head *to_kill,
+                    unsigned long ksm_addr);
+#endif
+
 #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
 extern void memblk_nr_poison_inc(unsigned long pfn);
 extern void memblk_nr_poison_sub(unsigned long pfn, long i);
@@ -3540,14 +3698,12 @@ extern const struct attribute_group memory_failure_attr_group;
 extern void clear_huge_page(struct page *page,
                            unsigned long addr_hint,
                            unsigned int pages_per_huge_page);
-extern void copy_user_huge_page(struct page *dst, struct page *src,
-                               unsigned long addr_hint,
-                               struct vm_area_struct *vma,
-                               unsigned int pages_per_huge_page);
-extern long copy_huge_page_from_user(struct page *dst_page,
-                               const void __user *usr_src,
-                               unsigned int pages_per_huge_page,
-                               bool allow_pagefault);
+int copy_user_large_folio(struct folio *dst, struct folio *src,
+                         unsigned long addr_hint,
+                         struct vm_area_struct *vma);
+long copy_folio_from_user(struct folio *dst_folio,
+                          const void __user *usr_src,
+                          bool allow_pagefault);
 
 /**
  * vma_is_special_huge - Are transhuge page-table entries considered special?