Merge tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel...
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d0a6c99aba0988fc5eca123ca6b2859f39350f61..3731999cd9f09a0e873d5987e030f8b3cde7987a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -29,6 +29,7 @@
 #include <linux/pgtable.h>
 #include <linux/kasan.h>
 #include <linux/memremap.h>
+#include <linux/slab.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -257,6 +258,8 @@ void setup_initial_init_mm(void *start_code, void *end_code,
 struct vm_area_struct *vm_area_alloc(struct mm_struct *);
 struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
 void vm_area_free(struct vm_area_struct *);
+/* Use only if VMA has no other users */
+void __vm_area_free(struct vm_area_struct *vma);
 
 #ifndef CONFIG_MMU
 extern struct rb_root nommu_region_tree;
@@ -479,7 +482,8 @@ static inline bool fault_flag_allow_retry_first(enum fault_flag flags)
        { FAULT_FLAG_USER,              "USER" }, \
        { FAULT_FLAG_REMOTE,            "REMOTE" }, \
        { FAULT_FLAG_INSTRUCTION,       "INSTRUCTION" }, \
-       { FAULT_FLAG_INTERRUPTIBLE,     "INTERRUPTIBLE" }
+       { FAULT_FLAG_INTERRUPTIBLE,     "INTERRUPTIBLE" }, \
+       { FAULT_FLAG_VMA_LOCK,          "VMA_LOCK" }
 
 /*
  * vm_fault is filled by the pagefault handler and passed to the vma's
@@ -624,13 +628,21 @@ struct vm_operations_struct {
                                          unsigned long addr);
 };
 
-#ifdef CONFIG_PER_VMA_LOCK
-static inline void vma_init_lock(struct vm_area_struct *vma)
+#ifdef CONFIG_NUMA_BALANCING
+static inline void vma_numab_state_init(struct vm_area_struct *vma)
+{
+       vma->numab_state = NULL;
+}
+static inline void vma_numab_state_free(struct vm_area_struct *vma)
 {
-       init_rwsem(&vma->lock);
-       vma->vm_lock_seq = -1;
+       kfree(vma->numab_state);
 }
+#else
+static inline void vma_numab_state_init(struct vm_area_struct *vma) {}
+static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
+#endif /* CONFIG_NUMA_BALANCING */
 
+#ifdef CONFIG_PER_VMA_LOCK
 /*
  * Try to read-lock a vma. The function is allowed to occasionally yield false
  * locked result to avoid performance overhead, in which case we fall back to
@@ -642,17 +654,17 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
        if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
                return false;
 
-       if (unlikely(down_read_trylock(&vma->lock) == 0))
+       if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
                return false;
 
        /*
         * Overflow might produce false locked result.
         * False unlocked result is impossible because we modify and check
-        * vma->vm_lock_seq under vma->lock protection and mm->mm_lock_seq
+        * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
         * modification invalidates all existing locks.
         */
        if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
-               up_read(&vma->lock);
+               up_read(&vma->vm_lock->lock);
                return false;
        }
        return true;
@@ -661,39 +673,67 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
 static inline void vma_end_read(struct vm_area_struct *vma)
 {
        rcu_read_lock(); /* keeps vma alive till the end of up_read */
-       up_read(&vma->lock);
+       up_read(&vma->vm_lock->lock);
        rcu_read_unlock();
 }
 
-static inline void vma_start_write(struct vm_area_struct *vma)
+static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
 {
-       int mm_lock_seq;
-
        mmap_assert_write_locked(vma->vm_mm);
 
        /*
         * current task is holding mmap_write_lock, both vma->vm_lock_seq and
         * mm->mm_lock_seq can't be concurrently modified.
         */
-       mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
-       if (vma->vm_lock_seq == mm_lock_seq)
+       *mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
+       return (vma->vm_lock_seq == *mm_lock_seq);
+}
+
+static inline void vma_start_write(struct vm_area_struct *vma)
+{
+       int mm_lock_seq;
+
+       if (__is_vma_write_locked(vma, &mm_lock_seq))
                return;
 
-       down_write(&vma->lock);
+       down_write(&vma->vm_lock->lock);
        vma->vm_lock_seq = mm_lock_seq;
-       up_write(&vma->lock);
+       up_write(&vma->vm_lock->lock);
+}
+
+static inline bool vma_try_start_write(struct vm_area_struct *vma)
+{
+       int mm_lock_seq;
+
+       if (__is_vma_write_locked(vma, &mm_lock_seq))
+               return true;
+
+       if (!down_write_trylock(&vma->vm_lock->lock))
+               return false;
+
+       vma->vm_lock_seq = mm_lock_seq;
+       up_write(&vma->vm_lock->lock);
+       return true;
 }
 
 static inline void vma_assert_write_locked(struct vm_area_struct *vma)
 {
-       mmap_assert_write_locked(vma->vm_mm);
-       /*
-        * current task is holding mmap_write_lock, both vma->vm_lock_seq and
-        * mm->mm_lock_seq can't be concurrently modified.
-        */
-       VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma);
+       int mm_lock_seq;
+
+       VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
 }
 
+static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
+{
+       /* When detaching vma should be write-locked */
+       if (detached)
+               vma_assert_write_locked(vma);
+       vma->detached = detached;
+}
+
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+                                         unsigned long address);
+
 #else /* CONFIG_PER_VMA_LOCK */
 
 static inline void vma_init_lock(struct vm_area_struct *vma) {}
@@ -701,10 +741,18 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
                { return false; }
 static inline void vma_end_read(struct vm_area_struct *vma) {}
 static inline void vma_start_write(struct vm_area_struct *vma) {}
+static inline bool vma_try_start_write(struct vm_area_struct *vma)
+               { return true; }
 static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
+static inline void vma_mark_detached(struct vm_area_struct *vma,
+                                    bool detached) {}
 
 #endif /* CONFIG_PER_VMA_LOCK */
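Note: the per-VMA lock API added above is consumed roughly as sketched below. This is an illustrative sketch only, assuming <linux/mm.h>; fault_example() and remove_vma_example() are made-up names, and signal handling, accounting and stack-expansion checks are omitted. lock_vma_under_rcu(), vma_end_read(), vma_start_write(), vma_try_start_write(), vma_mark_detached(), vm_area_free() and handle_mm_fault() are the interfaces being exercised.

/*
 * Illustration only: a page-fault path first tries the per-VMA read lock
 * and falls back to mmap_lock when the lookup fails (including spurious
 * vma_start_read() failures) or when the handler asks for a retry.
 */
static vm_fault_t fault_example(struct mm_struct *mm, unsigned long addr,
                                unsigned int flags, struct pt_regs *regs)
{
        struct vm_area_struct *vma;
        vm_fault_t fault;

        vma = lock_vma_under_rcu(mm, addr);
        if (vma) {
                fault = handle_mm_fault(vma, addr, flags | FAULT_FLAG_VMA_LOCK, regs);
                vma_end_read(vma);
                if (!(fault & VM_FAULT_RETRY))
                        return fault;
        }

        /* Fallback: the classic path under mmap_lock (checks omitted). */
        mmap_read_lock(mm);
        vma = find_vma(mm, addr);
        fault = vma ? handle_mm_fault(vma, addr, flags, regs) : VM_FAULT_SIGSEGV;
        mmap_read_unlock(mm);
        return fault;
}

/*
 * Illustration only: the write side.  A path modifying a VMA under
 * mmap_write_lock calls vma_start_write() first, which records mm_lock_seq
 * in vm_lock_seq so concurrent vma_start_read() callers bail out; a VMA is
 * marked detached (which asserts the write lock) before it is freed.
 */
static void remove_vma_example(struct vm_area_struct *vma)
{
        vma_start_write(vma);   /* or vma_try_start_write() where blocking is not an option */
        /* ... unlink the VMA from the mm's VMA tree ... */
        vma_mark_detached(vma, true);
        vm_area_free(vma);
}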
 
+/*
+ * WARNING: vma_init does not initialize vma->vm_lock.
+ * Use vm_area_alloc()/vm_area_free() if vma needs locking.
+ */
 static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 {
        static const struct vm_operations_struct dummy_vm_ops = {};
@@ -713,7 +761,8 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
        vma->vm_mm = mm;
        vma->vm_ops = &dummy_vm_ops;
        INIT_LIST_HEAD(&vma->anon_vma_chain);
-       vma_init_lock(vma);
+       vma_mark_detached(vma, false);
+       vma_numab_state_init(vma);
 }
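To make the WARNING above concrete, a hedged sketch of the intended split (pseudo_vma_example() is a made-up name; the stack-allocated pseudo-VMA pattern is illustrative, not a specific call site):

/*
 * Illustration only: vma_init() is fine for a short-lived VMA that never
 * enters the mm's VMA tree and can therefore never be reached by
 * lock_vma_under_rcu().  Anything inserted into the tree must come from
 * vm_area_alloc()/vm_area_dup(), which set up vm_lock, and must be
 * released with vm_area_free().
 */
static void pseudo_vma_example(struct mm_struct *mm)
{
        struct vm_area_struct pseudo_vma;

        vma_init(&pseudo_vma, mm);      /* no vm_lock: keep it private to this context */
        /* ... use pseudo_vma locally; nothing to free, it lives on the stack ... */
}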
 
 /* Use when VMA is not part of the VMA tree and needs no locking */
@@ -1637,6 +1686,16 @@ static inline int xchg_page_access_time(struct page *page, int time)
        last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS);
        return last_time << PAGE_ACCESS_TIME_BUCKETS;
 }
+
+static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
+{
+       unsigned int pid_bit;
+
+       pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
+       if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids[1])) {
+               __set_bit(pid_bit, &vma->numab_state->access_pids[1]);
+       }
+}
 #else /* !CONFIG_NUMA_BALANCING */
 static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
 {
@@ -1686,6 +1745,10 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
 {
        return false;
 }
+
+static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
+{
+}
 #endif /* CONFIG_NUMA_BALANCING */
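vma_set_access_pid_bit() above records the faulting task in a small per-VMA hash bitmap; a rough sketch of how the scan side is meant to read that filter follows (vma_recently_accessed_example() is a made-up name; the actual policy, window rotation and the grace period for new VMAs, lives in the NUMA-balancing scan code and is omitted here):

/*
 * Illustration only: access_pids[] is a pair of BITS_PER_LONG-wide hash
 * bitmaps.  hash_32(pid, ilog2(BITS_PER_LONG)) maps a PID to one bit
 * position; faults set that bit in access_pids[1], and a reader checks
 * both windows so recently rotated history still counts.
 */
static inline bool vma_recently_accessed_example(struct vm_area_struct *vma)
{
        unsigned long pids;

        if (!vma->numab_state)
                return false;

        pids = vma->numab_state->access_pids[0] |
               vma->numab_state->access_pids[1];
        return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids);
}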
 
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
@@ -2972,7 +3035,6 @@ extern void setup_per_cpu_pageset(void);
 extern int min_free_kbytes;
 extern int watermark_boost_factor;
 extern int watermark_scale_factor;
-extern bool arch_has_descending_max_zone_pfns(void);
 
 /* nommu.c */
 extern atomic_long_t mmap_pages_allocated;
@@ -3498,6 +3560,22 @@ void vmemmap_populate_print_last(void);
 void vmemmap_free(unsigned long start, unsigned long end,
                struct vmem_altmap *altmap);
 #endif
+
+#ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP
+static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
+                                          struct dev_pagemap *pgmap)
+{
+       return is_power_of_2(sizeof(struct page)) &&
+               pgmap && (pgmap_vmemmap_nr(pgmap) > 1) && !altmap;
+}
+#else
+static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
+                                          struct dev_pagemap *pgmap)
+{
+       return false;
+}
+#endif
+
 void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
                                  unsigned long nr_pages);
 
@@ -3515,8 +3593,6 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
 extern int memory_failure(unsigned long pfn, int flags);
 extern void memory_failure_queue_kick(int cpu);
 extern int unpoison_memory(unsigned long pfn);
-extern int sysctl_memory_failure_early_kill;
-extern int sysctl_memory_failure_recovery;
 extern void shake_page(struct page *p);
 extern atomic_long_t num_poisoned_pages __read_mostly;
 extern int soft_offline_page(unsigned long pfn, int flags);
@@ -3526,6 +3602,7 @@ extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
                                        bool *migratable_cleared);
 void num_poisoned_pages_inc(unsigned long pfn);
 void num_poisoned_pages_sub(unsigned long pfn, long i);
+struct task_struct *task_early_kill(struct task_struct *tsk, int force_early);
 #else
 static inline void memory_failure_queue(unsigned long pfn, int flags)
 {
@@ -3546,6 +3623,12 @@ static inline void num_poisoned_pages_sub(unsigned long pfn, long i)
 }
 #endif
 
+#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_KSM)
+void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
+                    struct vm_area_struct *vma, struct list_head *to_kill,
+                    unsigned long ksm_addr);
+#endif
+
 #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
 extern void memblk_nr_poison_inc(unsigned long pfn);
 extern void memblk_nr_poison_sub(unsigned long pfn, long i);
@@ -3615,14 +3698,12 @@ extern const struct attribute_group memory_failure_attr_group;
 extern void clear_huge_page(struct page *page,
                            unsigned long addr_hint,
                            unsigned int pages_per_huge_page);
-extern void copy_user_huge_page(struct page *dst, struct page *src,
-                               unsigned long addr_hint,
-                               struct vm_area_struct *vma,
-                               unsigned int pages_per_huge_page);
-extern long copy_huge_page_from_user(struct page *dst_page,
-                               const void __user *usr_src,
-                               unsigned int pages_per_huge_page,
-                               bool allow_pagefault);
+int copy_user_large_folio(struct folio *dst, struct folio *src,
+                         unsigned long addr_hint,
+                         struct vm_area_struct *vma);
+long copy_folio_from_user(struct folio *dst_folio,
+                          const void __user *usr_src,
+                          bool allow_pagefault);
 
 /**
  * vma_is_special_huge - Are transhuge page-table entries considered special?