Merge tag 'mm-stable-2024-05-17-19-19' of git://git.kernel.org/pub/scm/linux/kernel...

[linux-2.6-block.git] / include / linux / mm.h
diff --git a/include/linux/mm.h b/include/linux/mm.h

index b6bdaa18b9e9d44583cd245c662e6c2bfdae3825..9849dfda44d43ccdaaab5e5a49d979753d189f0e 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -5,6 +5,7 @@
  #include <linux/errno.h>
  #include <linux/mmdebug.h>
  #include <linux/gfp.h>
+#include <linux/pgalloc_tag.h>
  #include <linux/bug.h>
  #include <linux/list.h>
  #include <linux/mmzone.h>
@@ -1199,7 +1200,7 @@ static inline int is_vmalloc_or_module_addr(const void *x)
   * debugging purposes - it does not include PTE-mapped sub-pages; look
   * at folio_mapcount() or page_mapcount() instead.
   */
-static inline int folio_entire_mapcount(struct folio *folio)
+static inline int folio_entire_mapcount(const struct folio *folio)
  {
         VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
         return atomic_read(&folio->_entire_mapcount) + 1;
@@ -1231,7 +1232,7 @@ static inline int page_mapcount(struct page *page)
         int mapcount = atomic_read(&page->_mapcount) + 1;
  
         /* Handle page_has_type() pages */
-       if (mapcount < 0)
+       if (mapcount < PAGE_MAPCOUNT_RESERVE + 1)
                 mapcount = 0;
         if (unlikely(PageCompound(page)))
                 mapcount += folio_entire_mapcount(page_folio(page));
@@ -1239,34 +1240,44 @@ static inline int page_mapcount(struct page *page)
         return mapcount;
  }
  
-int folio_total_mapcount(struct folio *folio);
+static inline int folio_large_mapcount(const struct folio *folio)
+{
+       VM_WARN_ON_FOLIO(!folio_test_large(folio), folio);
+       return atomic_read(&folio->_large_mapcount) + 1;
+}
  
  /**
- * folio_mapcount() - Calculate the number of mappings of this folio.
+ * folio_mapcount() - Number of mappings of this folio.
   * @folio: The folio.
   *
- * A large folio tracks both how many times the entire folio is mapped,
- * and how many times each individual page in the folio is mapped.
- * This function calculates the total number of times the folio is
- * mapped.
+ * The folio mapcount corresponds to the number of present user page table
+ * entries that reference any part of a folio. Each such present user page
+ * table entry must be paired with exactly one folio reference.
+ *
+ * For ordinary folios, each user page table entry (PTE/PMD/PUD/...) counts
+ * exactly once.
+ *
+ * For hugetlb folios, each abstracted "hugetlb" user page table entry that
+ * references the entire folio counts exactly once, even when such special
+ * page table entries are comprised of multiple ordinary page table entries.
+ *
+ * Will report 0 for pages which cannot be mapped into userspace, such as
+ * slab, page tables and similar.
   *
   * Return: The number of times this folio is mapped.
   */
-static inline int folio_mapcount(struct folio *folio)
+static inline int folio_mapcount(const struct folio *folio)
  {
-       if (likely(!folio_test_large(folio)))
-               return atomic_read(&folio->_mapcount) + 1;
-       return folio_total_mapcount(folio);
-}
+       int mapcount;
  
-static inline bool folio_large_is_mapped(struct folio *folio)
-{
-       /*
-        * Reading _entire_mapcount below could be omitted if hugetlb
-        * participated in incrementing nr_pages_mapped when compound mapped.
-        */
-       return atomic_read(&folio->_nr_pages_mapped) > 0 ||
-               atomic_read(&folio->_entire_mapcount) >= 0;
+       if (likely(!folio_test_large(folio))) {
+               mapcount = atomic_read(&folio->_mapcount) + 1;
+               /* Handle page_has_type() pages */
+               if (mapcount < PAGE_MAPCOUNT_RESERVE + 1)
+                       mapcount = 0;
+               return mapcount;
+       }
+       return folio_large_mapcount(folio);
  }
  
  /**
@@ -1275,11 +1286,9 @@ static inline bool folio_large_is_mapped(struct folio *folio)
   *
   * Return: True if any page in this folio is referenced by user page tables.
   */
-static inline bool folio_mapped(struct folio *folio)
+static inline bool folio_mapped(const struct folio *folio)
  {
-       if (likely(!folio_test_large(folio)))
-               return atomic_read(&folio->_mapcount) >= 0;
-       return folio_large_is_mapped(folio);
+       return folio_mapcount(folio) >= 1;
  }
  
  /*
@@ -1287,11 +1296,9 @@ static inline bool folio_mapped(struct folio *folio)
   * For compound page it returns true if any sub-page of compound page is mapped,
   * even if this particular sub-page is not itself mapped by any PTE or PMD.
   */
-static inline bool page_mapped(struct page *page)
+static inline bool page_mapped(const struct page *page)
  {
-       if (likely(!PageCompound(page)))
-               return atomic_read(&page->_mapcount) >= 0;
-       return folio_large_is_mapped(page_folio(page));
+       return folio_mapped(page_folio(page));
  }
  
  static inline struct page *virt_to_head_page(const void *x)
@@ -1317,8 +1324,6 @@ void folio_copy(struct folio *dst, struct folio *src);
  
  unsigned long nr_free_buffer_pages(void);
  
-void destroy_large_folio(struct folio *folio);
-
  /* Returns the number of bytes in this potentially compound page. */
  static inline unsigned long page_size(struct page *page)
  {
@@ -1436,27 +1441,22 @@ vm_fault_t finish_fault(struct vm_fault *vmf);
  #if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX)
  DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
  
-bool __put_devmap_managed_page_refs(struct page *page, int refs);
-static inline bool put_devmap_managed_page_refs(struct page *page, int refs)
+bool __put_devmap_managed_folio_refs(struct folio *folio, int refs);
+static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs)
  {
         if (!static_branch_unlikely(&devmap_managed_key))
                 return false;
-       if (!is_zone_device_page(page))
+       if (!folio_is_zone_device(folio))
                 return false;
-       return __put_devmap_managed_page_refs(page, refs);
+       return __put_devmap_managed_folio_refs(folio, refs);
  }
  #else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
-static inline bool put_devmap_managed_page_refs(struct page *page, int refs)
+static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs)
  {
         return false;
  }
  #endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
  
-static inline bool put_devmap_managed_page(struct page *page)
-{
-       return put_devmap_managed_page_refs(page, 1);
-}
-
  /* 127: arbitrary random number, small enough to assemble well */
  #define folio_ref_zero_or_close_to_overflow(folio) \
         ((unsigned int) folio_ref_count(folio) + 127u <= 127u)
@@ -1575,7 +1575,7 @@ static inline void put_page(struct page *page)
          * For some devmap managed pages we need to catch refcount transition
          * from 2 to 1:
          */
-       if (put_devmap_managed_page(&folio->page))
+       if (put_devmap_managed_folio_refs(folio, 1))
                 return;
         folio_put(folio);
  }
@@ -2069,7 +2069,7 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
   *
   * Return: A positive power of two.
   */
-static inline long folio_nr_pages(struct folio *folio)
+static inline long folio_nr_pages(const struct folio *folio)
  {
         if (!folio_test_large(folio))
                 return 1;
@@ -2164,21 +2164,64 @@ static inline size_t folio_size(struct folio *folio)
  }
  
  /**
- * folio_estimated_sharers - Estimate the number of sharers of a folio.
+ * folio_likely_mapped_shared - Estimate if the folio is mapped into the page
+ *                             tables of more than one MM
   * @folio: The folio.
   *
- * folio_estimated_sharers() aims to serve as a function to efficiently
- * estimate the number of processes sharing a folio. This is done by
- * looking at the precise mapcount of the first subpage in the folio, and
- * assuming the other subpages are the same. This may not be true for large
- * folios. If you want exact mapcounts for exact calculations, look at
- * page_mapcount() or folio_total_mapcount().
+ * This function checks if the folio is currently mapped into more than one
+ * MM ("mapped shared"), or if the folio is only mapped into a single MM
+ * ("mapped exclusively").
+ *
+ * As precise information is not easily available for all folios, this function
+ * estimates the number of MMs ("sharers") that are currently mapping a folio
+ * using the number of times the first page of the folio is currently mapped
+ * into page tables.
+ *
+ * For small anonymous folios (except KSM folios) and anonymous hugetlb folios,
+ * the return value will be exactly correct, because they can only be mapped
+ * at most once into an MM, and they cannot be partially mapped.
+ *
+ * For other folios, the result can be fuzzy:
+ *    #. For partially-mappable large folios (THP), the return value can wrongly
+ *       indicate "mapped exclusively" (false negative) when the folio is
+ *       only partially mapped into at least one MM.
+ *    #. For pagecache folios (including hugetlb), the return value can wrongly
+ *       indicate "mapped shared" (false positive) when two VMAs in the same MM
+ *       cover the same file range.
+ *    #. For (small) KSM folios, the return value can wrongly indicate "mapped
+ *       shared" (false positive), when the folio is mapped multiple times into
+ *       the same MM.
+ *
+ * Further, this function only considers current page table mappings that
+ * are tracked using the folio mapcount(s).
   *
- * Return: The estimated number of processes sharing a folio.
+ * This function does not consider:
+ *    #. If the folio might get mapped in the (near) future (e.g., swapcache,
+ *       pagecache, temporary unmapping for migration).
+ *    #. If the folio is mapped differently (VM_PFNMAP).
+ *    #. If hugetlb page table sharing applies. Callers might want to check
+ *       hugetlb_pmd_shared().
+ *
+ * Return: Whether the folio is estimated to be mapped into more than one MM.
   */
-static inline int folio_estimated_sharers(struct folio *folio)
+static inline bool folio_likely_mapped_shared(struct folio *folio)
  {
-       return page_mapcount(folio_page(folio, 0));
+       int mapcount = folio_mapcount(folio);
+
+       /* Only partially-mappable folios require more care. */
+       if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio)))
+               return mapcount > 1;
+
+       /* A single mapping implies "mapped exclusively". */
+       if (mapcount <= 1)
+               return false;
+
+       /* If any page is mapped more than once we treat it "mapped shared". */
+       if (folio_entire_mapcount(folio) || mapcount > folio_nr_pages(folio))
+               return true;
+
+       /* Let's guess based on the first subpage. */
+       return atomic_read(&folio->_mapcount) > 0;
  }
  
  #ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
@@ -2393,12 +2436,8 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
                 unsigned long end, unsigned long floor, unsigned long ceiling);
  int
  copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
-int follow_pte(struct mm_struct *mm, unsigned long address,
+int follow_pte(struct vm_area_struct *vma, unsigned long address,
                pte_t **ptepp, spinlock_t **ptlp);
-int follow_pfn(struct vm_area_struct *vma, unsigned long address,
-       unsigned long *pfn);
-int follow_phys(struct vm_area_struct *vma, unsigned long address,
-               unsigned int flags, unsigned long *prot, resource_size_t *phys);
  int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
                         void *buf, int len, int write);
  
@@ -2552,7 +2591,7 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma,
                                             MM_CP_UFFD_WP_RESOLVE)
  
  bool vma_needs_dirty_tracking(struct vm_area_struct *vma);
-int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
+bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
  static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
  {
         /*
@@ -2859,12 +2898,13 @@ static inline bool pagetable_is_reserved(struct ptdesc *pt)
   *
   * Return: The ptdesc describing the allocated page tables.
   */
-static inline struct ptdesc *pagetable_alloc(gfp_t gfp, unsigned int order)
+static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order)
  {
-       struct page *page = alloc_pages(gfp | __GFP_COMP, order);
+       struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order);
  
         return page_ptdesc(page);
  }
+#define pagetable_alloc(...)   alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__))
  
  /**
   * pagetable_free - Free pagetables
@@ -3134,6 +3174,14 @@ extern void reserve_bootmem_region(phys_addr_t start,
  /* Free the reserved page into the buddy system, so it gets managed. */
  static inline void free_reserved_page(struct page *page)
  {
+       if (mem_alloc_profiling_enabled()) {
+               union codetag_ref *ref = get_page_tag_ref(page);
+
+               if (ref) {
+                       set_codetag_empty(ref);
+                       put_page_tag_ref(ref);
+               }
+       }
         ClearPageReserved(page);
         init_page_count(page);
         __free_page(page);
@@ -3195,8 +3243,6 @@ static inline unsigned long get_num_physpages(void)
   */
  void free_area_init(unsigned long *max_zone_pfn);
  unsigned long node_map_pfn_alignment(void);
-unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn,
-                                               unsigned long end_pfn);
  extern unsigned long absent_pages_in_range(unsigned long start_pfn,
                                                 unsigned long end_pfn);
  extern void get_pfn_range_for_nid(unsigned int nid,
@@ -3212,7 +3258,6 @@ static inline int early_pfn_to_nid(unsigned long pfn)
  extern int __meminit early_pfn_to_nid(unsigned long pfn);
  #endif
  
-extern void set_dma_reserve(unsigned long new_dma_reserve);
  extern void mem_init(void);
  extern void __init mmap_init(void);
  
@@ -3224,9 +3269,6 @@ static inline void show_mem(void)
  extern long si_mem_available(void);
  extern void si_meminfo(struct sysinfo * val);
  extern void si_meminfo_node(struct sysinfo *val, int nid);
-#ifdef __HAVE_ARCH_RESERVED_KERNEL_PAGES
-extern unsigned long arch_reserved_kernel_pages(void);
-#endif
  
  extern __printf(3, 4)
  void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...);
@@ -3385,7 +3427,16 @@ extern int install_special_mapping(struct mm_struct *mm,
  unsigned long randomize_stack_top(unsigned long stack_top);
  unsigned long randomize_page(unsigned long start, unsigned long range);
  
-extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+unsigned long
+__get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
+                   unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags);
+
+static inline unsigned long
+get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
+                 unsigned long pgoff, unsigned long flags)
+{
+       return __get_unmapped_area(file, addr, len, pgoff, flags, 0);
+}
  
  extern unsigned long mmap_region(struct file *file, unsigned long addr,
         unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
@@ -3431,6 +3482,7 @@ struct vm_unmapped_area_info {
         unsigned long high_limit;
         unsigned long align_mask;
         unsigned long align_offset;
+       unsigned long start_gap;
  };
  
  extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info);
@@ -3724,7 +3776,14 @@ DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
  static inline bool want_init_on_free(void)
  {
         return static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON,
-                                  &init_on_free);
+                               &init_on_free);
+}
+
+DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON, init_mlocked_on_free);
+static inline bool want_init_mlocked_on_free(void)
+{
+       return static_branch_maybe(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON,
+                               &init_mlocked_on_free);
  }
  
  extern bool _debug_pagealloc_enabled_early;
@@ -3787,24 +3846,22 @@ static inline bool page_is_guard(struct page *page)
         return PageGuard(page);
  }
  
-bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order,
-                     int migratetype);
+bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order);
  static inline bool set_page_guard(struct zone *zone, struct page *page,
-                                 unsigned int order, int migratetype)
+                                 unsigned int order)
  {
         if (!debug_guardpage_enabled())
                 return false;
-       return __set_page_guard(zone, page, order, migratetype);
+       return __set_page_guard(zone, page, order);
  }
  
-void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order,
-                       int migratetype);
+void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order);
  static inline void clear_page_guard(struct zone *zone, struct page *page,
-                                   unsigned int order, int migratetype)
+                                   unsigned int order)
  {
         if (!debug_guardpage_enabled())
                 return;
-       __clear_page_guard(zone, page, order, migratetype);
+       __clear_page_guard(zone, page, order);
  }
  
  #else  /* CONFIG_DEBUG_PAGEALLOC */
@@ -3814,9 +3871,9 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; }
  static inline bool debug_guardpage_enabled(void) { return false; }
  static inline bool page_is_guard(struct page *page) { return false; }
  static inline bool set_page_guard(struct zone *zone, struct page *page,
-                       unsigned int order, int migratetype) { return false; }
+                       unsigned int order) { return false; }
  static inline void clear_page_guard(struct zone *zone, struct page *page,
-                               unsigned int order, int migratetype) {}
+                               unsigned int order) {}
  #endif /* CONFIG_DEBUG_PAGEALLOC */
  
  #ifdef __HAVE_ARCH_GATE_AREA
@@ -3971,7 +4028,6 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
  extern int memory_failure(unsigned long pfn, int flags);
  extern void memory_failure_queue_kick(int cpu);
  extern int unpoison_memory(unsigned long pfn);
-extern void shake_page(struct page *p);
  extern atomic_long_t num_poisoned_pages __read_mostly;
  extern int soft_offline_page(unsigned long pfn, int flags);
  #ifdef CONFIG_MEMORY_FAILURE
@@ -4204,4 +4260,7 @@ static inline bool pfn_is_unaccepted_memory(unsigned long pfn)
         return range_contains_unaccepted_memory(paddr, paddr + PAGE_SIZE);
  }
  
+void vma_pgtable_walk_begin(struct vm_area_struct *vma);
+void vma_pgtable_walk_end(struct vm_area_struct *vma);
+
  #endif /* _LINUX_MM_H */