Merge tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel...
mm/hugetlb.c (linux-block.git)
index 245038a9fe4eaa27e51ad83ba0abf4cb2c403deb..f154019e6b840c41dfe7220a218c7da6f22c6267 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2050,19 +2050,23 @@ int PageHuge(struct page *page)
 }
 EXPORT_SYMBOL_GPL(PageHuge);
 
-/*
- * PageHeadHuge() only returns true for hugetlbfs head page, but not for
- * normal or transparent huge pages.
+/**
+ * folio_test_hugetlb - Determine if the folio belongs to hugetlbfs
+ * @folio: The folio to test.
+ *
+ * Context: Any context.  Caller should have a reference on the folio to
+ * prevent it from being turned into a tail page.
+ * Return: True for hugetlbfs folios, false for anon folios or folios
+ * belonging to other filesystems.
  */
-int PageHeadHuge(struct page *page_head)
+bool folio_test_hugetlb(struct folio *folio)
 {
-       struct folio *folio = (struct folio *)page_head;
        if (!folio_test_large(folio))
-               return 0;
+               return false;
 
        return folio->_folio_dtor == HUGETLB_PAGE_DTOR;
 }
-EXPORT_SYMBOL_GPL(PageHeadHuge);
+EXPORT_SYMBOL_GPL(folio_test_hugetlb);
 
 /*
  * Find and lock address space (mapping) in write mode.
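
[Note: this hunk retires PageHeadHuge() in favour of folio_test_hugetlb(): callers pass a folio directly instead of casting a head page, and get a bool with a documented locking context. A minimal caller-side sketch; the helper name is hypothetical, and it assumes the caller already holds a folio reference, as the kernel-doc above requires.]

    #include <linux/mm.h>

    /* Hypothetical caller; previously this would be PageHeadHuge(&folio->page). */
    static bool is_hugetlbfs_backed(struct folio *folio)
    {
            /* Safe in any context while our folio reference is held. */
            return folio_test_hugetlb(folio);
    }
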
@@ -2090,7 +2094,7 @@ pgoff_t hugetlb_basepage_index(struct page *page)
        pgoff_t index = page_index(page_head);
        unsigned long compound_idx;
 
-       if (compound_order(page_head) >= MAX_ORDER)
+       if (compound_order(page_head) > MAX_ORDER)
                compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
        else
                compound_idx = page - page_head;
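
[Note: the >= to > flips in this file track the treewide redefinition of MAX_ORDER from exclusive to inclusive: order MAX_ORDER is now the largest allocation the buddy allocator can satisfy, so only orders strictly above it are "gigantic" and need bootmem/CMA treatment. hstate_is_gigantic() in include/linux/hugetlb.h encodes the same test. A worked example, assuming 4 KiB base pages and MAX_ORDER == 10:]

    /* order 9  (2 MiB PMD hugepage):  9 <= MAX_ORDER -> buddy-allocatable */
    /* order 18 (1 GiB PUD hugepage): 18 >  MAX_ORDER -> gigantic          */
    static inline bool order_is_gigantic(unsigned int order)
    {
            return order > MAX_ORDER;       /* mirrors hstate_is_gigantic() */
    }
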
@@ -4202,6 +4206,12 @@ static void __init hugetlb_sysfs_init(void)
        hugetlb_register_all_nodes();
 }
 
+#ifdef CONFIG_SYSCTL
+static void hugetlb_sysctl_init(void);
+#else
+static inline void hugetlb_sysctl_init(void) { }
+#endif
+
 static int __init hugetlb_init(void)
 {
        int i;
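
[Note: the forward declaration paired with an empty !CONFIG_SYSCTL stub is the standard kernel idiom for keeping the caller, hugetlb_init() here, free of #ifdefs. The generic shape of the pattern, with placeholder names:]

    #ifdef CONFIG_FOO
    static void foo_init(void);             /* real definition later in the file */
    #else
    static inline void foo_init(void) { }   /* compiles away entirely */
    #endif
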
@@ -4257,6 +4267,7 @@ static int __init hugetlb_init(void)
 
        hugetlb_sysfs_init();
        hugetlb_cgroup_file_init();
+       hugetlb_sysctl_init();
 
 #ifdef CONFIG_SMP
        num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
@@ -4497,7 +4508,7 @@ static int __init default_hugepagesz_setup(char *s)
         * The number of default huge pages (for this size) could have been
         * specified as the first hugetlb parameter: hugepages=X.  If so,
         * then default_hstate_max_huge_pages is set.  If the default huge
-        * page size is gigantic (>= MAX_ORDER), then the pages must be
+        * page size is gigantic (> MAX_ORDER), then the pages must be
         * allocated here from bootmem allocator.
         */
        if (default_hstate_max_huge_pages) {
@@ -4588,7 +4599,7 @@ out:
        return ret;
 }
 
-int hugetlb_sysctl_handler(struct ctl_table *table, int write,
+static int hugetlb_sysctl_handler(struct ctl_table *table, int write,
                          void *buffer, size_t *length, loff_t *ppos)
 {
 
@@ -4597,7 +4608,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
 }
 
 #ifdef CONFIG_NUMA
-int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
+static int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
                          void *buffer, size_t *length, loff_t *ppos)
 {
        return hugetlb_sysctl_handler_common(true, table, write,
@@ -4605,7 +4616,7 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
 }
 #endif /* CONFIG_NUMA */
 
-int hugetlb_overcommit_handler(struct ctl_table *table, int write,
+static int hugetlb_overcommit_handler(struct ctl_table *table, int write,
                void *buffer, size_t *length, loff_t *ppos)
 {
        struct hstate *h = &default_hstate;
@@ -4634,6 +4645,44 @@ out:
        return ret;
 }
 
+static struct ctl_table hugetlb_table[] = {
+       {
+               .procname       = "nr_hugepages",
+               .data           = NULL,
+               .maxlen         = sizeof(unsigned long),
+               .mode           = 0644,
+               .proc_handler   = hugetlb_sysctl_handler,
+       },
+#ifdef CONFIG_NUMA
+       {
+               .procname       = "nr_hugepages_mempolicy",
+               .data           = NULL,
+               .maxlen         = sizeof(unsigned long),
+               .mode           = 0644,
+               .proc_handler   = &hugetlb_mempolicy_sysctl_handler,
+       },
+#endif
+       {
+               .procname       = "hugetlb_shm_group",
+               .data           = &sysctl_hugetlb_shm_group,
+               .maxlen         = sizeof(gid_t),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+       {
+               .procname       = "nr_overcommit_hugepages",
+               .data           = NULL,
+               .maxlen         = sizeof(unsigned long),
+               .mode           = 0644,
+               .proc_handler   = hugetlb_overcommit_handler,
+       },
+       { }
+};
+
+static void hugetlb_sysctl_init(void)
+{
+       register_sysctl_init("vm", hugetlb_table);
+}
 #endif /* CONFIG_SYSCTL */
 
 void hugetlb_report_meminfo(struct seq_file *m)
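
[Note: this hunk moves the hugetlb sysctls out of the shared kernel/sysctl.c table: the handlers become static, the ctl_table lives next to them, and hugetlb_sysctl_init() registers it under "vm" so the knobs keep their /proc/sys/vm/ paths. A sketch of the same pattern for a hypothetical knob; all demo_* names are placeholders:]

    #include <linux/sysctl.h>

    static int demo_value;

    static struct ctl_table demo_table[] = {
            {
                    .procname       = "demo_value",
                    .data           = &demo_value,
                    .maxlen         = sizeof(int),
                    .mode           = 0644,
                    .proc_handler   = proc_dointvec,
            },
            { }     /* sentinel, as in hugetlb_table above */
    };

    static void demo_sysctl_init(void)
    {
            register_sysctl_init("vm", demo_table);  /* /proc/sys/vm/demo_value */
    }
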
@@ -4949,11 +4998,15 @@ static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
 
 static void
 hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
-                    struct folio *new_folio)
+                     struct folio *new_folio, pte_t old)
 {
+       pte_t newpte = make_huge_pte(vma, &new_folio->page, 1);
+
        __folio_mark_uptodate(new_folio);
        hugepage_add_new_anon_rmap(new_folio, vma, addr);
-       set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, &new_folio->page, 1));
+       if (userfaultfd_wp(vma) && huge_pte_uffd_wp(old))
+               newpte = huge_pte_mkuffd_wp(newpte);
+       set_huge_pte_at(vma->vm_mm, addr, ptep, newpte);
        hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
        folio_set_hugetlb_migratable(new_folio);
 }
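
[Note: hugetlb_install_folio() now receives the source pte so the fork path can propagate the userfaultfd write-protect marker to the child's freshly installed pte; without this, the child of a uffd-wp-tracked parent would silently lose write-protection. The preserved logic, as a hypothetical standalone helper mirroring the hunk:]

    /* Sketch: carry the uffd-wp software bit over from the old pte, if set. */
    static pte_t huge_pte_preserve_uffd_wp(struct vm_area_struct *vma,
                                           pte_t newpte, pte_t old)
    {
            if (userfaultfd_wp(vma) && huge_pte_uffd_wp(old))
                    newpte = huge_pte_mkuffd_wp(newpte);
            return newpte;
    }
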
@@ -5028,14 +5081,12 @@ again:
                         */
                        ;
                } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
-                       bool uffd_wp = huge_pte_uffd_wp(entry);
-
-                       if (!userfaultfd_wp(dst_vma) && uffd_wp)
+                       if (!userfaultfd_wp(dst_vma))
                                entry = huge_pte_clear_uffd_wp(entry);
                        set_huge_pte_at(dst, addr, dst_pte, entry);
                } else if (unlikely(is_hugetlb_entry_migration(entry))) {
                        swp_entry_t swp_entry = pte_to_swp_entry(entry);
-                       bool uffd_wp = huge_pte_uffd_wp(entry);
+                       bool uffd_wp = pte_swp_uffd_wp(entry);
 
                        if (!is_readable_migration_entry(swp_entry) && cow) {
                                /*
@@ -5046,10 +5097,10 @@ again:
                                                        swp_offset(swp_entry));
                                entry = swp_entry_to_pte(swp_entry);
                                if (userfaultfd_wp(src_vma) && uffd_wp)
-                                       entry = huge_pte_mkuffd_wp(entry);
+                                       entry = pte_swp_mkuffd_wp(entry);
                                set_huge_pte_at(src, addr, src_pte, entry);
                        }
-                       if (!userfaultfd_wp(dst_vma) && uffd_wp)
+                       if (!userfaultfd_wp(dst_vma))
                                entry = huge_pte_clear_uffd_wp(entry);
                        set_huge_pte_at(dst, addr, dst_pte, entry);
                } else if (unlikely(is_pte_marker(entry))) {
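
[Note: two fixes land in this region. First, the uffd-wp marker is now cleared unconditionally whenever the destination VMA has userfaultfd-wp disabled (a later hunk applies the same rule to the present-pte path). Second, migration entries are interrogated with the pte_swp_* accessors, because a pte converted to a swap entry no longer carries the marker in the present-pte bit position. A hypothetical helper makes the distinction explicit:]

    /* Sketch: read the uffd-wp marker from whichever encoding the pte uses. */
    static bool entry_uffd_wp(pte_t entry)
    {
            if (pte_present(entry))
                    return huge_pte_uffd_wp(entry);  /* software pte bit */
            return pte_swp_uffd_wp(entry);           /* swap-entry encoding */
    }
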
@@ -5093,9 +5144,14 @@ again:
                                        ret = PTR_ERR(new_folio);
                                        break;
                                }
-                               copy_user_huge_page(&new_folio->page, ptepage, addr, dst_vma,
-                                                   npages);
+                               ret = copy_user_large_folio(new_folio,
+                                                     page_folio(ptepage),
+                                                     addr, dst_vma);
                                put_page(ptepage);
+                               if (ret) {
+                                       folio_put(new_folio);
+                                       break;
+                               }
 
                                /* Install the new hugetlb folio if src pte stable */
                                dst_ptl = huge_pte_lock(h, dst, dst_pte);
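
[Note: copy_user_huge_page() could not fail; its replacement copy_user_large_folio() can, for example returning -EHWPOISON when the source page is poisoned and machine-check recovery is in play, so every caller grows an error leg. The pattern introduced here, shown as a fragment with illustrative variable names: drop the just-allocated destination folio and propagate the error.]

    ret = copy_user_large_folio(new_folio, src_folio, addr, dst_vma);
    if (ret) {                      /* e.g. -EHWPOISON: poisoned source */
            folio_put(new_folio);   /* release the unused destination */
            return ret;
    }
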
@@ -5109,7 +5165,8 @@ again:
                                        /* huge_ptep of dst_pte won't change as in child */
                                        goto again;
                                }
-                               hugetlb_install_folio(dst_vma, dst_pte, addr, new_folio);
+                               hugetlb_install_folio(dst_vma, dst_pte, addr,
+                                                     new_folio, src_pte_old);
                                spin_unlock(src_ptl);
                                spin_unlock(dst_ptl);
                                continue;
@@ -5127,6 +5184,9 @@ again:
                                entry = huge_pte_wrprotect(entry);
                        }
 
+                       if (!userfaultfd_wp(dst_vma))
+                               entry = huge_pte_clear_uffd_wp(entry);
+
                        set_huge_pte_at(dst, addr, dst_pte, entry);
                        hugetlb_count_add(npages, dst);
                }
@@ -5612,8 +5672,10 @@ retry_avoidcopy:
                goto out_release_all;
        }
 
-       copy_user_huge_page(&new_folio->page, old_page, address, vma,
-                           pages_per_huge_page(h));
+       if (copy_user_large_folio(new_folio, page_folio(old_page), address, vma)) {
+               ret = VM_FAULT_HWPOISON_LARGE;
+               goto out_release_all;
+       }
        __folio_mark_uptodate(new_folio);
 
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr,
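
[Note: in the CoW/unshare path the same copy failure is translated into a fault-handler result: VM_FAULT_HWPOISON_LARGE lets the fault core report a hugepage-sized poison region to the task. Sketch of the mapping, as a hypothetical wrapper:]

    static vm_fault_t hugetlb_cow_copy(struct folio *dst, struct folio *src,
                                       unsigned long addr,
                                       struct vm_area_struct *vma)
    {
            if (copy_user_large_folio(dst, src, addr, vma))
                    return VM_FAULT_HWPOISON_LARGE;  /* poisoned source */
            return 0;
    }
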
@@ -5627,13 +5689,16 @@ retry_avoidcopy:
        spin_lock(ptl);
        ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
        if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
+               pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare);
+
                /* Break COW or unshare */
                huge_ptep_clear_flush(vma, haddr, ptep);
                mmu_notifier_invalidate_range(mm, range.start, range.end);
                page_remove_rmap(old_page, vma, true);
                hugepage_add_new_anon_rmap(new_folio, vma, haddr);
-               set_huge_pte_at(mm, haddr, ptep,
-                               make_huge_pte(vma, &new_folio->page, !unshare));
+               if (huge_pte_uffd_wp(pte))
+                       newpte = huge_pte_mkuffd_wp(newpte);
+               set_huge_pte_at(mm, haddr, ptep, newpte);
                folio_set_hugetlb_migratable(new_folio);
                /* Make the old page be freed below */
                new_folio = page_folio(old_page);
@@ -5790,7 +5855,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
         */
        new_folio = false;
        folio = filemap_lock_folio(mapping, idx);
-       if (!folio) {
+       if (IS_ERR(folio)) {
                size = i_size_read(mapping->host) >> huge_page_shift(h);
                if (idx >= size)
                        goto out;
@@ -6081,6 +6146,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                vma_end_reservation(h, vma, haddr);
 
                pagecache_folio = filemap_lock_folio(mapping, idx);
+               if (IS_ERR(pagecache_folio))
+                       pagecache_folio = NULL;
        }
 
        ptl = huge_pte_lock(h, mm, ptep);
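
[Note: these two hunks adapt to filemap_lock_folio() returning ERR_PTR(-ENOENT) instead of NULL when the index is absent from the page cache; hugetlb_fault() keeps its internal NULL convention by translating the error back. The new calling convention, as a hypothetical lookup helper:]

    /* Sketch: filemap_lock_folio() now reports absence via ERR_PTR(). */
    static struct folio *lookup_locked_folio(struct address_space *mapping,
                                             pgoff_t index)
    {
            struct folio *folio = filemap_lock_folio(mapping, index);

            if (IS_ERR(folio))      /* was: if (!folio) */
                    return NULL;    /* PTR_ERR(folio) == -ENOENT */
            return folio;
    }
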
@@ -6164,19 +6231,19 @@ out_mutex:
 
 #ifdef CONFIG_USERFAULTFD
 /*
- * Used by userfaultfd UFFDIO_COPY.  Based on mcopy_atomic_pte with
- * modifications for huge pages.
+ * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte
+ * with modifications for hugetlb pages.
  */
-int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
-                           pte_t *dst_pte,
-                           struct vm_area_struct *dst_vma,
-                           unsigned long dst_addr,
-                           unsigned long src_addr,
-                           enum mcopy_atomic_mode mode,
-                           struct page **pagep,
-                           bool wp_copy)
-{
-       bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
+int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
+                            struct vm_area_struct *dst_vma,
+                            unsigned long dst_addr,
+                            unsigned long src_addr,
+                            uffd_flags_t flags,
+                            struct folio **foliop)
+{
+       struct mm_struct *dst_mm = dst_vma->vm_mm;
+       bool is_continue = uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE);
+       bool wp_enabled = (flags & MFILL_ATOMIC_WP);
        struct hstate *h = hstate_vma(dst_vma);
        struct address_space *mapping = dst_vma->vm_file->f_mapping;
        pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
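
[Note: the mcopy-era prototype gives way to the mfill API: the mode enum and the separate wp_copy bool collapse into a single uffd_flags_t, and dst_mm is derived from dst_vma rather than passed separately. How a caller composes those flags, with names from include/linux/userfaultfd_k.h and illustrative variables:]

    uffd_flags_t flags;

    /* UFFDIO_COPY semantics with write-protect requested. */
    flags = uffd_flags_set_mode(0, MFILL_ATOMIC_COPY);
    flags |= MFILL_ATOMIC_WP;       /* decoded above as wp_enabled */
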
@@ -6192,11 +6259,11 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
        if (is_continue) {
                ret = -EFAULT;
                folio = filemap_lock_folio(mapping, idx);
-               if (!folio)
+               if (IS_ERR(folio))
                        goto out;
                folio_in_pagecache = true;
-       } else if (!*pagep) {
-               /* If a page already exists, then it's UFFDIO_COPY for
+       } else if (!*foliop) {
+               /* If a folio already exists, then it's UFFDIO_COPY for
                 * a non-missing case. Return -EEXIST.
                 */
                if (vm_shared &&
@@ -6211,9 +6278,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                        goto out;
                }
 
-               ret = copy_huge_page_from_user(&folio->page,
-                                               (const void __user *) src_addr,
-                                               pages_per_huge_page(h), false);
+               ret = copy_folio_from_user(folio, (const void __user *) src_addr,
+                                          false);
 
                /* fallback to copy_from_user outside mmap_lock */
                if (unlikely(ret)) {
@@ -6232,33 +6298,36 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                                ret = -ENOMEM;
                                goto out;
                        }
-                       *pagep = &folio->page;
-                       /* Set the outparam pagep and return to the caller to
+                       *foliop = folio;
+                       /* Set the outparam foliop and return to the caller to
                         * copy the contents outside the lock. Don't free the
-                        * page.
+                        * folio.
                         */
                        goto out;
                }
        } else {
                if (vm_shared &&
                    hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
-                       put_page(*pagep);
+                       folio_put(*foliop);
                        ret = -EEXIST;
-                       *pagep = NULL;
+                       *foliop = NULL;
                        goto out;
                }
 
                folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0);
                if (IS_ERR(folio)) {
-                       put_page(*pagep);
+                       folio_put(*foliop);
                        ret = -ENOMEM;
-                       *pagep = NULL;
+                       *foliop = NULL;
+                       goto out;
+               }
+               ret = copy_user_large_folio(folio, *foliop, dst_addr, dst_vma);
+               folio_put(*foliop);
+               *foliop = NULL;
+               if (ret) {
+                       folio_put(folio);
                        goto out;
                }
-               copy_user_huge_page(&folio->page, *pagep, dst_addr, dst_vma,
-                                   pages_per_huge_page(h));
-               put_page(*pagep);
-               *pagep = NULL;
        }
 
        /*
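
[Note: the *pagep out-parameter becomes *foliop, keeping the same two-phase protocol: if the user copy would fault while mmap_lock is held, the function hands a temporary folio back through *foliop and returns -ENOENT, and the caller copies with faults allowed before retrying. A loose sketch of the caller side, cf. mfill_atomic_hugetlb() in mm/userfaultfd.c, with error paths and VMA revalidation trimmed:]

    struct folio *folio = NULL;
    int err;

    err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr, src_addr,
                                   flags, &folio);
    if (err == -ENOENT) {
            mmap_read_unlock(dst_mm);
            /* Page faults are permitted during this copy. */
            err = copy_folio_from_user(folio,
                                       (const void __user *)src_addr, true);
            mmap_read_lock(dst_mm);
            /* ... revalidate the VMA, then retry with *foliop pre-filled. */
    }
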
@@ -6311,7 +6380,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
         * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
         * with wp flag set, don't set pte write bit.
         */
-       if (wp_copy || (is_continue && !vm_shared))
+       if (wp_enabled || (is_continue && !vm_shared))
                writable = 0;
        else
                writable = dst_vma->vm_flags & VM_WRITE;
@@ -6326,7 +6395,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
        _dst_pte = huge_pte_mkdirty(_dst_pte);
        _dst_pte = pte_mkyoung(_dst_pte);
 
-       if (wp_copy)
+       if (wp_enabled)
                _dst_pte = huge_pte_mkuffd_wp(_dst_pte);
 
        set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
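
[Note: wp_copy becomes wp_enabled, decoded from MFILL_ATOMIC_WP above; the effect is unchanged: a write-protected fill is mapped read-only with the uffd-wp bit set, so the next write traps to userspace. From the userspace side, this path is reached via UFFDIO_COPY's WP mode bit; the fd, addresses, and buffer below are illustrative:]

    #include <linux/userfaultfd.h>
    #include <sys/ioctl.h>

    struct uffdio_copy copy = {
            .dst  = dst_addr,                /* hugepage-aligned destination */
            .src  = (unsigned long)src_buf,
            .len  = huge_page_size,
            .mode = UFFDIO_COPY_MODE_WP,     /* map it write-protected */
    };
    if (ioctl(uffd, UFFDIO_COPY, &copy))
            perror("UFFDIO_COPY");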