mm: fix race between __split_huge_pmd_locked() and GUP-fast

author Ryan Roberts <ryan.roberts@arm.com>
Wed, 1 May 2024 14:33:10 +0000 (15:33 +0100)
committer Andrew Morton <akpm@linux-foundation.org>
Tue, 7 May 2024 17:37:00 +0000 (10:37 -0700)

diff --git a/Documentation/mm/arch_pgtable_helpers.rst b/Documentation/mm/arch_pgtable_helpers.rst

index 2466d3363af79ff3fa7af2d4c06ba465b6d5e887..ad50ca6f495ebe251f5f79e33ea97166cd547a00 100644 (file)
--- a/Documentation/mm/arch_pgtable_helpers.rst
+++ b/Documentation/mm/arch_pgtable_helpers.rst
@@ -140,7 +140,8 @@ PMD Page Table Helpers
  +---------------------------+--------------------------------------------------+
  | pmd_swp_clear_soft_dirty  | Clears a soft dirty swapped PMD                  |
  +---------------------------+--------------------------------------------------+
-| pmd_mkinvalid             | Invalidates a mapped PMD [1]                     |
+| pmd_mkinvalid             | Invalidates a present PMD; do not call for       |
+|                           | non-present PMD [1]                              |
  +---------------------------+--------------------------------------------------+
  | pmd_set_huge              | Creates a PMD huge mapping                       |
  +---------------------------+--------------------------------------------------+
@@ -196,7 +197,8 @@ PUD Page Table Helpers
  +---------------------------+--------------------------------------------------+
  | pud_mkdevmap              | Creates a ZONE_DEVICE mapped PUD                 |
  +---------------------------+--------------------------------------------------+
-| pud_mkinvalid             | Invalidates a mapped PUD [1]                     |
+| pud_mkinvalid             | Invalidates a present PUD; do not call for       |
+|                           | non-present PUD [1]                              |
  +---------------------------+--------------------------------------------------+
  | pud_set_huge              | Creates a PUD huge mapping                       |
  +---------------------------+--------------------------------------------------+
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c

index 83823db3488b98deac4fda1b324dddbb10ec5947..2975ea0841ba4dbe4bd176b399abce24200d0d77 100644 (file)
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -170,6 +170,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
  {
         unsigned long old_pmd;
  
+       VM_WARN_ON_ONCE(!pmd_present(*pmdp));
         old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID);
         flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
         return __pmd(old_pmd);
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h

index 2cb2a2e7b34b832032f03df944802b1dea0d8fb1..558902edbfecefa89c6cf776c360af6827d927f7 100644 (file)
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1769,8 +1769,10 @@ static inline pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
  static inline pmd_t pmdp_invalidate(struct vm_area_struct *vma,
                                    unsigned long addr, pmd_t *pmdp)
  {
-       pmd_t pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID);
+       pmd_t pmd;
  
+       VM_WARN_ON_ONCE(!pmd_present(*pmdp));
+       pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID);
         return pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd);
  }
  
diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c

index 19642f7ffb52a5f02421a946e8b19a0cbcd3bba1..8648a50afe8899a32f9da447fe7c3f57b99cd7da 100644 (file)
--- a/arch/sparc/mm/tlb.c
+++ b/arch/sparc/mm/tlb.c
@@ -249,6 +249,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
  {
         pmd_t old, entry;
  
+       VM_WARN_ON_ONCE(!pmd_present(*pmdp));
         entry = __pmd(pmd_val(*pmdp) & ~_PAGE_VALID);
         old = pmdp_establish(vma, address, pmdp, entry);
         flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c

index 94767c82fc0d7226743a62b02b3856a1216f6935..93e54ba91fbfdbfaf09ffa42a2d8c855187576d7 100644 (file)
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -631,6 +631,8 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
  pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
                          pmd_t *pmdp)
  {
+       VM_WARN_ON_ONCE(!pmd_present(*pmdp));
+
         /*
          * No flush is necessary. Once an invalid PTE is established, the PTE's
          * access and dirty bits cannot be updated.
diff --git a/mm/huge_memory.c b/mm/huge_memory.c

index 08e4f3343bcd0e3617ce39350640b43f865a9e89..ccdcff73284a028bb287a78561c00f7239ea3c94 100644 (file)
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2430,32 +2430,11 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                 return __split_huge_zero_page_pmd(vma, haddr, pmd);
         }
  
-       /*
-        * Up to this point the pmd is present and huge and userland has the
-        * whole access to the hugepage during the split (which happens in
-        * place). If we overwrite the pmd with the not-huge version pointing
-        * to the pte here (which of course we could if all CPUs were bug
-        * free), userland could trigger a small page size TLB miss on the
-        * small sized TLB while the hugepage TLB entry is still established in
-        * the huge TLB. Some CPU doesn't like that.
-        * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
-        * 383 on page 105. Intel should be safe but is also warns that it's
-        * only safe if the permission and cache attributes of the two entries
-        * loaded in the two TLB is identical (which should be the case here).
-        * But it is generally safer to never allow small and huge TLB entries
-        * for the same virtual address to be loaded simultaneously. So instead
-        * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
-        * current pmd notpresent (atomically because here the pmd_trans_huge
-        * must remain set at all times on the pmd until the split is complete
-        * for this pmd), then we flush the SMP TLB and finally we write the
-        * non-huge version of the pmd entry with pmd_populate.
-        */
-       old_pmd = pmdp_invalidate(vma, haddr, pmd);
-
-       pmd_migration = is_pmd_migration_entry(old_pmd);
+       pmd_migration = is_pmd_migration_entry(*pmd);
         if (unlikely(pmd_migration)) {
                 swp_entry_t entry;
  
+               old_pmd = *pmd;
                 entry = pmd_to_swp_entry(old_pmd);
                 page = pfn_swap_entry_to_page(entry);
                 write = is_writable_migration_entry(entry);
@@ -2466,6 +2445,30 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                 soft_dirty = pmd_swp_soft_dirty(old_pmd);
                 uffd_wp = pmd_swp_uffd_wp(old_pmd);
         } else {
+               /*
+                * Up to this point the pmd is present and huge and userland has
+                * the whole access to the hugepage during the split (which
+                * happens in place). If we overwrite the pmd with the not-huge
+                * version pointing to the pte here (which of course we could if
+                * all CPUs were bug free), userland could trigger a small page
+                * size TLB miss on the small sized TLB while the hugepage TLB
+                * entry is still established in the huge TLB. Some CPUs don't
+                * like that. See
+                * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
+                * 383 on page 105. Intel should be safe but also warns that
+                * it's only safe if the permission and cache attributes of the
+                * two entries loaded in the two TLBs match (which should
+                * be the case here). But it is generally safer to never allow
+                * small and huge TLB entries for the same virtual address to be
+                * loaded simultaneously. So instead of doing "pmd_populate();
+                * flush_pmd_tlb_range();" we first mark the current pmd
+                * notpresent (atomically because here the pmd_trans_huge must
+                * remain set at all times on the pmd until the split is
+                * complete for this pmd), then we flush the SMP TLB and finally
+                * we write the non-huge version of the pmd entry with
+                * pmd_populate.
+                */
+               old_pmd = pmdp_invalidate(vma, haddr, pmd);
                 page = pmd_page(old_pmd);
                 folio = page_folio(page);
                 if (pmd_dirty(old_pmd)) {
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c

index 4fcd959dcc4d02367d06fd06079d5c4ee0fe4f43..a78a4adf711ac2e4a842174dee50c2efbdde57bf 100644 (file)
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -198,6 +198,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
  pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                      pmd_t *pmdp)
  {
+       VM_WARN_ON_ONCE(!pmd_present(*pmdp));
         pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
         flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
         return old;
@@ -208,6 +209,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
  pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
                          pmd_t *pmdp)
  {
+       VM_WARN_ON_ONCE(!pmd_present(*pmdp));
         return pmdp_invalidate(vma, address, pmdp);
  }
  #endif
author	Ryan Roberts <ryan.roberts@arm.com>
	Wed, 1 May 2024 14:33:10 +0000 (15:33 +0100)
committer	Andrew Morton <akpm@linux-foundation.org>
	Tue, 7 May 2024 17:37:00 +0000 (10:37 -0700)
Files changed:
- Documentation/mm/arch_pgtable_helpers.rst
- arch/powerpc/mm/book3s64/pgtable.c
- arch/s390/include/asm/pgtable.h
- arch/sparc/mm/tlb.c
- arch/x86/mm/pgtable.c
- mm/huge_memory.c
- mm/pgtable-generic.c