mm: new follow_pfnmap API
authorPeter Xu <peterx@redhat.com>
Mon, 26 Aug 2024 20:43:43 +0000 (16:43 -0400)
committerAndrew Morton <akpm@linux-foundation.org>
Tue, 17 Sep 2024 08:06:59 +0000 (01:06 -0700)
Introduce a pair of APIs to follow pfn mappings to get entry information.
It is very similar to what follow_pte() does, but differs in that it
also recognizes huge pfn mappings.
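
A rough sketch of the intended calling pattern (names such as "vma",
"addr" and "pfn" below are only illustrative; the caller is assumed to
already hold the mmap lock for read):

	struct follow_pfnmap_args args = { .vma = vma, .address = addr };

	if (follow_pfnmap_start(&args))
		return -EINVAL;
	/* args.pfn, args.pgprot, args.writable, args.special are valid here */
	pfn = args.pfn;
	follow_pfnmap_end(&args);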

Link: https://lkml.kernel.org/r/20240826204353.2228736-10-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Gavin Shan <gshan@redhat.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Niklas Schnelle <schnelle@linux.ibm.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Sean Christopherson <seanjc@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
include/linux/mm.h
mm/memory.c

index c2307a2d275d730fb664ede8d8a9109cc94060ad..62bd8941489762e7965bc0d8ef745e76519e202d 100644 (file)
@@ -2373,6 +2373,37 @@ int follow_pte(struct vm_area_struct *vma, unsigned long address,
 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
                        void *buf, int len, int write);
 
+struct follow_pfnmap_args {
+       /**
+        * Inputs:
+        * @vma: Pointer to the vm_area_struct to walk
+        * @address: the virtual address to walk
+        */
+       struct vm_area_struct *vma;
+       unsigned long address;
+       /**
+        * Internals:
+        *
+        * The caller shouldn't touch any of these.
+        */
+       spinlock_t *lock;
+       pte_t *ptep;
+       /**
+        * Outputs:
+        *
+        * @pfn: the PFN of the address
+        * @pgprot: the pgprot_t of the mapping
+        * @writable: whether the mapping is writable
+        * @special: whether the mapping is a special mapping (real PFN maps)
+        */
+       unsigned long pfn;
+       pgprot_t pgprot;
+       bool writable;
+       bool special;
+};
+int follow_pfnmap_start(struct follow_pfnmap_args *args);
+void follow_pfnmap_end(struct follow_pfnmap_args *args);
+
 extern void truncate_pagecache(struct inode *inode, loff_t new);
 extern void truncate_setsize(struct inode *inode, loff_t newsize);
 void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
index 9a540e2316255fec95b397b75a7bc5e87082bb73..3878bf69bc14cf59a43b967049328339899d1541 100644 (file)
@@ -6172,6 +6172,156 @@ out:
 }
 EXPORT_SYMBOL_GPL(follow_pte);
 
+static inline void pfnmap_args_setup(struct follow_pfnmap_args *args,
+                                    spinlock_t *lock, pte_t *ptep,
+                                    pgprot_t pgprot, unsigned long pfn_base,
+                                    unsigned long addr_mask, bool writable,
+                                    bool special)
+{
+       args->lock = lock;
+       args->ptep = ptep;
+       args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT);
+       args->pgprot = pgprot;
+       args->writable = writable;
+       args->special = special;
+}
+
+static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma)
+{
+#ifdef CONFIG_LOCKDEP
+       struct address_space *mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
+
+       if (mapping)
+               lockdep_assert(lockdep_is_held(&mapping->i_mmap_rwsem) ||
+                              lockdep_is_held(&vma->vm_mm->mmap_lock));
+       else
+               lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock));
+#endif
+}
+
+/**
+ * follow_pfnmap_start() - Look up a pfn mapping at a user virtual address
+ * @args: Pointer to struct @follow_pfnmap_args
+ *
+ * The caller needs to set up args->vma and args->address to point to the
+ * virtual address to be looked up.  On a successful return, the results
+ * will be put into the output fields.
+ *
+ * After the caller has finished using the fields, it must invoke
+ * follow_pfnmap_end() to properly release the locks and resources taken
+ * by the lookup.
+ *
+ * Between the start() and end() calls, the results in @args will stay
+ * valid as proper locks will be held.  After end() is called, the fields
+ * in @follow_pfnmap_args must not be accessed any further.  Using such
+ * information after end() requires the caller to synchronize against
+ * page table updates by itself, otherwise it can create a security bug.
+ *
+ * If the PTE maps a refcounted page, callers are responsible for
+ * protecting against invalidation with MMU notifiers; otherwise accessing
+ * the PFN at a later point in time can trigger a use-after-free.
+ *
+ * Only IO mappings and raw PFN mappings are allowed.  The mmap lock
+ * should be taken for read, and it cannot be released before end() is
+ * invoked.
+ *
+ * This function must not be used to modify PTE content.
+ *
+ * Return: zero on success, negative otherwise.
+ */
+int follow_pfnmap_start(struct follow_pfnmap_args *args)
+{
+       struct vm_area_struct *vma = args->vma;
+       unsigned long address = args->address;
+       struct mm_struct *mm = vma->vm_mm;
+       spinlock_t *lock;
+       pgd_t *pgdp;
+       p4d_t *p4dp, p4d;
+       pud_t *pudp, pud;
+       pmd_t *pmdp, pmd;
+       pte_t *ptep, pte;
+
+       pfnmap_lockdep_assert(vma);
+
+       if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+               goto out;
+
+       if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+               goto out;
+retry:
+       pgdp = pgd_offset(mm, address);
+       if (pgd_none(*pgdp) || unlikely(pgd_bad(*pgdp)))
+               goto out;
+
+       p4dp = p4d_offset(pgdp, address);
+       p4d = READ_ONCE(*p4dp);
+       if (p4d_none(p4d) || unlikely(p4d_bad(p4d)))
+               goto out;
+
+       pudp = pud_offset(p4dp, address);
+       pud = READ_ONCE(*pudp);
+       if (pud_none(pud))
+               goto out;
+       if (pud_leaf(pud)) {
+               lock = pud_lock(mm, pudp);
+               if (!unlikely(pud_leaf(pud))) {
+                       spin_unlock(lock);
+                       goto retry;
+               }
+               pfnmap_args_setup(args, lock, NULL, pud_pgprot(pud),
+                                 pud_pfn(pud), PUD_MASK, pud_write(pud),
+                                 pud_special(pud));
+               return 0;
+       }
+
+       pmdp = pmd_offset(pudp, address);
+       pmd = pmdp_get_lockless(pmdp);
+       if (pmd_leaf(pmd)) {
+               lock = pmd_lock(mm, pmdp);
+               if (!unlikely(pmd_leaf(pmd))) {
+                       spin_unlock(lock);
+                       goto retry;
+               }
+               pfnmap_args_setup(args, lock, NULL, pmd_pgprot(pmd),
+                                 pmd_pfn(pmd), PMD_MASK, pmd_write(pmd),
+                                 pmd_special(pmd));
+               return 0;
+       }
+
+       ptep = pte_offset_map_lock(mm, pmdp, address, &lock);
+       if (!ptep)
+               goto out;
+       pte = ptep_get(ptep);
+       if (!pte_present(pte))
+               goto unlock;
+       pfnmap_args_setup(args, lock, ptep, pte_pgprot(pte),
+                         pte_pfn(pte), PAGE_MASK, pte_write(pte),
+                         pte_special(pte));
+       return 0;
+unlock:
+       pte_unmap_unlock(ptep, lock);
+out:
+       return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(follow_pfnmap_start);
+
+/**
+ * follow_pfnmap_end(): End a follow_pfnmap_start() process
+ * @args: Pointer to struct @follow_pfnmap_args
+ *
+ * Must be used in pair with follow_pfnmap_start().  See the start() function
+ * above for more information.
+ */
+void follow_pfnmap_end(struct follow_pfnmap_args *args)
+{
+       if (args->lock)
+               spin_unlock(args->lock);
+       if (args->ptep)
+               pte_unmap(args->ptep);
+}
+EXPORT_SYMBOL_GPL(follow_pfnmap_end);
+
 #ifdef CONFIG_HAVE_IOREMAP_PROT
 /**
  * generic_access_phys - generic implementation for iomem mmap access