mm: introduce get_user_pages_longterm
[linux-2.6-block.git] / mm / gup.c
index b2b4d4263768d82d61b99d2a2125251345fa778c..d3fb60e5bfacd4c733957dc526c28c41bd2321d1 100644 (file)
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -66,7 +66,7 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
  */
 static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
 {
-       return pte_write(pte) ||
+       return pte_access_permitted(pte, WRITE) ||
                ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
 }
 
@@ -1095,6 +1095,70 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
 }
 EXPORT_SYMBOL(get_user_pages);
 
+#ifdef CONFIG_FS_DAX
+/*
+ * This is the same as get_user_pages() in that it assumes we are
+ * operating on the current task's mm, but it goes further to validate
+ * that the vmas associated with the address range are suitable for
+ * longterm elevated page reference counts. For example, filesystem-dax
+ * mappings are subject to the lifetime enforced by the filesystem and
+ * we need guarantees that longterm users like RDMA and V4L2 only
+ * establish mappings that have a kernel enforced revocation mechanism.
+ *
+ * "longterm" == userspace controlled elevated page count lifetime.
+ * Contrast this to iov_iter_get_pages() usages which are transient.
+ *
+ * Returns the number of pages pinned (possibly fewer than @nr_pages),
+ * a negative errno propagated from get_user_pages(), -EINVAL when
+ * @pages is NULL, or -EOPNOTSUPP when a vma in the pinned range is
+ * fs-dax backed (all page references are dropped in that case).
+ */
+long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
+               unsigned int gup_flags, struct page **pages,
+               struct vm_area_struct **vmas_arg)
+{
+       struct vm_area_struct **vmas = vmas_arg;
+       struct vm_area_struct *vma_prev = NULL;
+       long rc, i;
+
+       /* Without a pages array there is nothing to validate or unwind. */
+       if (!pages)
+               return -EINVAL;
+
+       /*
+        * The caller may not care about the vmas, but the fs-dax scan
+        * below does, so fall back to a local scratch array (freed at
+        * "out" below).
+        */
+       if (!vmas) {
+               vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *),
+                              GFP_KERNEL);
+               if (!vmas)
+                       return -ENOMEM;
+       }
+
+       rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+
+       /* Scan only the vmas actually pinned; bail at the first fs-dax one. */
+       for (i = 0; i < rc; i++) {
+               struct vm_area_struct *vma = vmas[i];
+
+               /* Consecutive pages usually share a vma; check each only once. */
+               if (vma == vma_prev)
+                       continue;
+
+               vma_prev = vma;
+
+               if (vma_is_fsdax(vma))
+                       break;
+       }
+
+       /*
+        * Either get_user_pages() failed, or the vma validation
+        * succeeded, in either case we don't need to put_page() before
+        * returning.
+        */
+       if (i >= rc)
+               goto out;
+
+       /* An fs-dax vma was found: drop every reference taken above. */
+       for (i = 0; i < rc; i++)
+               put_page(pages[i]);
+       rc = -EOPNOTSUPP;
+out:
+       if (vmas != vmas_arg)
+               kfree(vmas);
+       return rc;
+}
+EXPORT_SYMBOL(get_user_pages_longterm);
+#endif /* CONFIG_FS_DAX */
+
 /**
  * populate_vma_page_range() -  populate a range of pages in the vma.
  * @vma:   target vma
@@ -1643,6 +1707,47 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
        return 1;
 }
 
+/*
+ * Core of the fast-GUP walk: traverse current->mm's page tables over
+ * [addr, end), recording pinned pages in @pages and counting them in
+ * *@nr.  Stops silently at the first entry that is absent or that a
+ * lower level refuses to handle; *@nr then reflects partial progress.
+ * Callers run this with interrupts disabled (see __get_user_pages_fast
+ * and get_user_pages_fast below).
+ */
+static void gup_pgd_range(unsigned long addr, unsigned long end,
+               int write, struct page **pages, int *nr)
+{
+       unsigned long next;
+       pgd_t *pgdp;
+
+       pgdp = pgd_offset(current->mm, addr);
+       do {
+               /* Snapshot the entry once so all checks see one consistent value. */
+               pgd_t pgd = READ_ONCE(*pgdp);
+
+               next = pgd_addr_end(addr, end);
+               if (pgd_none(pgd))
+                       return;
+               if (unlikely(pgd_huge(pgd))) {
+                       /* Huge page mapped directly at the PGD level. */
+                       if (!gup_huge_pgd(pgd, pgdp, addr, next, write,
+                                         pages, nr))
+                               return;
+               } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
+                       /* Arch-specific hugetlb page-directory entry. */
+                       if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
+                                        PGDIR_SHIFT, next, write, pages, nr))
+                               return;
+               } else if (!gup_p4d_range(pgd, addr, next, write, pages, nr))
+                       return;
+       } while (pgdp++, addr = next, addr != end);
+}
+
+#ifndef gup_fast_permitted
+/*
+ * Check if it's allowed to use __get_user_pages_fast() for the range, or
+ * we need to fall back to the slow version:
+ *
+ * This weak generic version only rejects ranges that wrap past the top
+ * of the address space; an architecture with stricter requirements
+ * provides its own implementation and defines the gup_fast_permitted
+ * macro to suppress this one.  @write is unused here but is part of
+ * the arch-override interface.
+ */
+bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
+{
+       unsigned long len, end;
+
+       len = (unsigned long) nr_pages << PAGE_SHIFT;
+       end = start + len;
+       /* Unsigned wraparound would make end < start. */
+       return end >= start;
+}
+#endif
+
 /*
  * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
  * the regular GUP. It will only return non-negative values.
@@ -1650,10 +1755,8 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
 int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
                          struct page **pages)
 {
-       struct mm_struct *mm = current->mm;
        unsigned long addr, len, end;
-       unsigned long next, flags;
-       pgd_t *pgdp;
+       unsigned long flags;
        int nr = 0;
 
        start &= PAGE_MASK;
@@ -1677,45 +1780,15 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
         * block IPIs that come from THPs splitting.
         */
 
-       local_irq_save(flags);
-       pgdp = pgd_offset(mm, addr);
-       do {
-               pgd_t pgd = READ_ONCE(*pgdp);
-
-               next = pgd_addr_end(addr, end);
-               if (pgd_none(pgd))
-                       break;
-               if (unlikely(pgd_huge(pgd))) {
-                       if (!gup_huge_pgd(pgd, pgdp, addr, next, write,
-                                         pages, &nr))
-                               break;
-               } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
-                       if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
-                                        PGDIR_SHIFT, next, write, pages, &nr))
-                               break;
-               } else if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
-                       break;
-       } while (pgdp++, addr = next, addr != end);
-       local_irq_restore(flags);
+       if (gup_fast_permitted(start, nr_pages, write)) {
+               local_irq_save(flags);
+               gup_pgd_range(addr, end, write, pages, &nr);
+               local_irq_restore(flags);
+       }
 
        return nr;
 }
 
-#ifndef gup_fast_permitted
-/*
- * Check if it's allowed to use __get_user_pages_fast() for the range, or
- * we need to fall back to the slow version:
- */
-bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
-{
-       unsigned long len, end;
-
-       len = (unsigned long) nr_pages << PAGE_SHIFT;
-       end = start + len;
-       return end >= start;
-}
-#endif
-
 /**
  * get_user_pages_fast() - pin user pages in memory
  * @start:     starting user address
@@ -1735,12 +1808,22 @@ bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                        struct page **pages)
 {
+       unsigned long addr, len, end;
        int nr = 0, ret = 0;
 
        start &= PAGE_MASK;
+       addr = start;
+       len = (unsigned long) nr_pages << PAGE_SHIFT;
+       end = start + len;
+
+       if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+                                       (void __user *)start, len)))
+               return 0;
 
        if (gup_fast_permitted(start, nr_pages, write)) {
-               nr = __get_user_pages_fast(start, nr_pages, write, pages);
+               local_irq_disable();
+               gup_pgd_range(addr, end, write, pages, &nr);
+               local_irq_enable();
                ret = nr;
        }