Merge tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma

[linux-2.6-block.git] / mm / gup.c
diff --git a/mm/gup.c b/mm/gup.c

index ddde097cf9e4106bc02ea55538926f44ed8e587c..8bbaa5523116719038ce2ace262330e36f6209f3 100644 (file)
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -134,6 +134,7 @@ void put_user_pages(struct page **pages, unsigned long npages)
  }
  EXPORT_SYMBOL(put_user_pages);
  
+#ifdef CONFIG_MMU
  static struct page *no_page_table(struct vm_area_struct *vma,
                 unsigned int flags)
  {
@@ -515,7 +516,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
   * an error pointer if there is a mapping to something not represented
   * by a page descriptor (see also vm_normal_page()).
   */
-struct page *follow_page_mask(struct vm_area_struct *vma,
+static struct page *follow_page_mask(struct vm_area_struct *vma,
                               unsigned long address, unsigned int flags,
                               struct follow_page_context *ctx)
  {
@@ -585,11 +586,14 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
                 pgd = pgd_offset_k(address);
         else
                 pgd = pgd_offset_gate(mm, address);
-       BUG_ON(pgd_none(*pgd));
+       if (pgd_none(*pgd))
+               return -EFAULT;
         p4d = p4d_offset(pgd, address);
-       BUG_ON(p4d_none(*p4d));
+       if (p4d_none(*p4d))
+               return -EFAULT;
         pud = pud_offset(p4d, address);
-       BUG_ON(pud_none(*pud));
+       if (pud_none(*pud))
+               return -EFAULT;
         pmd = pmd_offset(pud, address);
         if (!pmd_present(*pmd))
                 return -EFAULT;
@@ -605,13 +609,6 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
                 if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
                         goto unmap;
                 *page = pte_page(*pte);
-
-               /*
-                * This should never happen (a device public page in the gate
-                * area).
-                */
-               if (is_device_public_page(*page))
-                       goto unmap;
         }
         if (unlikely(!try_get_page(*page))) {
                 ret = -ENOMEM;
@@ -1100,86 +1097,6 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
         return pages_done;
  }
  
-/*
- * We can leverage the VM_FAULT_RETRY functionality in the page fault
- * paths better by using either get_user_pages_locked() or
- * get_user_pages_unlocked().
- *
- * get_user_pages_locked() is suitable to replace the form:
- *
- *      down_read(&mm->mmap_sem);
- *      do_something()
- *      get_user_pages(tsk, mm, ..., pages, NULL);
- *      up_read(&mm->mmap_sem);
- *
- *  to:
- *
- *      int locked = 1;
- *      down_read(&mm->mmap_sem);
- *      do_something()
- *      get_user_pages_locked(tsk, mm, ..., pages, &locked);
- *      if (locked)
- *          up_read(&mm->mmap_sem);
- */
-long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
-                          unsigned int gup_flags, struct page **pages,
-                          int *locked)
-{
-       /*
-        * FIXME: Current FOLL_LONGTERM behavior is incompatible with
-        * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
-        * vmas.  As there are no users of this flag in this call we simply
-        * disallow this option for now.
-        */
-       if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
-               return -EINVAL;
-
-       return __get_user_pages_locked(current, current->mm, start, nr_pages,
-                                      pages, NULL, locked,
-                                      gup_flags | FOLL_TOUCH);
-}
-EXPORT_SYMBOL(get_user_pages_locked);
-
-/*
- * get_user_pages_unlocked() is suitable to replace the form:
- *
- *      down_read(&mm->mmap_sem);
- *      get_user_pages(tsk, mm, ..., pages, NULL);
- *      up_read(&mm->mmap_sem);
- *
- *  with:
- *
- *      get_user_pages_unlocked(tsk, mm, ..., pages);
- *
- * It is functionally equivalent to get_user_pages_fast so
- * get_user_pages_fast should be used instead if specific gup_flags
- * (e.g. FOLL_FORCE) are not required.
- */
-long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
-                            struct page **pages, unsigned int gup_flags)
-{
-       struct mm_struct *mm = current->mm;
-       int locked = 1;
-       long ret;
-
-       /*
-        * FIXME: Current FOLL_LONGTERM behavior is incompatible with
-        * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
-        * vmas.  As there are no users of this flag in this call we simply
-        * disallow this option for now.
-        */
-       if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
-               return -EINVAL;
-
-       down_read(&mm->mmap_sem);
-       ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL,
-                                     &locked, gup_flags | FOLL_TOUCH);
-       if (locked)
-               up_read(&mm->mmap_sem);
-       return ret;
-}
-EXPORT_SYMBOL(get_user_pages_unlocked);
-
  /*
   * get_user_pages_remote() - pin user pages in memory
   * @tsk:       the task_struct to use for page fault accounting, or
@@ -1256,6 +1173,198 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
  }
  EXPORT_SYMBOL(get_user_pages_remote);
  
+/**
+ * populate_vma_page_range() -  populate a range of pages in the vma.
+ * @vma:   target vma
+ * @start: start address
+ * @end:   end address
+ * @nonblocking: handed to __get_user_pages() unchanged; see locking rules below
+ *
+ * This takes care of mlocking the pages too if VM_LOCKED is set.
+ *
+ * Returns __get_user_pages()'s result: a page count, or negative error code.
+ *
+ * vma->vm_mm->mmap_sem must be held.
+ *
+ * If @nonblocking is NULL, it may be held for read or write and will
+ * be unperturbed.
+ *
+ * If @nonblocking is non-NULL, it must be held for read only and may be
+ * released.  If it's released, *@nonblocking will be set to 0.
+ */
+long populate_vma_page_range(struct vm_area_struct *vma,
+               unsigned long start, unsigned long end, int *nonblocking)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       unsigned long nr_pages = (end - start) / PAGE_SIZE;
+       int gup_flags;
+
+       VM_BUG_ON(start & ~PAGE_MASK);
+       VM_BUG_ON(end   & ~PAGE_MASK);
+       VM_BUG_ON_VMA(start < vma->vm_start, vma);
+       VM_BUG_ON_VMA(end   > vma->vm_end, vma);
+       VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
+
+       gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
+       if (vma->vm_flags & VM_LOCKONFAULT)
+               gup_flags &= ~FOLL_POPULATE;
+       /*
+        * We want to touch writable mappings with a write fault in order
+        * to break COW, except for shared mappings because these don't COW
+        * and we would not want to dirty them for nothing.
+        */
+       if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
+               gup_flags |= FOLL_WRITE;
+
+       /*
+        * We want mlock to succeed for regions that have any permissions
+        * other than PROT_NONE.
+        */
+       if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
+               gup_flags |= FOLL_FORCE;
+
+       /*
+        * We made sure addr is within a VMA, so the following will
+        * not result in a stack expansion that recurses back here.
+        */
+       return __get_user_pages(current, mm, start, nr_pages, gup_flags,
+                               NULL, NULL, nonblocking);
+}
+
+/*
+ * __mm_populate - populate and/or mlock pages within a range of address space.
+ *
+ * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
+ * flags. VMAs must be already marked with the desired vm_flags, and
+ * mmap_sem must not be held.
+ */
+int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
+{
+       struct mm_struct *mm = current->mm;
+       unsigned long end, nstart, nend;        /* [nstart, nend): current chunk */
+       struct vm_area_struct *vma = NULL;
+       int locked = 0;         /* nonzero while mmap_sem is held for read */
+       long ret = 0;
+
+       end = start + len;
+
+       for (nstart = start; nstart < end; nstart = nend) {
+               /*
+                * We want to fault in pages for [nstart; end) address range.
+                * Find first corresponding VMA.
+                */
+               if (!locked) {
+                       locked = 1;
+                       down_read(&mm->mmap_sem);
+                       vma = find_vma(mm, nstart);
+               } else if (nstart >= vma->vm_end)
+                       vma = vma->vm_next;
+               if (!vma || vma->vm_start >= end)
+                       break;
+               /*
+                * Set [nstart; nend) to intersection of desired address
+                * range with the first VMA. Also, skip undesirable VMA types.
+                */
+               nend = min(end, vma->vm_end);
+               if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+                       continue;       /* skipped: loop resumes at nend */
+               if (nstart < vma->vm_start)
+                       nstart = vma->vm_start;
+               /*
+                * Now fault in a range of pages. populate_vma_page_range()
+                * double checks the vma flags, so that it won't mlock pages
+                * if the vma was already munlocked.
+                */
+               ret = populate_vma_page_range(vma, nstart, nend, &locked);
+               if (ret < 0) {
+                       if (ignore_errors) {
+                               ret = 0;
+                               continue;       /* continue at next VMA */
+                       }
+                       break;
+               }
+               nend = nstart + ret * PAGE_SIZE;        /* resume after pages done */
+               ret = 0;
+       }
+       if (locked)
+               up_read(&mm->mmap_sem);
+       return ret;     /* 0 or negative error code */
+}
+
+/**
+ * get_dump_page() - pin user page in memory while writing it to core dump
+ * @addr: user address
+ *
+ * Returns struct page pointer of user page pinned for dump,
+ * to be freed afterwards by put_page().
+ *
+ * Returns NULL on any kind of failure - a hole must then be inserted into
+ * the corefile, to preserve alignment with its headers; and also returns
+ * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
+ * allowing a hole to be left in the corefile to save diskspace.
+ *
+ * Called without mmap_sem, but after all other threads have been killed.
+ */
+#ifdef CONFIG_ELF_CORE
+struct page *get_dump_page(unsigned long addr)
+{
+       struct vm_area_struct *vma;
+       struct page *page;
+
+       if (__get_user_pages(current, current->mm, addr, 1,
+                            FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
+                            NULL) < 1)
+               return NULL;    /* no page obtained: caller handles the hole */
+       flush_cache_page(vma, addr, page_to_pfn(page));
+       return page;
+}
+#endif /* CONFIG_ELF_CORE */
+#else /* CONFIG_MMU */
+static long __get_user_pages_locked(struct task_struct *tsk,
+               struct mm_struct *mm, unsigned long start,
+               unsigned long nr_pages, struct page **pages,
+               struct vm_area_struct **vmas, int *locked,
+               unsigned int foll_flags)
+{      /* !CONFIG_MMU variant: @tsk and @locked are not used here */
+       struct vm_area_struct *vma;
+       unsigned long vm_flags; /* a VMA must have at least one of these set */
+       long i;         /* long, not int: matches return type and nr_pages range */
+
+       /* calculate required read or write permissions.
+        * If FOLL_FORCE is set, we only require the "MAY" flags.
+        */
+       vm_flags  = (foll_flags & FOLL_WRITE) ?
+                       (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+       vm_flags &= (foll_flags & FOLL_FORCE) ?
+                       (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+
+       for (i = 0; i < nr_pages; i++) {
+               vma = find_vma(mm, start);
+               if (!vma)
+                       goto finish_or_fault;
+
+               /* protect what we can, including chardevs */
+               if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
+                   !(vm_flags & vma->vm_flags))
+                       goto finish_or_fault;
+
+               if (pages) {
+                       pages[i] = virt_to_page(start);
+                       if (pages[i])
+                               get_page(pages[i]);
+               }
+               if (vmas)
+                       vmas[i] = vma;
+               start = (start + PAGE_SIZE) & PAGE_MASK;        /* next page */
+       }
+
+       return i;
+
+finish_or_fault:
+       return i ? : -EFAULT;   /* pages done so far, else -EFAULT */
+}
+#endif /* !CONFIG_MMU */
+
  #if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
  static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
  {
@@ -1336,25 +1445,31 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk,
                                         struct vm_area_struct **vmas,
                                         unsigned int gup_flags)
  {
-       long i;
+       unsigned long i;
+       unsigned long step;
         bool drain_allow = true;
         bool migrate_allow = true;
         LIST_HEAD(cma_page_list);
  
  check_again:
-       for (i = 0; i < nr_pages; i++) {
+       for (i = 0; i < nr_pages;) {
+
+               struct page *head = compound_head(pages[i]);
+
+               /*
+                * gup may start from a tail page, so step over only what is
+                * left of the compound page from pages[i] onwards.
+                */
+               step = (1 << compound_order(head)) - (pages[i] - head);
                 /*
                  * If we get a page from the CMA zone, since we are going to
                  * be pinning these entries, we might as well move them out
                  * of the CMA zone if possible.
                  */
-               if (is_migrate_cma_page(pages[i])) {
-
-                       struct page *head = compound_head(pages[i]);
-
-                       if (PageHuge(head)) {
+               if (is_migrate_cma_page(head)) {
+                       if (PageHuge(head))
                                 isolate_huge_page(head, &cma_page_list);
-                       } else {
+                       else {
                                 if (!PageLRU(head) && drain_allow) {
                                         lru_add_drain_all();
                                         drain_allow = false;
@@ -1369,6 +1484,8 @@ check_again:
                                 }
                         }
                 }
+
+               i += step;
         }
  
         if (!list_empty(&cma_page_list)) {
@@ -1417,7 +1534,7 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk,
  {
         return nr_pages;
  }
-#endif
+#endif /* CONFIG_CMA */
  
  /*
   * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
@@ -1503,155 +1620,88 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
  }
  EXPORT_SYMBOL(get_user_pages);
  
-/**
- * populate_vma_page_range() -  populate a range of pages in the vma.
- * @vma:   target vma
- * @start: start address
- * @end:   end address
- * @nonblocking:
- *
- * This takes care of mlocking the pages too if VM_LOCKED is set.
+/*
+ * We can leverage the VM_FAULT_RETRY functionality in the page fault
+ * paths better by using either get_user_pages_locked() or
+ * get_user_pages_unlocked().
   *
- * return 0 on success, negative error code on error.
+ * get_user_pages_locked() is suitable to replace the form:
   *
- * vma->vm_mm->mmap_sem must be held.
+ *      down_read(&mm->mmap_sem);
+ *      do_something()
+ *      get_user_pages(tsk, mm, ..., pages, NULL);
+ *      up_read(&mm->mmap_sem);
   *
- * If @nonblocking is NULL, it may be held for read or write and will
- * be unperturbed.
+ *  to:
   *
- * If @nonblocking is non-NULL, it must held for read only and may be
- * released.  If it's released, *@nonblocking will be set to 0.
+ *      int locked = 1;
+ *      down_read(&mm->mmap_sem);
+ *      do_something()
+ *      get_user_pages_locked(tsk, mm, ..., pages, &locked);
+ *      if (locked)
+ *          up_read(&mm->mmap_sem);
   */
-long populate_vma_page_range(struct vm_area_struct *vma,
-               unsigned long start, unsigned long end, int *nonblocking)
+long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
+                          unsigned int gup_flags, struct page **pages,
+                          int *locked)
  {
-       struct mm_struct *mm = vma->vm_mm;
-       unsigned long nr_pages = (end - start) / PAGE_SIZE;
-       int gup_flags;
-
-       VM_BUG_ON(start & ~PAGE_MASK);
-       VM_BUG_ON(end   & ~PAGE_MASK);
-       VM_BUG_ON_VMA(start < vma->vm_start, vma);
-       VM_BUG_ON_VMA(end   > vma->vm_end, vma);
-       VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
-
-       gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
-       if (vma->vm_flags & VM_LOCKONFAULT)
-               gup_flags &= ~FOLL_POPULATE;
-       /*
-        * We want to touch writable mappings with a write fault in order
-        * to break COW, except for shared mappings because these don't COW
-        * and we would not want to dirty them for nothing.
-        */
-       if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
-               gup_flags |= FOLL_WRITE;
-
         /*
-        * We want mlock to succeed for regions that have any permissions
-        * other than PROT_NONE.
+        * FIXME: Current FOLL_LONGTERM behavior is incompatible with
+        * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
+        * vmas.  As there are no users of this flag in this call we simply
+        * disallow this option for now.
          */
-       if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
-               gup_flags |= FOLL_FORCE;
+       if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+               return -EINVAL;
  
-       /*
-        * We made sure addr is within a VMA, so the following will
-        * not result in a stack expansion that recurses back here.
-        */
-       return __get_user_pages(current, mm, start, nr_pages, gup_flags,
-                               NULL, NULL, nonblocking);
+       return __get_user_pages_locked(current, current->mm, start, nr_pages,
+                                      pages, NULL, locked,
+                                      gup_flags | FOLL_TOUCH);
  }
+EXPORT_SYMBOL(get_user_pages_locked);
  
  /*
- * __mm_populate - populate and/or mlock pages within a range of address space.
+ * get_user_pages_unlocked() is suitable to replace the form:
   *
- * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
- * flags. VMAs must be already marked with the desired vm_flags, and
- * mmap_sem must not be held.
+ *      down_read(&mm->mmap_sem);
+ *      get_user_pages(tsk, mm, ..., pages, NULL);
+ *      up_read(&mm->mmap_sem);
+ *
+ *  with:
+ *
+ *      get_user_pages_unlocked(tsk, mm, ..., pages);
+ *
+ * It is functionally equivalent to get_user_pages_fast so
+ * get_user_pages_fast should be used instead if specific gup_flags
+ * (e.g. FOLL_FORCE) are not required.
   */
-int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
+long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
+                            struct page **pages, unsigned int gup_flags)
  {
         struct mm_struct *mm = current->mm;
-       unsigned long end, nstart, nend;
-       struct vm_area_struct *vma = NULL;
-       int locked = 0;
-       long ret = 0;
+       int locked = 1;
+       long ret;
  
-       end = start + len;
+       /*
+        * FIXME: Current FOLL_LONGTERM behavior is incompatible with
+        * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
+        * vmas.  As there are no users of this flag in this call we simply
+        * disallow this option for now.
+        */
+       if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+               return -EINVAL;
  
-       for (nstart = start; nstart < end; nstart = nend) {
-               /*
-                * We want to fault in pages for [nstart; end) address range.
-                * Find first corresponding VMA.
-                */
-               if (!locked) {
-                       locked = 1;
-                       down_read(&mm->mmap_sem);
-                       vma = find_vma(mm, nstart);
-               } else if (nstart >= vma->vm_end)
-                       vma = vma->vm_next;
-               if (!vma || vma->vm_start >= end)
-                       break;
-               /*
-                * Set [nstart; nend) to intersection of desired address
-                * range with the first VMA. Also, skip undesirable VMA types.
-                */
-               nend = min(end, vma->vm_end);
-               if (vma->vm_flags & (VM_IO | VM_PFNMAP))
-                       continue;
-               if (nstart < vma->vm_start)
-                       nstart = vma->vm_start;
-               /*
-                * Now fault in a range of pages. populate_vma_page_range()
-                * double checks the vma flags, so that it won't mlock pages
-                * if the vma was already munlocked.
-                */
-               ret = populate_vma_page_range(vma, nstart, nend, &locked);
-               if (ret < 0) {
-                       if (ignore_errors) {
-                               ret = 0;
-                               continue;       /* continue at next VMA */
-                       }
-                       break;
-               }
-               nend = nstart + ret * PAGE_SIZE;
-               ret = 0;
-       }
+       down_read(&mm->mmap_sem);
+       ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL,
+                                     &locked, gup_flags | FOLL_TOUCH);
         if (locked)
                 up_read(&mm->mmap_sem);
-       return ret;     /* 0 or negative error code */
-}
-
-/**
- * get_dump_page() - pin user page in memory while writing it to core dump
- * @addr: user address
- *
- * Returns struct page pointer of user page pinned for dump,
- * to be freed afterwards by put_page().
- *
- * Returns NULL on any kind of failure - a hole must then be inserted into
- * the corefile, to preserve alignment with its headers; and also returns
- * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
- * allowing a hole to be left in the corefile to save diskspace.
- *
- * Called without mmap_sem, but after all other threads have been killed.
- */
-#ifdef CONFIG_ELF_CORE
-struct page *get_dump_page(unsigned long addr)
-{
-       struct vm_area_struct *vma;
-       struct page *page;
-
-       if (__get_user_pages(current, current->mm, addr, 1,
-                            FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
-                            NULL) < 1)
-               return NULL;
-       flush_cache_page(vma, addr, page_to_pfn(page));
-       return page;
+       return ret;
  }
-#endif /* CONFIG_ELF_CORE */
+EXPORT_SYMBOL(get_user_pages_unlocked);
  
  /*
- * Generic Fast GUP
+ * Fast GUP
   *
   * get_user_pages_fast attempts to pin user pages by walking the page
   * tables directly and avoids taking locks. Thus the walker needs to be
@@ -1683,20 +1733,64 @@ struct page *get_dump_page(unsigned long addr)
   *
   * This code is based heavily on the PowerPC implementation by Nick Piggin.
   */
-#ifdef CONFIG_HAVE_GENERIC_GUP
+#ifdef CONFIG_HAVE_FAST_GUP
+#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
+/*
+ * WARNING: only to be used in the get_user_pages_fast() implementation.
+ *
+ * With get_user_pages_fast(), we walk down the pagetables without taking any
+ * locks.  For this we would like to load the pointers atomically, but sometimes
+ * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE).  What
+ * we do have is the guarantee that a PTE will only either go from not present
+ * to present, or present to not present or both -- it will not switch to a
+ * completely different present page without a TLB flush in between; something
+ * that we are blocking by holding interrupts off.
+ *
+ * Setting ptes from not present to present goes:
+ *
+ *   ptep->pte_high = h;
+ *   smp_wmb();
+ *   ptep->pte_low = l;
+ *
+ * And present to not present goes:
+ *
+ *   ptep->pte_low = 0;
+ *   smp_wmb();
+ *   ptep->pte_high = 0;
+ *
+ * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'.
+ * We load pte_high *after* loading pte_low, which ensures we don't see an older
+ * value of pte_high.  *Then* we recheck pte_low, which ensures that we haven't
+ * picked up a changed pte high. We might have gotten rubbish values from
+ * pte_low and pte_high, but we are guaranteed that pte_low will not have the
+ * present bit set *unless* it is 'l'. Because get_user_pages_fast() only
+ * operates on present ptes we're safe.
+ */
+static inline pte_t gup_get_pte(pte_t *ptep)
+{
+       pte_t pte;
  
-#ifndef gup_get_pte
+       do {
+               pte.pte_low = ptep->pte_low;
+               smp_rmb();
+               pte.pte_high = ptep->pte_high;
+               smp_rmb();
+       } while (unlikely(pte.pte_low != ptep->pte_low));
+
+       return pte;
+}
+#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */
  /*
- * We assume that the PTE can be read atomically. If this is not the case for
- * your architecture, please provide the helper.
+ * We require that the PTE can be read atomically.
   */
  static inline pte_t gup_get_pte(pte_t *ptep)
  {
         return READ_ONCE(*ptep);
  }
-#endif
+#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
  
-static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
+static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
+                                           struct page **pages)
  {
         while ((*nr) - nr_start) {
                 struct page *page = pages[--(*nr)];
@@ -1877,6 +1971,90 @@ static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
  }
  #endif
  
+#ifdef CONFIG_ARCH_HAS_HUGEPD
+static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
+                                     unsigned long sz)
+{      /* Returns the next sz-aligned boundary above addr, capped at end. */
+       unsigned long __boundary = (addr + sz) & ~(sz-1);
+       return (__boundary - 1 < end - 1) ? __boundary : end; /* -1: wrap-safe */
+}
+
+static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
+                      unsigned long end, int write, struct page **pages, int *nr)
+{      /* Record pages of one huge pte and take refs; 1 on success, else 0. */
+       unsigned long pte_end;
+       struct page *head, *page;
+       pte_t pte;
+       int refs;       /* pages recorded == references to take on head */
+
+       pte_end = (addr + sz) & ~(sz-1);
+       if (pte_end < end)
+               end = pte_end;  /* do not run past this sz-aligned region */
+
+       pte = READ_ONCE(*ptep);
+
+       if (!pte_access_permitted(pte, write))
+               return 0;
+
+       /* hugepages are never "special" */
+       VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+       refs = 0;
+       head = pte_page(pte);
+
+       page = head + ((addr & (sz-1)) >> PAGE_SHIFT);  /* subpage for addr */
+       do {
+               VM_BUG_ON(compound_head(page) != head);
+               pages[*nr] = page;
+               (*nr)++;
+               page++;
+               refs++;
+       } while (addr += PAGE_SIZE, addr != end);
+
+       head = try_get_compound_head(head, refs);
+       if (!head) {
+               *nr -= refs;    /* drop the entries recorded above */
+               return 0;
+       }
+
+       if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+               /* pte changed since it was read: undo. Could be optimized better */
+               *nr -= refs;
+               while (refs--)
+                       put_page(head);
+               return 0;
+       }
+
+       SetPageReferenced(head);
+       return 1;
+}
+
+static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
+               unsigned int pdshift, unsigned long end, int write,
+               struct page **pages, int *nr)
+{      /* Walk the huge ptes covering [addr, end); 1 on success, else 0. */
+       pte_t *ptep;
+       unsigned long sz = 1UL << hugepd_shift(hugepd); /* span of one entry */
+       unsigned long next;
+
+       ptep = hugepte_offset(hugepd, addr, pdshift);
+       do {
+               next = hugepte_addr_end(addr, end, sz);
+               if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
+                       return 0;       /* stop at the first failure */
+       } while (ptep++, addr = next, addr != end);
+
+       return 1;
+}
+#else
+static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
+               unsigned pdshift, unsigned long end, int write,
+               struct page **pages, int *nr)
+{
+       return 0;       /* stub: always 0; pages[] and *nr are left untouched */
+}
+#endif /* CONFIG_ARCH_HAS_HUGEPD */
+
  static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
                 unsigned long end, unsigned int flags, struct page **pages, int *nr)
  {
@@ -2117,19 +2295,21 @@ static void gup_pgd_range(unsigned long addr, unsigned long end,
                         return;
         } while (pgdp++, addr = next, addr != end);
  }
+#else
+static inline void gup_pgd_range(unsigned long addr, unsigned long end,
+               unsigned int flags, struct page **pages, int *nr)
+{      /* empty stub: pages[] and *nr are left untouched */
+}
+#endif /* CONFIG_HAVE_FAST_GUP */
  
  #ifndef gup_fast_permitted
  /*
   * Check if it's allowed to use __get_user_pages_fast() for the range, or
   * we need to fall back to the slow version:
   */
-bool gup_fast_permitted(unsigned long start, int nr_pages)
+static bool gup_fast_permitted(unsigned long start, unsigned long end)
  {
-       unsigned long len, end;
-
-       len = (unsigned long) nr_pages << PAGE_SHIFT;
-       end = start + len;
-       return end >= start;
+       return true;
  }
  #endif
  
@@ -2138,6 +2318,9 @@ bool gup_fast_permitted(unsigned long start, int nr_pages)
   * the regular GUP.
   * Note a difference with get_user_pages_fast: this always returns the
   * number of pages pinned, 0 if no pages were pinned.
+ *
+ * If the architecture does not support this function, simply return with no
+ * pages pinned.
   */
  int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
                           struct page **pages)
@@ -2146,10 +2329,12 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
         unsigned long flags;
         int nr = 0;
  
-       start &= PAGE_MASK;
+       start = untagged_addr(start) & PAGE_MASK;
         len = (unsigned long) nr_pages << PAGE_SHIFT;
         end = start + len;
  
+       if (end <= start)
+               return 0;
         if (unlikely(!access_ok((void __user *)start, len)))
                 return 0;
  
@@ -2165,7 +2350,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
          * block IPIs that come from THPs splitting.
          */
  
-       if (gup_fast_permitted(start, nr_pages)) {
+       if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) &&
+           gup_fast_permitted(start, end)) {
                 local_irq_save(flags);
                 gup_pgd_range(start, end, write ? FOLL_WRITE : 0, pages, &nr);
                 local_irq_restore(flags);
@@ -2173,6 +2359,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
  
         return nr;
  }
+EXPORT_SYMBOL_GPL(__get_user_pages_fast);
  
  static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
                                    unsigned int gup_flags, struct page **pages)
@@ -2219,18 +2406,21 @@ int get_user_pages_fast(unsigned long start, int nr_pages,
         unsigned long addr, len, end;
         int nr = 0, ret = 0;
  
-       start &= PAGE_MASK;
+       if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM)))
+               return -EINVAL;
+
+       start = untagged_addr(start) & PAGE_MASK;
         addr = start;
         len = (unsigned long) nr_pages << PAGE_SHIFT;
         end = start + len;
  
-       if (nr_pages <= 0)
+       if (end <= start)
                 return 0;
-
         if (unlikely(!access_ok((void __user *)start, len)))
                 return -EFAULT;
  
-       if (gup_fast_permitted(start, nr_pages)) {
+       if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) &&
+           gup_fast_permitted(start, end)) {
                 local_irq_disable();
                 gup_pgd_range(addr, end, gup_flags, pages, &nr);
                 local_irq_enable();
@@ -2256,5 +2446,4 @@ int get_user_pages_fast(unsigned long start, int nr_pages,
  
         return ret;
  }
-
-#endif /* CONFIG_HAVE_GENERIC_GUP */
+EXPORT_SYMBOL_GPL(get_user_pages_fast);