mm/core, x86/mm/pkeys: Differentiate instruction fetches
diff --git a/mm/gup.c b/mm/gup.c
index deafa2c91b362206b7ef56aec918509a9cc24dd2..7f1c4fb77cfa5e9c6ca4fd4c04c83e01a9b273b1 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1,9 +1,11 @@
+#define __DISABLE_GUP_DEPRECATED 1
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/spinlock.h>
 
 #include <linux/mm.h>
+#include <linux/memremap.h>
 #include <linux/pagemap.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
@@ -13,6 +15,7 @@
 #include <linux/rwsem.h>
 #include <linux/hugetlb.h>
 
+#include <asm/mmu_context.h>
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 
@@ -62,6 +65,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmd, unsigned int flags)
 {
        struct mm_struct *mm = vma->vm_mm;
+       struct dev_pagemap *pgmap = NULL;
        struct page *page;
        spinlock_t *ptl;
        pte_t *ptep, pte;
@@ -98,7 +102,17 @@ retry:
        }
 
        page = vm_normal_page(vma, address, pte);
-       if (unlikely(!page)) {
+       if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
+               /*
+                * Only return device mapping pages in the FOLL_GET case since
+                * they are only valid while holding the pgmap reference.
+                */
+               pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
+               if (pgmap)
+                       page = pte_page(pte);
+               else
+                       goto no_page;
+       } else if (unlikely(!page)) {
                if (flags & FOLL_DUMP) {
                        /* Avoid special (like zero) pages in core dumps */
                        page = ERR_PTR(-EFAULT);
@@ -116,8 +130,28 @@ retry:
                }
        }
 
-       if (flags & FOLL_GET)
-               get_page_foll(page);
+       if (flags & FOLL_SPLIT && PageTransCompound(page)) {
+               int ret;
+               get_page(page);
+               pte_unmap_unlock(ptep, ptl);
+               lock_page(page);
+               ret = split_huge_page(page);
+               unlock_page(page);
+               put_page(page);
+               if (ret)
+                       return ERR_PTR(ret);
+               goto retry;
+       }
+
+       if (flags & FOLL_GET) {
+               get_page(page);
+
+               /* drop the pgmap reference now that we hold the page */
+               if (pgmap) {
+                       put_dev_pagemap(pgmap);
+                       pgmap = NULL;
+               }
+       }
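The get_dev_pagemap()/put_dev_pagemap() pairing above follows a lookup, pin, release pattern: the pgmap reference only has to live until get_page() has pinned the device page itself. A rough sketch of that ordering as a stand-alone helper (the helper is hypothetical, not part of this patch; kernel context assumed):

/* Hypothetical helper illustrating the pgmap pin/release ordering. */
static struct page *pin_devmap_page(pte_t pte)
{
        struct dev_pagemap *pgmap;
        struct page *page;

        pgmap = get_dev_pagemap(pte_pfn(pte), NULL);    /* pgmap reference */
        if (!pgmap)
                return NULL;                            /* mapping is gone */

        page = pte_page(pte);
        get_page(page);                 /* the page reference pins the memory */
        put_dev_pagemap(pgmap);         /* safe: the page ref keeps it live */
        return page;
}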
        if (flags & FOLL_TOUCH) {
                if ((flags & FOLL_WRITE) &&
                    !pte_dirty(pte) && !PageDirty(page))
@@ -130,6 +164,10 @@ retry:
                mark_page_accessed(page);
        }
        if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+               /* Do not mlock pte-mapped THP */
+               if (PageTransCompound(page))
+                       goto out;
+
                /*
                 * The preliminary mapping check is mainly to avoid the
                 * pointless overhead of lock_page on the ZERO_PAGE
@@ -220,27 +258,45 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
        }
        if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
                return no_page_table(vma, flags);
-       if (pmd_trans_huge(*pmd)) {
-               if (flags & FOLL_SPLIT) {
-                       split_huge_page_pmd(vma, address, pmd);
-                       return follow_page_pte(vma, address, pmd, flags);
-               }
+       if (pmd_devmap(*pmd)) {
                ptl = pmd_lock(mm, pmd);
-               if (likely(pmd_trans_huge(*pmd))) {
-                       if (unlikely(pmd_trans_splitting(*pmd))) {
-                               spin_unlock(ptl);
-                               wait_split_huge_page(vma->anon_vma, pmd);
-                       } else {
-                               page = follow_trans_huge_pmd(vma, address,
-                                                            pmd, flags);
-                               spin_unlock(ptl);
-                               *page_mask = HPAGE_PMD_NR - 1;
-                               return page;
-                       }
-               } else
+               page = follow_devmap_pmd(vma, address, pmd, flags);
+               spin_unlock(ptl);
+               if (page)
+                       return page;
+       }
+       if (likely(!pmd_trans_huge(*pmd)))
+               return follow_page_pte(vma, address, pmd, flags);
+
+       ptl = pmd_lock(mm, pmd);
+       if (unlikely(!pmd_trans_huge(*pmd))) {
+               spin_unlock(ptl);
+               return follow_page_pte(vma, address, pmd, flags);
+       }
+       if (flags & FOLL_SPLIT) {
+               int ret;
+               page = pmd_page(*pmd);
+               if (is_huge_zero_page(page)) {
+                       spin_unlock(ptl);
+                       ret = 0;
+                       split_huge_pmd(vma, pmd, address);
+               } else {
+                       get_page(page);
                        spin_unlock(ptl);
+                       lock_page(page);
+                       ret = split_huge_page(page);
+                       unlock_page(page);
+                       put_page(page);
+               }
+
+               return ret ? ERR_PTR(ret) :
+                       follow_page_pte(vma, address, pmd, flags);
        }
-       return follow_page_pte(vma, address, pmd, flags);
+
+       page = follow_trans_huge_pmd(vma, address, pmd, flags);
+       spin_unlock(ptl);
+       *page_mask = HPAGE_PMD_NR - 1;
+       return page;
 }
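The FOLL_SPLIT branch above now distinguishes two kinds of splitting: split_huge_pmd() only demotes the pmd mapping to pte mappings (sufficient for the huge zero page), while split_huge_page() breaks the compound page itself into base pages and needs the page lock. A rough sketch of that logic as a helper (hypothetical refactoring of the code above, kernel context assumed):

static int split_pmd_mapping(struct vm_area_struct *vma, pmd_t *pmd,
                             unsigned long address, struct page *page,
                             spinlock_t *ptl)
{
        int ret = 0;

        if (is_huge_zero_page(page)) {
                spin_unlock(ptl);
                /* Demote the pmd to ptes; the zero page stays compound. */
                split_huge_pmd(vma, pmd, address);
        } else {
                get_page(page);                 /* hold it across the lock drop */
                spin_unlock(ptl);
                lock_page(page);
                ret = split_huge_page(page);    /* split the compound page itself */
                unlock_page(page);
                put_page(page);
        }
        return ret;
}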
 
 static int get_gate_page(struct mm_struct *mm, unsigned long address,
@@ -309,6 +365,8 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
                return -ENOENT;
        if (*flags & FOLL_WRITE)
                fault_flags |= FAULT_FLAG_WRITE;
+       if (*flags & FOLL_REMOTE)
+               fault_flags |= FAULT_FLAG_REMOTE;
        if (nonblocking)
                fault_flags |= FAULT_FLAG_ALLOW_RETRY;
        if (*flags & FOLL_NOWAIT)
@@ -359,11 +417,13 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
 static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
 {
        vm_flags_t vm_flags = vma->vm_flags;
+       int write = (gup_flags & FOLL_WRITE);
+       int foreign = (gup_flags & FOLL_REMOTE);
 
        if (vm_flags & (VM_IO | VM_PFNMAP))
                return -EFAULT;
 
-       if (gup_flags & FOLL_WRITE) {
+       if (write) {
                if (!(vm_flags & VM_WRITE)) {
                        if (!(gup_flags & FOLL_FORCE))
                                return -EFAULT;
@@ -376,10 +436,8 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
                         * Anon pages in shared mappings are surprising: now
                         * just reject it.
                         */
-                       if (!is_cow_mapping(vm_flags)) {
-                               WARN_ON_ONCE(vm_flags & VM_MAYWRITE);
+                       if (!is_cow_mapping(vm_flags))
                                return -EFAULT;
-                       }
                }
        } else if (!(vm_flags & VM_READ)) {
                if (!(gup_flags & FOLL_FORCE))
@@ -391,6 +449,12 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
                if (!(vm_flags & VM_MAYREAD))
                        return -EFAULT;
        }
+       /*
+        * gups are always data accesses, not instruction
+        * fetches, so execute=false here
+        */
+       if (!arch_vma_access_permitted(vma, write, false, foreign))
+               return -EFAULT;
        return 0;
 }
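check_vma_flags() now also consults arch_vma_access_permitted() with execute=false, since gup is always a data access. On an architecture with no protection mechanism beyond the read/write/execute bits (no protection keys), the hook reduces to an allow-all stub; a sketch of such a fallback (the generic in-tree definition may differ in detail):

static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
                                             bool write, bool execute,
                                             bool foreign)
{
        /* No hardware checks beyond the VM_READ/VM_WRITE/VM_EXEC tests above. */
        return true;
}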
 
@@ -557,6 +621,28 @@ next_page:
 }
 EXPORT_SYMBOL(__get_user_pages);
 
+static bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags)
+{
+       bool write   = !!(fault_flags & FAULT_FLAG_WRITE);
+       bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
+       vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;
+
+       if (!(vm_flags & vma->vm_flags))
+               return false;
+
+       /*
+        * The architecture might have a hardware protection
+        * mechanism other than read/write that can deny access.
+        *
+        * gup always represents data access, not instruction
+        * fetches, so execute=false here:
+        */
+       if (!arch_vma_access_permitted(vma, write, false, foreign))
+               return false;
+
+       return true;
+}
+
 /*
  * fixup_user_fault() - manually resolve a user page fault
  * @tsk:       the task_struct to use for page fault accounting, or
@@ -564,6 +650,8 @@ EXPORT_SYMBOL(__get_user_pages);
  * @mm:                mm_struct of target mm
  * @address:   user address
  * @fault_flags:flags to pass down to handle_mm_fault()
+ * @unlocked:  did we unlock the mmap_sem while retrying; may be NULL if the
+ *             caller does not allow retry
  *
  * This is meant to be called in the specific scenario where for locking reasons
  * we try to access user memory in atomic context (within a pagefault_disable()
@@ -575,31 +663,36 @@ EXPORT_SYMBOL(__get_user_pages);
  * The main difference with get_user_pages() is that this function will
  * unconditionally call handle_mm_fault() which will in turn perform all the
  * necessary SW fixup of the dirty and young bits in the PTE, while
- * handle_mm_fault() only guarantees to update these in the struct page.
+ * get_user_pages() only guarantees to update these in the struct page.
  *
  * This is important for some architectures where those bits also gate the
  * access permission to the page because they are maintained in software.  On
  * such architectures, gup() will not be enough to make a subsequent access
  * succeed.
  *
- * This has the same semantics wrt the @mm->mmap_sem as does filemap_fault().
+ * This function will not return with an unlocked mmap_sem. So it does not
+ * have the same semantics wrt the @mm->mmap_sem as does filemap_fault().
  */
 int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
-                    unsigned long address, unsigned int fault_flags)
+                    unsigned long address, unsigned int fault_flags,
+                    bool *unlocked)
 {
        struct vm_area_struct *vma;
-       vm_flags_t vm_flags;
-       int ret;
+       int ret, major = 0;
+
+       if (unlocked)
+               fault_flags |= FAULT_FLAG_ALLOW_RETRY;
 
+retry:
        vma = find_extend_vma(mm, address);
        if (!vma || address < vma->vm_start)
                return -EFAULT;
 
-       vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ;
-       if (!(vm_flags & vma->vm_flags))
+       if (!vma_permits_fault(vma, fault_flags))
                return -EFAULT;
 
        ret = handle_mm_fault(mm, vma, address, fault_flags);
+       major |= ret & VM_FAULT_MAJOR;
        if (ret & VM_FAULT_ERROR) {
                if (ret & VM_FAULT_OOM)
                        return -ENOMEM;
@@ -609,8 +702,19 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
                        return -EFAULT;
                BUG();
        }
+
+       if (ret & VM_FAULT_RETRY) {
+               down_read(&mm->mmap_sem);
+               if (!(fault_flags & FAULT_FLAG_TRIED)) {
+                       *unlocked = true;
+                       fault_flags &= ~FAULT_FLAG_ALLOW_RETRY;
+                       fault_flags |= FAULT_FLAG_TRIED;
+                       goto retry;
+               }
+       }
+
        if (tsk) {
-               if (ret & VM_FAULT_MAJOR)
+               if (major)
                        tsk->maj_flt++;
                else
                        tsk->min_flt++;
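With the new @unlocked argument, a caller that holds mmap_sem for read can let fixup_user_fault() drop and retake the lock on a retry, provided it revalidates anything it derived from the mapping beforehand. A rough usage sketch (names and error handling are illustrative only):

static int make_page_writable(struct mm_struct *mm, unsigned long address)
{
        bool unlocked = false;
        int err;

        down_read(&mm->mmap_sem);
        err = fixup_user_fault(current, mm, address, FAULT_FLAG_WRITE, &unlocked);
        /*
         * mmap_sem is held again here even if the fault path had to retry;
         * if 'unlocked' is true, the address space may have changed in the
         * meantime and any cached vma or pte must be looked up afresh.
         */
        up_read(&mm->mmap_sem);
        return err;
}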
@@ -735,15 +839,15 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
  *      if (locked)
  *          up_read(&mm->mmap_sem);
  */
-long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
-                          unsigned long start, unsigned long nr_pages,
+long get_user_pages_locked6(unsigned long start, unsigned long nr_pages,
                           int write, int force, struct page **pages,
                           int *locked)
 {
-       return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
-                                      pages, NULL, locked, true, FOLL_TOUCH);
+       return __get_user_pages_locked(current, current->mm, start, nr_pages,
+                                      write, force, pages, NULL, locked, true,
+                                      FOLL_TOUCH);
 }
-EXPORT_SYMBOL(get_user_pages_locked);
+EXPORT_SYMBOL(get_user_pages_locked6);
 
 /*
  * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to
@@ -788,17 +892,16 @@ EXPORT_SYMBOL(__get_user_pages_unlocked);
  * or if "force" shall be set to 1 (get_user_pages_fast misses the
  * "force" parameter).
  */
-long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
-                            unsigned long start, unsigned long nr_pages,
+long get_user_pages_unlocked5(unsigned long start, unsigned long nr_pages,
                             int write, int force, struct page **pages)
 {
-       return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
-                                        force, pages, FOLL_TOUCH);
+       return __get_user_pages_unlocked(current, current->mm, start, nr_pages,
+                                        write, force, pages, FOLL_TOUCH);
 }
-EXPORT_SYMBOL(get_user_pages_unlocked);
+EXPORT_SYMBOL(get_user_pages_unlocked5);
 
 /*
- * get_user_pages() - pin user pages in memory
+ * get_user_pages_remote() - pin user pages in memory
  * @tsk:       the task_struct to use for page fault accounting, or
  *             NULL if faults are not to be recorded.
  * @mm:                mm_struct of target mm
@@ -852,14 +955,32 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
  * should use get_user_pages because it cannot pass
  * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
  */
-long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-               unsigned long start, unsigned long nr_pages, int write,
-               int force, struct page **pages, struct vm_area_struct **vmas)
+long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+               unsigned long start, unsigned long nr_pages,
+               int write, int force, struct page **pages,
+               struct vm_area_struct **vmas)
 {
        return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
-                                      pages, vmas, NULL, false, FOLL_TOUCH);
+                                      pages, vmas, NULL, false,
+                                      FOLL_TOUCH | FOLL_REMOTE);
 }
-EXPORT_SYMBOL(get_user_pages);
+EXPORT_SYMBOL(get_user_pages_remote);
+
+/*
+ * This is the same as get_user_pages_remote(), just with a
+ * less-flexible calling convention where we assume that the task
+ * and mm being operated on are the current task's.  We also
+ * obviously don't pass FOLL_REMOTE in here.
+ */
+long get_user_pages6(unsigned long start, unsigned long nr_pages,
+               int write, int force, struct page **pages,
+               struct vm_area_struct **vmas)
+{
+       return __get_user_pages_locked(current, current->mm, start, nr_pages,
+                                      write, force, pages, vmas, NULL, false,
+                                      FOLL_TOUCH);
+}
+EXPORT_SYMBOL(get_user_pages6);
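An illustrative get_user_pages_remote() caller in the style of access_process_vm(), writing into one page of another process's address space (sketch only: assumes buf fits within the page, error handling trimmed):

static int poke_remote_page(struct task_struct *tsk, struct mm_struct *mm,
                            unsigned long addr, const void *buf, size_t len)
{
        struct page *page;
        long got;
        int ret = -EFAULT;

        down_read(&mm->mmap_sem);
        got = get_user_pages_remote(tsk, mm, addr & PAGE_MASK, 1,
                                    1 /* write */, 0 /* force */, &page, NULL);
        if (got == 1) {
                void *kaddr = kmap(page);

                memcpy(kaddr + (addr & ~PAGE_MASK), buf, len);
                set_page_dirty_lock(page);
                kunmap(page);
                put_page(page);
                ret = 0;
        }
        up_read(&mm->mmap_sem);
        return ret;
}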
 
 /**
  * populate_vma_page_range() -  populate a range of pages in the vma.
@@ -896,7 +1017,6 @@ long populate_vma_page_range(struct vm_area_struct *vma,
        gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
        if (vma->vm_flags & VM_LOCKONFAULT)
                gup_flags &= ~FOLL_POPULATE;
-
        /*
         * We want to touch writable mappings with a write fault in order
         * to break COW, except for shared mappings because these don't COW
@@ -1036,9 +1156,6 @@ struct page *get_dump_page(unsigned long addr)
  *  *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
  *      pages containing page tables.
  *
- *  *) THP splits will broadcast an IPI, this can be achieved by overriding
- *      pmdp_splitting_flush.
- *
  *  *) ptes can be read atomically by the architecture.
  *
  *  *) access_ok is sufficient to validate userspace address ranges.
@@ -1066,7 +1183,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
                 * for an example see gup_get_pte in arch/x86/mm/gup.c
                 */
                pte_t pte = READ_ONCE(*ptep);
-               struct page *page;
+               struct page *head, *page;
 
                /*
                 * Similar to the PMD case below, NUMA hinting must take slow
@@ -1076,17 +1193,22 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
                        pte_protnone(pte) || (write && !pte_write(pte)))
                        goto pte_unmap;
 
+               if (!arch_pte_access_permitted(pte, write))
+                       goto pte_unmap;
+
                VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
                page = pte_page(pte);
+               head = compound_head(page);
 
-               if (!page_cache_get_speculative(page))
+               if (!page_cache_get_speculative(head))
                        goto pte_unmap;
 
                if (unlikely(pte_val(pte) != pte_val(*ptep))) {
-                       put_page(page);
+                       put_page(head);
                        goto pte_unmap;
                }
 
+               VM_BUG_ON_PAGE(compound_head(page) != head, page);
                pages[*nr] = page;
                (*nr)++;
 
@@ -1119,7 +1241,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
 {
-       struct page *head, *page, *tail;
+       struct page *head, *page;
        int refs;
 
        if (write && !pmd_write(orig))
@@ -1128,7 +1250,6 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
        refs = 0;
        head = pmd_page(orig);
        page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
-       tail = page;
        do {
                VM_BUG_ON_PAGE(compound_head(page) != head, page);
                pages[*nr] = page;
@@ -1149,24 +1270,13 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
                return 0;
        }
 
-       /*
-        * Any tail pages need their mapcount reference taken before we
-        * return. (This allows the THP code to bump their ref count when
-        * they are split into base pages).
-        */
-       while (refs--) {
-               if (PageTail(tail))
-                       get_huge_page_tail(tail);
-               tail++;
-       }
-
        return 1;
 }
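The removed tail-page loops reflect the THP refcounting rework: tail pages no longer carry their own reference counts, so every reference taken by fast gup, here and in gup_pte_range() above, lands on the head page while the precise subpage is still what gets reported to the caller. A sketch of that idiom (hypothetical helper, kernel context assumed):

static int pin_subpage(struct page *page, struct page **pages, int *nr)
{
        struct page *head = compound_head(page);

        if (!page_cache_get_speculative(head))  /* the reference goes on the head */
                return 0;
        pages[(*nr)++] = page;                  /* the subpage is what is recorded */
        return 1;
}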
 
 static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
 {
-       struct page *head, *page, *tail;
+       struct page *head, *page;
        int refs;
 
        if (write && !pud_write(orig))
@@ -1175,7 +1285,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
        refs = 0;
        head = pud_page(orig);
        page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
-       tail = page;
        do {
                VM_BUG_ON_PAGE(compound_head(page) != head, page);
                pages[*nr] = page;
@@ -1196,12 +1305,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
                return 0;
        }
 
-       while (refs--) {
-               if (PageTail(tail))
-                       get_huge_page_tail(tail);
-               tail++;
-       }
-
        return 1;
 }
 
@@ -1210,7 +1313,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
                        struct page **pages, int *nr)
 {
        int refs;
-       struct page *head, *page, *tail;
+       struct page *head, *page;
 
        if (write && !pgd_write(orig))
                return 0;
@@ -1218,7 +1321,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
        refs = 0;
        head = pgd_page(orig);
        page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
-       tail = page;
        do {
                VM_BUG_ON_PAGE(compound_head(page) != head, page);
                pages[*nr] = page;
@@ -1239,12 +1341,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
                return 0;
        }
 
-       while (refs--) {
-               if (PageTail(tail))
-                       get_huge_page_tail(tail);
-               tail++;
-       }
-
        return 1;
 }
 
@@ -1259,7 +1355,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                pmd_t pmd = READ_ONCE(*pmdp);
 
                next = pmd_addr_end(addr, end);
-               if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+               if (pmd_none(pmd))
                        return 0;
 
                if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
@@ -1423,3 +1519,38 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 }
 
 #endif /* CONFIG_HAVE_GENERIC_RCU_GUP */
+
+long get_user_pages8(struct task_struct *tsk, struct mm_struct *mm,
+                    unsigned long start, unsigned long nr_pages,
+                    int write, int force, struct page **pages,
+                    struct vm_area_struct **vmas)
+{
+       WARN_ONCE(tsk != current, "get_user_pages() called on remote task");
+       WARN_ONCE(mm != current->mm, "get_user_pages() called on remote mm");
+
+       return get_user_pages6(start, nr_pages, write, force, pages, vmas);
+}
+EXPORT_SYMBOL(get_user_pages8);
+
+long get_user_pages_locked8(struct task_struct *tsk, struct mm_struct *mm,
+                           unsigned long start, unsigned long nr_pages,
+                           int write, int force, struct page **pages, int *locked)
+{
+       WARN_ONCE(tsk != current, "get_user_pages_locked() called on remote task");
+       WARN_ONCE(mm != current->mm, "get_user_pages_locked() called on remote mm");
+
+       return get_user_pages_locked6(start, nr_pages, write, force, pages, locked);
+}
+EXPORT_SYMBOL(get_user_pages_locked8);
+
+long get_user_pages_unlocked7(struct task_struct *tsk, struct mm_struct *mm,
+                                 unsigned long start, unsigned long nr_pages,
+                                 int write, int force, struct page **pages)
+{
+       WARN_ONCE(tsk != current, "get_user_pages_unlocked() called on remote task");
+       WARN_ONCE(mm != current->mm, "get_user_pages_unlocked() called on remote mm");
+
+       return get_user_pages_unlocked5(start, nr_pages, write, force, pages);
+}
+EXPORT_SYMBOL(get_user_pages_unlocked7);
+
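The numbered exports above only work together with a header-side compatibility shim that this file opts out of via the __DISABLE_GUP_DEPRECATED define at the top. That header is not part of this diff; purely as an assumption about its shape, it could look roughly like the following (the real macros in include/linux/mm.h may dispatch differently, for example on argument count):

/* Assumed shape of the transitional shim, not the actual header. */
#ifndef __DISABLE_GUP_DEPRECATED
#define get_user_pages(...)             get_user_pages8(__VA_ARGS__)
#define get_user_pages_locked(...)      get_user_pages_locked8(__VA_ARGS__)
#define get_user_pages_unlocked(...)    get_user_pages_unlocked7(__VA_ARGS__)
#endif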