diff --git a/mm/memory.c b/mm/memory.c
index 793fe0f9841c09ce87b1d0e0a43f1b416ead6390..6bf2b471e30ca566a55160e4131bf7e7b9c3c4ea 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -68,7 +68,7 @@
 #include <asm/io.h>
 #include <asm/mmu_context.h>
 #include <asm/pgalloc.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 #include <asm/pgtable.h>
@@ -300,15 +300,14 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
        struct mmu_gather_batch *batch;
 
        VM_BUG_ON(!tlb->end);
-
-       if (!tlb->page_size)
-               tlb->page_size = page_size;
-       else {
-               if (page_size != tlb->page_size)
-                       return true;
-       }
+       VM_WARN_ON(tlb->page_size != page_size);
 
        batch = tlb->active;
+       /*
+        * Add the page and check if we are full. If so
+        * force a flush.
+        */
+       batch->pages[batch->nr++] = page;
        if (batch->nr == batch->max) {
                if (!tlb_next_batch(tlb))
                        return true;
@@ -316,7 +315,6 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
        }
        VM_BUG_ON_PAGE(batch->nr > batch->max, page);
 
-       batch->pages[batch->nr++] = page;
        return false;
 }
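
For context on this hunk: the page-size check now lives with the callers, which must announce a change of batched page size via tlb_remove_check_page_size_change() before queueing pages of a different size, as the free_pgd_range() and zap_pte_range() hunks below do. A minimal sketch of that caller-side contract follows; the helper calls mirror this patch, while example_zap_mixed_sizes() itself is hypothetical.

#include <linux/mm.h>
#include <asm/tlb.h>

/* Hypothetical caller batching pages of two different sizes. */
static void example_zap_mixed_sizes(struct mmu_gather *tlb,
                                    struct page *small_page,
                                    struct page *pmd_page)
{
        /* About to queue PAGE_SIZE pages: flush any differently sized batch. */
        tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
        if (__tlb_remove_page_size(tlb, small_page, PAGE_SIZE))
                tlb_flush_mmu(tlb);     /* page was queued but the gather is full */

        /* Switching to a PMD-sized page: announce the new size first. */
        tlb_remove_check_page_size_change(tlb, PMD_SIZE);
        if (__tlb_remove_page_size(tlb, pmd_page, PMD_SIZE))
                tlb_flush_mmu(tlb);
}
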
 
@@ -528,7 +526,11 @@ void free_pgd_range(struct mmu_gather *tlb,
                end -= PMD_SIZE;
        if (addr > end - 1)
                return;
-
+       /*
+        * We add page table cache pages with PAGE_SIZE
+        * (see pte_free_tlb()), so flush the tlb if we need to
+        * change the batched page size.
+        */
+       tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
        pgd = pgd_offset(tlb->mm, addr);
        do {
                next = pgd_addr_end(addr, end);
@@ -1118,8 +1120,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
        pte_t *start_pte;
        pte_t *pte;
        swp_entry_t entry;
-       struct page *pending_page = NULL;
 
+       tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
 again:
        init_rss_vec(rss);
        start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
@@ -1172,7 +1174,6 @@ again:
                                print_bad_pte(vma, addr, ptent, page);
                        if (unlikely(__tlb_remove_page(tlb, page))) {
                                force_flush = 1;
-                               pending_page = page;
                                addr += PAGE_SIZE;
                                break;
                        }
@@ -1213,11 +1214,6 @@ again:
        if (force_flush) {
                force_flush = 0;
                tlb_flush_mmu_free(tlb);
-               if (pending_page) {
-                       /* remove the page with new size */
-                       __tlb_remove_pte_page(tlb, pending_page);
-                       pending_page = NULL;
-               }
                if (addr != end)
                        goto again;
        }
@@ -1240,7 +1236,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
                        if (next - addr != HPAGE_PMD_SIZE) {
                                VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
                                    !rwsem_is_locked(&tlb->mm->mmap_sem), vma);
-                               split_huge_pmd(vma, pmd, addr);
+                               __split_huge_pmd(vma, pmd, addr, false, NULL);
                        } else if (zap_huge_pmd(tlb, vma, pmd, addr))
                                goto next;
                        /* fall through */
@@ -1637,8 +1633,8 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
 
        if (addr < vma->vm_start || addr >= vma->vm_end)
                return -EFAULT;
-       if (track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)))
-               return -EINVAL;
+
+       track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
 
        ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);
 
@@ -1649,11 +1645,15 @@ EXPORT_SYMBOL(vm_insert_pfn_prot);
 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                        pfn_t pfn)
 {
+       pgprot_t pgprot = vma->vm_page_prot;
+
        BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
 
        if (addr < vma->vm_start || addr >= vma->vm_end)
                return -EFAULT;
 
+       track_pfn_insert(vma, &pgprot, pfn);
+
        /*
         * If we don't have pte special, then we have to use the pfn_valid()
         * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
@@ -1670,9 +1670,9 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                 * result in pfn_t_has_page() == false.
                 */
                page = pfn_to_page(pfn_t_to_pfn(pfn));
-               return insert_page(vma, addr, page, vma->vm_page_prot);
+               return insert_page(vma, addr, page, pgprot);
        }
-       return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
+       return insert_pfn(vma, addr, pfn, pgprot);
 }
 EXPORT_SYMBOL(vm_insert_mixed);
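
The two hunks above route both vm_insert_pfn_prot() and vm_insert_mixed() through track_pfn_insert(), so PAT-adjusted protections are applied consistently; the driver-facing API is unchanged. As a hedged illustration of a typical consumer, here is a hypothetical device ->fault handler; mydev_region and the function name are invented, while the vm_insert_mixed()/__pfn_to_pfn_t() usage mirrors the code above.

#include <linux/mm.h>
#include <linux/pfn_t.h>

struct mydev_region {                   /* hypothetical driver state */
        unsigned long base_pfn;
        unsigned long nr_pages;
};

static int mydev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        struct mydev_region *region = vma->vm_private_data;
        int ret;

        if (vmf->pgoff >= region->nr_pages)
                return VM_FAULT_SIGBUS;

        ret = vm_insert_mixed(vma, vmf->address,
                              __pfn_to_pfn_t(region->base_pfn + vmf->pgoff,
                                             PFN_DEV));
        if (ret == -ENOMEM)
                return VM_FAULT_OOM;
        if (ret < 0 && ret != -EBUSY)
                return VM_FAULT_SIGBUS;
        return VM_FAULT_NOPAGE; /* PTE installed; no struct page handed back */
}
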
 
@@ -2034,20 +2034,17 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
  *
  * We do this without the lock held, so that it can sleep if it needs to.
  */
-static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
-              unsigned long address)
+static int do_page_mkwrite(struct vm_fault *vmf)
 {
-       struct vm_fault vmf;
        int ret;
+       struct page *page = vmf->page;
+       unsigned int old_flags = vmf->flags;
 
-       vmf.virtual_address = (void __user *)(address & PAGE_MASK);
-       vmf.pgoff = page->index;
-       vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
-       vmf.gfp_mask = __get_fault_gfp_mask(vma);
-       vmf.page = page;
-       vmf.cow_page = NULL;
+       vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
 
-       ret = vma->vm_ops->page_mkwrite(vma, &vmf);
+       ret = vmf->vma->vm_ops->page_mkwrite(vmf->vma, vmf);
+       /* Restore original flags so that caller is not surprised */
+       vmf->flags = old_flags;
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
                return ret;
        if (unlikely(!(ret & VM_FAULT_LOCKED))) {
@@ -2062,6 +2059,41 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
        return ret;
 }
 
+/*
+ * Handle dirtying of a page in shared file mapping on a write fault.
+ *
+ * The function expects the page to be locked and unlocks it.
+ */
+static void fault_dirty_shared_page(struct vm_area_struct *vma,
+                                   struct page *page)
+{
+       struct address_space *mapping;
+       bool dirtied;
+       bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
+
+       dirtied = set_page_dirty(page);
+       VM_BUG_ON_PAGE(PageAnon(page), page);
+       /*
+        * Take a local copy of the address_space - page.mapping may be zeroed
+        * by truncate after unlock_page().   The address_space itself remains
+        * pinned by vma->vm_file's reference.  We rely on unlock_page()'s
+        * release semantics to prevent the compiler from undoing this copying.
+        */
+       mapping = page_rmapping(page);
+       unlock_page(page);
+
+       if ((dirtied || page_mkwrite) && mapping) {
+               /*
+                * Some device drivers do not set page.mapping
+                * but still dirty their pages
+                */
+               balance_dirty_pages_ratelimited(mapping);
+       }
+
+       if (!page_mkwrite)
+               file_update_time(vma->vm_file);
+}
+
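
do_page_mkwrite() and fault_dirty_shared_page() above encode the expectations placed on a filesystem's ->page_mkwrite() handler. Below is a minimal sketch of such a handler under the (vma, vmf) prototype used in this tree, closely following the generic filemap_page_mkwrite() pattern; myfs_page_mkwrite() is a hypothetical name, and a real filesystem would additionally reserve blocks and take freeze protection here.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

static int myfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        struct page *page = vmf->page;
        struct inode *inode = file_inode(vma->vm_file);

        lock_page(page);
        if (page->mapping != inode->i_mapping) {
                /* Truncated under us: tell the core to retry the fault. */
                unlock_page(page);
                return VM_FAULT_NOPAGE;
        }
        /* Dirty now so writeback during a freeze sees the page as dirty. */
        set_page_dirty(page);
        wait_for_stable_page(page);
        return VM_FAULT_LOCKED; /* page returned locked, as do_page_mkwrite() expects */
}
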
 /*
  * Handle write page faults for pages that can be reused in the current vma
  *
@@ -2070,11 +2102,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
  * case, all we need to do here is to mark the page as writable and update
  * any related book-keeping.
  */
-static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
-                       struct page *page, int page_mkwrite, int dirty_shared)
-       __releases(fe->ptl)
+static inline void wp_page_reuse(struct vm_fault *vmf)
+       __releases(vmf->ptl)
 {
-       struct vm_area_struct *vma = fe->vma;
+       struct vm_area_struct *vma = vmf->vma;
+       struct page *page = vmf->page;
        pte_t entry;
        /*
         * Clear the pages cpupid information as the existing
@@ -2084,39 +2116,12 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
        if (page)
                page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
 
-       flush_cache_page(vma, fe->address, pte_pfn(orig_pte));
-       entry = pte_mkyoung(orig_pte);
+       flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
+       entry = pte_mkyoung(vmf->orig_pte);
        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-       if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1))
-               update_mmu_cache(vma, fe->address, fe->pte);
-       pte_unmap_unlock(fe->pte, fe->ptl);
-
-       if (dirty_shared) {
-               struct address_space *mapping;
-               int dirtied;
-
-               if (!page_mkwrite)
-                       lock_page(page);
-
-               dirtied = set_page_dirty(page);
-               VM_BUG_ON_PAGE(PageAnon(page), page);
-               mapping = page->mapping;
-               unlock_page(page);
-               put_page(page);
-
-               if ((dirtied || page_mkwrite) && mapping) {
-                       /*
-                        * Some device drivers do not set page.mapping
-                        * but still dirty their pages
-                        */
-                       balance_dirty_pages_ratelimited(mapping);
-               }
-
-               if (!page_mkwrite)
-                       file_update_time(vma->vm_file);
-       }
-
-       return VM_FAULT_WRITE;
+       if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
+               update_mmu_cache(vma, vmf->address, vmf->pte);
+       pte_unmap_unlock(vmf->pte, vmf->ptl);
 }
 
 /*
@@ -2135,31 +2140,32 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
  *   held to the old page, as well as updating the rmap.
  * - In any case, unlock the PTL and drop the reference we took to the old page.
  */
-static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
-               struct page *old_page)
+static int wp_page_copy(struct vm_fault *vmf)
 {
-       struct vm_area_struct *vma = fe->vma;
+       struct vm_area_struct *vma = vmf->vma;
        struct mm_struct *mm = vma->vm_mm;
+       struct page *old_page = vmf->page;
        struct page *new_page = NULL;
        pte_t entry;
        int page_copied = 0;
-       const unsigned long mmun_start = fe->address & PAGE_MASK;
+       const unsigned long mmun_start = vmf->address & PAGE_MASK;
        const unsigned long mmun_end = mmun_start + PAGE_SIZE;
        struct mem_cgroup *memcg;
 
        if (unlikely(anon_vma_prepare(vma)))
                goto oom;
 
-       if (is_zero_pfn(pte_pfn(orig_pte))) {
-               new_page = alloc_zeroed_user_highpage_movable(vma, fe->address);
+       if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
+               new_page = alloc_zeroed_user_highpage_movable(vma,
+                                                             vmf->address);
                if (!new_page)
                        goto oom;
        } else {
                new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
-                               fe->address);
+                               vmf->address);
                if (!new_page)
                        goto oom;
-               cow_user_page(new_page, old_page, fe->address, vma);
+               cow_user_page(new_page, old_page, vmf->address, vma);
        }
 
        if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
@@ -2172,8 +2178,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
        /*
         * Re-check the pte - we dropped the lock
         */
-       fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl);
-       if (likely(pte_same(*fe->pte, orig_pte))) {
+       vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
+       if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
                if (old_page) {
                        if (!PageAnon(old_page)) {
                                dec_mm_counter_fast(mm,
@@ -2183,7 +2189,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
                } else {
                        inc_mm_counter_fast(mm, MM_ANONPAGES);
                }
-               flush_cache_page(vma, fe->address, pte_pfn(orig_pte));
+               flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
                entry = mk_pte(new_page, vma->vm_page_prot);
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                /*
@@ -2192,8 +2198,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
                 * seen in the presence of one thread doing SMC and another
                 * thread doing COW.
                 */
-               ptep_clear_flush_notify(vma, fe->address, fe->pte);
-               page_add_new_anon_rmap(new_page, vma, fe->address, false);
+               ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
+               page_add_new_anon_rmap(new_page, vma, vmf->address, false);
                mem_cgroup_commit_charge(new_page, memcg, false, false);
                lru_cache_add_active_or_unevictable(new_page, vma);
                /*
@@ -2201,8 +2207,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
                 * mmu page tables (such as kvm shadow page tables), we want the
                 * new page to be mapped directly into the secondary page table.
                 */
-               set_pte_at_notify(mm, fe->address, fe->pte, entry);
-               update_mmu_cache(vma, fe->address, fe->pte);
+               set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
+               update_mmu_cache(vma, vmf->address, vmf->pte);
                if (old_page) {
                        /*
                         * Only after switching the pte to the new page may
@@ -2239,7 +2245,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
        if (new_page)
                put_page(new_page);
 
-       pte_unmap_unlock(fe->pte, fe->ptl);
+       pte_unmap_unlock(vmf->pte, vmf->ptl);
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
        if (old_page) {
                /*
@@ -2263,79 +2269,91 @@ oom:
        return VM_FAULT_OOM;
 }
 
+/**
+ * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
+ *                       writeable once the page is prepared
+ *
+ * @vmf: structure describing the fault
+ *
+ * This function handles all that is needed to finish a write page fault in a
+ * shared mapping due to PTE being read-only once the mapped page is prepared.
+ * It handles locking of the PTE and modifying it. The function returns 0 on
+ * success, VM_FAULT_NOPAGE when the PTE changed before we acquired the PTE
+ * lock.
+ *
+ * The function expects the page to be locked or other protection against
+ * concurrent faults / writeback (such as DAX radix tree locks).
+ */
+int finish_mkwrite_fault(struct vm_fault *vmf)
+{
+       WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
+       vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
+                                      &vmf->ptl);
+       /*
+        * We might have raced with another page fault while we released the
+        * pte_offset_map_lock.
+        */
+       if (!pte_same(*vmf->pte, vmf->orig_pte)) {
+               pte_unmap_unlock(vmf->pte, vmf->ptl);
+               return VM_FAULT_NOPAGE;
+       }
+       wp_page_reuse(vmf);
+       return 0;
+}
+
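
finish_mkwrite_fault() is what wp_pfn_shared() and wp_page_shared() below now call once the driver's mkwrite hook has prepared the mapping. As a hedged illustration of the hook side, here is a trivial ->pfn_mkwrite() handler (hypothetical name): returning 0 lets the core make the PTE writable via finish_mkwrite_fault(), while with this patch a VM_FAULT_NOPAGE return (for example from a handler that installed the PTE itself) short-circuits that step.

#include <linux/fs.h>
#include <linux/mm.h>

static int mydev_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        /* Bookkeeping before the pfn mapping becomes writable, e.g.: */
        file_update_time(vma->vm_file);
        return 0;       /* proceed to finish_mkwrite_fault() */
}
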
 /*
  * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
  * mapping
  */
-static int wp_pfn_shared(struct fault_env *fe,  pte_t orig_pte)
+static int wp_pfn_shared(struct vm_fault *vmf)
 {
-       struct vm_area_struct *vma = fe->vma;
+       struct vm_area_struct *vma = vmf->vma;
 
        if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
-               struct vm_fault vmf = {
-                       .page = NULL,
-                       .pgoff = linear_page_index(vma, fe->address),
-                       .virtual_address =
-                               (void __user *)(fe->address & PAGE_MASK),
-                       .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
-               };
                int ret;
 
-               pte_unmap_unlock(fe->pte, fe->ptl);
-               ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
-               if (ret & VM_FAULT_ERROR)
+               pte_unmap_unlock(vmf->pte, vmf->ptl);
+               vmf->flags |= FAULT_FLAG_MKWRITE;
+               ret = vma->vm_ops->pfn_mkwrite(vma, vmf);
+               if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
                        return ret;
-               fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
-                               &fe->ptl);
-               /*
-                * We might have raced with another page fault while we
-                * released the pte_offset_map_lock.
-                */
-               if (!pte_same(*fe->pte, orig_pte)) {
-                       pte_unmap_unlock(fe->pte, fe->ptl);
-                       return 0;
-               }
+               return finish_mkwrite_fault(vmf);
        }
-       return wp_page_reuse(fe, orig_pte, NULL, 0, 0);
+       wp_page_reuse(vmf);
+       return VM_FAULT_WRITE;
 }
 
-static int wp_page_shared(struct fault_env *fe, pte_t orig_pte,
-               struct page *old_page)
-       __releases(fe->ptl)
+static int wp_page_shared(struct vm_fault *vmf)
+       __releases(vmf->ptl)
 {
-       struct vm_area_struct *vma = fe->vma;
-       int page_mkwrite = 0;
+       struct vm_area_struct *vma = vmf->vma;
 
-       get_page(old_page);
+       get_page(vmf->page);
 
        if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
                int tmp;
 
-               pte_unmap_unlock(fe->pte, fe->ptl);
-               tmp = do_page_mkwrite(vma, old_page, fe->address);
+               pte_unmap_unlock(vmf->pte, vmf->ptl);
+               tmp = do_page_mkwrite(vmf);
                if (unlikely(!tmp || (tmp &
                                      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
-                       put_page(old_page);
+                       put_page(vmf->page);
                        return tmp;
                }
-               /*
-                * Since we dropped the lock we need to revalidate
-                * the PTE as someone else may have changed it.  If
-                * they did, we just return, as we can count on the
-                * MMU to tell us if they didn't also make it writable.
-                */
-               fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
-                                                &fe->ptl);
-               if (!pte_same(*fe->pte, orig_pte)) {
-                       unlock_page(old_page);
-                       pte_unmap_unlock(fe->pte, fe->ptl);
-                       put_page(old_page);
-                       return 0;
+               tmp = finish_mkwrite_fault(vmf);
+               if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
+                       unlock_page(vmf->page);
+                       put_page(vmf->page);
+                       return tmp;
                }
-               page_mkwrite = 1;
+       } else {
+               wp_page_reuse(vmf);
+               lock_page(vmf->page);
        }
+       fault_dirty_shared_page(vma, vmf->page);
+       put_page(vmf->page);
 
-       return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1);
+       return VM_FAULT_WRITE;
 }
 
 /*
@@ -2356,14 +2374,13 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte,
  * but allow concurrent faults), with pte both mapped and locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
-       __releases(fe->ptl)
+static int do_wp_page(struct vm_fault *vmf)
+       __releases(vmf->ptl)
 {
-       struct vm_area_struct *vma = fe->vma;
-       struct page *old_page;
+       struct vm_area_struct *vma = vmf->vma;
 
-       old_page = vm_normal_page(vma, fe->address, orig_pte);
-       if (!old_page) {
+       vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
+       if (!vmf->page) {
                /*
                 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
                 * VM_PFNMAP VMA.
@@ -2373,33 +2390,33 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
                 */
                if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
                                     (VM_WRITE|VM_SHARED))
-                       return wp_pfn_shared(fe, orig_pte);
+                       return wp_pfn_shared(vmf);
 
-               pte_unmap_unlock(fe->pte, fe->ptl);
-               return wp_page_copy(fe, orig_pte, old_page);
+               pte_unmap_unlock(vmf->pte, vmf->ptl);
+               return wp_page_copy(vmf);
        }
 
        /*
         * Take out anonymous pages first, anonymous shared vmas are
         * not dirty accountable.
         */
-       if (PageAnon(old_page) && !PageKsm(old_page)) {
+       if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
                int total_mapcount;
-               if (!trylock_page(old_page)) {
-                       get_page(old_page);
-                       pte_unmap_unlock(fe->pte, fe->ptl);
-                       lock_page(old_page);
-                       fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
-                                       fe->address, &fe->ptl);
-                       if (!pte_same(*fe->pte, orig_pte)) {
-                               unlock_page(old_page);
-                               pte_unmap_unlock(fe->pte, fe->ptl);
-                               put_page(old_page);
+               if (!trylock_page(vmf->page)) {
+                       get_page(vmf->page);
+                       pte_unmap_unlock(vmf->pte, vmf->ptl);
+                       lock_page(vmf->page);
+                       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+                                       vmf->address, &vmf->ptl);
+                       if (!pte_same(*vmf->pte, vmf->orig_pte)) {
+                               unlock_page(vmf->page);
+                               pte_unmap_unlock(vmf->pte, vmf->ptl);
+                               put_page(vmf->page);
                                return 0;
                        }
-                       put_page(old_page);
+                       put_page(vmf->page);
                }
-               if (reuse_swap_page(old_page, &total_mapcount)) {
+               if (reuse_swap_page(vmf->page, &total_mapcount)) {
                        if (total_mapcount == 1) {
                                /*
                                 * The page is all ours. Move it to
@@ -2408,24 +2425,25 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
                                 * Protected against the rmap code by
                                 * the page lock.
                                 */
-                               page_move_anon_rmap(old_page, vma);
+                               page_move_anon_rmap(vmf->page, vma);
                        }
-                       unlock_page(old_page);
-                       return wp_page_reuse(fe, orig_pte, old_page, 0, 0);
+                       unlock_page(vmf->page);
+                       wp_page_reuse(vmf);
+                       return VM_FAULT_WRITE;
                }
-               unlock_page(old_page);
+               unlock_page(vmf->page);
        } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
                                        (VM_WRITE|VM_SHARED))) {
-               return wp_page_shared(fe, orig_pte, old_page);
+               return wp_page_shared(vmf);
        }
 
        /*
         * Ok, we need to copy. Oh, well..
         */
-       get_page(old_page);
+       get_page(vmf->page);
 
-       pte_unmap_unlock(fe->pte, fe->ptl);
-       return wp_page_copy(fe, orig_pte, old_page);
+       pte_unmap_unlock(vmf->pte, vmf->ptl);
+       return wp_page_copy(vmf);
 }
 
 static void unmap_mapping_range_vma(struct vm_area_struct *vma,
@@ -2513,9 +2531,9 @@ EXPORT_SYMBOL(unmap_mapping_range);
  * We return with the mmap_sem locked or unlocked in the same cases
  * as does filemap_fault().
  */
-int do_swap_page(struct fault_env *fe, pte_t orig_pte)
+int do_swap_page(struct vm_fault *vmf)
 {
-       struct vm_area_struct *vma = fe->vma;
+       struct vm_area_struct *vma = vmf->vma;
        struct page *page, *swapcache;
        struct mem_cgroup *memcg;
        swp_entry_t entry;
@@ -2524,17 +2542,18 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
        int exclusive = 0;
        int ret = 0;
 
-       if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte))
+       if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
                goto out;
 
-       entry = pte_to_swp_entry(orig_pte);
+       entry = pte_to_swp_entry(vmf->orig_pte);
        if (unlikely(non_swap_entry(entry))) {
                if (is_migration_entry(entry)) {
-                       migration_entry_wait(vma->vm_mm, fe->pmd, fe->address);
+                       migration_entry_wait(vma->vm_mm, vmf->pmd,
+                                            vmf->address);
                } else if (is_hwpoison_entry(entry)) {
                        ret = VM_FAULT_HWPOISON;
                } else {
-                       print_bad_pte(vma, fe->address, orig_pte, NULL);
+                       print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
                        ret = VM_FAULT_SIGBUS;
                }
                goto out;
@@ -2542,16 +2561,16 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
        delayacct_set_flag(DELAYACCT_PF_SWAPIN);
        page = lookup_swap_cache(entry);
        if (!page) {
-               page = swapin_readahead(entry,
-                                       GFP_HIGHUSER_MOVABLE, vma, fe->address);
+               page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma,
+                                       vmf->address);
                if (!page) {
                        /*
                         * Back out if somebody else faulted in this pte
                         * while we released the pte lock.
                         */
-                       fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
-                                       fe->address, &fe->ptl);
-                       if (likely(pte_same(*fe->pte, orig_pte)))
+                       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+                                       vmf->address, &vmf->ptl);
+                       if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
                                ret = VM_FAULT_OOM;
                        delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
                        goto unlock;
@@ -2573,7 +2592,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
        }
 
        swapcache = page;
-       locked = lock_page_or_retry(page, vma->vm_mm, fe->flags);
+       locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
 
        delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
        if (!locked) {
@@ -2590,7 +2609,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
        if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
                goto out_page;
 
-       page = ksm_might_need_to_copy(page, vma, fe->address);
+       page = ksm_might_need_to_copy(page, vma, vmf->address);
        if (unlikely(!page)) {
                ret = VM_FAULT_OOM;
                page = swapcache;
@@ -2606,9 +2625,9 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
        /*
         * Back out if somebody else already faulted in this pte.
         */
-       fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
-                       &fe->ptl);
-       if (unlikely(!pte_same(*fe->pte, orig_pte)))
+       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+                       &vmf->ptl);
+       if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
                goto out_nomap;
 
        if (unlikely(!PageUptodate(page))) {
@@ -2629,22 +2648,23 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
        inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
        dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
        pte = mk_pte(page, vma->vm_page_prot);
-       if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
+       if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
                pte = maybe_mkwrite(pte_mkdirty(pte), vma);
-               fe->flags &= ~FAULT_FLAG_WRITE;
+               vmf->flags &= ~FAULT_FLAG_WRITE;
                ret |= VM_FAULT_WRITE;
                exclusive = RMAP_EXCLUSIVE;
        }
        flush_icache_page(vma, page);
-       if (pte_swp_soft_dirty(orig_pte))
+       if (pte_swp_soft_dirty(vmf->orig_pte))
                pte = pte_mksoft_dirty(pte);
-       set_pte_at(vma->vm_mm, fe->address, fe->pte, pte);
+       set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
+       vmf->orig_pte = pte;
        if (page == swapcache) {
-               do_page_add_anon_rmap(page, vma, fe->address, exclusive);
+               do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
                mem_cgroup_commit_charge(page, memcg, true, false);
                activate_page(page);
        } else { /* ksm created a completely new copy */
-               page_add_new_anon_rmap(page, vma, fe->address, false);
+               page_add_new_anon_rmap(page, vma, vmf->address, false);
                mem_cgroup_commit_charge(page, memcg, false, false);
                lru_cache_add_active_or_unevictable(page, vma);
        }
@@ -2667,22 +2687,22 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
                put_page(swapcache);
        }
 
-       if (fe->flags & FAULT_FLAG_WRITE) {
-               ret |= do_wp_page(fe, pte);
+       if (vmf->flags & FAULT_FLAG_WRITE) {
+               ret |= do_wp_page(vmf);
                if (ret & VM_FAULT_ERROR)
                        ret &= VM_FAULT_ERROR;
                goto out;
        }
 
        /* No need to invalidate - it was non-present before */
-       update_mmu_cache(vma, fe->address, fe->pte);
+       update_mmu_cache(vma, vmf->address, vmf->pte);
 unlock:
-       pte_unmap_unlock(fe->pte, fe->ptl);
+       pte_unmap_unlock(vmf->pte, vmf->ptl);
 out:
        return ret;
 out_nomap:
        mem_cgroup_cancel_charge(page, memcg, false);
-       pte_unmap_unlock(fe->pte, fe->ptl);
+       pte_unmap_unlock(vmf->pte, vmf->ptl);
 out_page:
        unlock_page(page);
 out_release:
@@ -2733,9 +2753,9 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
  * but allow concurrent faults), and pte mapped but not yet locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-static int do_anonymous_page(struct fault_env *fe)
+static int do_anonymous_page(struct vm_fault *vmf)
 {
-       struct vm_area_struct *vma = fe->vma;
+       struct vm_area_struct *vma = vmf->vma;
        struct mem_cgroup *memcg;
        struct page *page;
        pte_t entry;
@@ -2745,7 +2765,7 @@ static int do_anonymous_page(struct fault_env *fe)
                return VM_FAULT_SIGBUS;
 
        /* Check if we need to add a guard page to the stack */
-       if (check_stack_guard_page(vma, fe->address) < 0)
+       if (check_stack_guard_page(vma, vmf->address) < 0)
                return VM_FAULT_SIGSEGV;
 
        /*
@@ -2758,26 +2778,26 @@ static int do_anonymous_page(struct fault_env *fe)
         *
         * Here we only have down_read(mmap_sem).
         */
-       if (pte_alloc(vma->vm_mm, fe->pmd, fe->address))
+       if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))
                return VM_FAULT_OOM;
 
        /* See the comment in pte_alloc_one_map() */
-       if (unlikely(pmd_trans_unstable(fe->pmd)))
+       if (unlikely(pmd_trans_unstable(vmf->pmd)))
                return 0;
 
        /* Use the zero-page for reads */
-       if (!(fe->flags & FAULT_FLAG_WRITE) &&
+       if (!(vmf->flags & FAULT_FLAG_WRITE) &&
                        !mm_forbids_zeropage(vma->vm_mm)) {
-               entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address),
+               entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
                                                vma->vm_page_prot));
-               fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
-                               &fe->ptl);
-               if (!pte_none(*fe->pte))
+               vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+                               vmf->address, &vmf->ptl);
+               if (!pte_none(*vmf->pte))
                        goto unlock;
                /* Deliver the page fault to userland, check inside PT lock */
                if (userfaultfd_missing(vma)) {
-                       pte_unmap_unlock(fe->pte, fe->ptl);
-                       return handle_userfault(fe, VM_UFFD_MISSING);
+                       pte_unmap_unlock(vmf->pte, vmf->ptl);
+                       return handle_userfault(vmf, VM_UFFD_MISSING);
                }
                goto setpte;
        }
@@ -2785,7 +2805,7 @@ static int do_anonymous_page(struct fault_env *fe)
        /* Allocate our own private page. */
        if (unlikely(anon_vma_prepare(vma)))
                goto oom;
-       page = alloc_zeroed_user_highpage_movable(vma, fe->address);
+       page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
        if (!page)
                goto oom;
 
@@ -2803,30 +2823,30 @@ static int do_anonymous_page(struct fault_env *fe)
        if (vma->vm_flags & VM_WRITE)
                entry = pte_mkwrite(pte_mkdirty(entry));
 
-       fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
-                       &fe->ptl);
-       if (!pte_none(*fe->pte))
+       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+                       &vmf->ptl);
+       if (!pte_none(*vmf->pte))
                goto release;
 
        /* Deliver the page fault to userland, check inside PT lock */
        if (userfaultfd_missing(vma)) {
-               pte_unmap_unlock(fe->pte, fe->ptl);
+               pte_unmap_unlock(vmf->pte, vmf->ptl);
                mem_cgroup_cancel_charge(page, memcg, false);
                put_page(page);
-               return handle_userfault(fe, VM_UFFD_MISSING);
+               return handle_userfault(vmf, VM_UFFD_MISSING);
        }
 
        inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
-       page_add_new_anon_rmap(page, vma, fe->address, false);
+       page_add_new_anon_rmap(page, vma, vmf->address, false);
        mem_cgroup_commit_charge(page, memcg, false, false);
        lru_cache_add_active_or_unevictable(page, vma);
 setpte:
-       set_pte_at(vma->vm_mm, fe->address, fe->pte, entry);
+       set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
 
        /* No need to invalidate - it was non-present before */
-       update_mmu_cache(vma, fe->address, fe->pte);
+       update_mmu_cache(vma, vmf->address, vmf->pte);
 unlock:
-       pte_unmap_unlock(fe->pte, fe->ptl);
+       pte_unmap_unlock(vmf->pte, vmf->ptl);
        return 0;
 release:
        mem_cgroup_cancel_charge(page, memcg, false);
@@ -2843,62 +2863,50 @@ oom:
  * released depending on flags and vma->vm_ops->fault() return value.
  * See filemap_fault() and __lock_page_retry().
  */
-static int __do_fault(struct fault_env *fe, pgoff_t pgoff,
-               struct page *cow_page, struct page **page, void **entry)
+static int __do_fault(struct vm_fault *vmf)
 {
-       struct vm_area_struct *vma = fe->vma;
-       struct vm_fault vmf;
+       struct vm_area_struct *vma = vmf->vma;
        int ret;
 
-       vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK);
-       vmf.pgoff = pgoff;
-       vmf.flags = fe->flags;
-       vmf.page = NULL;
-       vmf.gfp_mask = __get_fault_gfp_mask(vma);
-       vmf.cow_page = cow_page;
-
-       ret = vma->vm_ops->fault(vma, &vmf);
-       if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
-               return ret;
-       if (ret & VM_FAULT_DAX_LOCKED) {
-               *entry = vmf.entry;
+       ret = vma->vm_ops->fault(vma, vmf);
+       if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
+                           VM_FAULT_DONE_COW)))
                return ret;
-       }
 
-       if (unlikely(PageHWPoison(vmf.page))) {
+       if (unlikely(PageHWPoison(vmf->page))) {
                if (ret & VM_FAULT_LOCKED)
-                       unlock_page(vmf.page);
-               put_page(vmf.page);
+                       unlock_page(vmf->page);
+               put_page(vmf->page);
+               vmf->page = NULL;
                return VM_FAULT_HWPOISON;
        }
 
        if (unlikely(!(ret & VM_FAULT_LOCKED)))
-               lock_page(vmf.page);
+               lock_page(vmf->page);
        else
-               VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
+               VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
 
-       *page = vmf.page;
        return ret;
 }
 
-static int pte_alloc_one_map(struct fault_env *fe)
+static int pte_alloc_one_map(struct vm_fault *vmf)
 {
-       struct vm_area_struct *vma = fe->vma;
+       struct vm_area_struct *vma = vmf->vma;
 
-       if (!pmd_none(*fe->pmd))
+       if (!pmd_none(*vmf->pmd))
                goto map_pte;
-       if (fe->prealloc_pte) {
-               fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
-               if (unlikely(!pmd_none(*fe->pmd))) {
-                       spin_unlock(fe->ptl);
+       if (vmf->prealloc_pte) {
+               vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+               if (unlikely(!pmd_none(*vmf->pmd))) {
+                       spin_unlock(vmf->ptl);
                        goto map_pte;
                }
 
                atomic_long_inc(&vma->vm_mm->nr_ptes);
-               pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte);
-               spin_unlock(fe->ptl);
-               fe->prealloc_pte = 0;
-       } else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) {
+               pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
+               spin_unlock(vmf->ptl);
+               vmf->prealloc_pte = 0;
+       } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) {
                return VM_FAULT_OOM;
        }
 map_pte:
@@ -2913,11 +2921,11 @@ map_pte:
         * through an atomic read in C, which is what pmd_trans_unstable()
         * provides.
         */
-       if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+       if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd))
                return VM_FAULT_NOPAGE;
 
-       fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
-                       &fe->ptl);
+       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+                       &vmf->ptl);
        return 0;
 }
 
@@ -2935,11 +2943,24 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
        return true;
 }
 
-static int do_set_pmd(struct fault_env *fe, struct page *page)
+static void deposit_prealloc_pte(struct vm_fault *vmf)
+{
+       struct vm_area_struct *vma = vmf->vma;
+
+       pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
+       /*
+        * We are going to consume the prealloc table,
+        * count that as nr_ptes.
+        */
+       atomic_long_inc(&vma->vm_mm->nr_ptes);
+       vmf->prealloc_pte = 0;
+}
+
+static int do_set_pmd(struct vm_fault *vmf, struct page *page)
 {
-       struct vm_area_struct *vma = fe->vma;
-       bool write = fe->flags & FAULT_FLAG_WRITE;
-       unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+       struct vm_area_struct *vma = vmf->vma;
+       bool write = vmf->flags & FAULT_FLAG_WRITE;
+       unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        pmd_t entry;
        int i, ret;
 
@@ -2949,8 +2970,19 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
        ret = VM_FAULT_FALLBACK;
        page = compound_head(page);
 
-       fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
-       if (unlikely(!pmd_none(*fe->pmd)))
+       /*
+        * Archs like ppc64 need additional space to store information
+        * related to the pte entry. Use the preallocated table for that.
+        */
+       if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
+               vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address);
+               if (!vmf->prealloc_pte)
+                       return VM_FAULT_OOM;
+               smp_wmb(); /* See comment in __pte_alloc() */
+       }
+
+       vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+       if (unlikely(!pmd_none(*vmf->pmd)))
                goto out;
 
        for (i = 0; i < HPAGE_PMD_NR; i++)
@@ -2962,20 +2994,25 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
 
        add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR);
        page_add_file_rmap(page, true);
+       /*
+        * deposit and withdraw with pmd lock held
+        */
+       if (arch_needs_pgtable_deposit())
+               deposit_prealloc_pte(vmf);
 
-       set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
+       set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
 
-       update_mmu_cache_pmd(vma, haddr, fe->pmd);
+       update_mmu_cache_pmd(vma, haddr, vmf->pmd);
 
        /* fault is handled */
        ret = 0;
        count_vm_event(THP_FILE_MAPPED);
 out:
-       spin_unlock(fe->ptl);
+       spin_unlock(vmf->ptl);
        return ret;
 }
 #else
-static int do_set_pmd(struct fault_env *fe, struct page *page)
+static int do_set_pmd(struct vm_fault *vmf, struct page *page)
 {
        BUILD_BUG();
        return 0;
@@ -2986,41 +3023,42 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
  * alloc_set_pte - setup new PTE entry for given page and add reverse page
 * mapping. If needed, the function allocates a page table or uses the
 * pre-allocated one.
  *
- * @fe: fault environment
+ * @vmf: fault environment
  * @memcg: memcg to charge page (only for private mappings)
  * @page: page to map
  *
- * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return.
+ * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
+ * return.
  *
  * Target users are page handler itself and implementations of
  * vm_ops->map_pages.
  */
-int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
+int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
                struct page *page)
 {
-       struct vm_area_struct *vma = fe->vma;
-       bool write = fe->flags & FAULT_FLAG_WRITE;
+       struct vm_area_struct *vma = vmf->vma;
+       bool write = vmf->flags & FAULT_FLAG_WRITE;
        pte_t entry;
        int ret;
 
-       if (pmd_none(*fe->pmd) && PageTransCompound(page) &&
+       if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
                        IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
                /* THP on COW? */
                VM_BUG_ON_PAGE(memcg, page);
 
-               ret = do_set_pmd(fe, page);
+               ret = do_set_pmd(vmf, page);
                if (ret != VM_FAULT_FALLBACK)
                        return ret;
        }
 
-       if (!fe->pte) {
-               ret = pte_alloc_one_map(fe);
+       if (!vmf->pte) {
+               ret = pte_alloc_one_map(vmf);
                if (ret)
                        return ret;
        }
 
        /* Re-check under ptl */
-       if (unlikely(!pte_none(*fe->pte)))
+       if (unlikely(!pte_none(*vmf->pte)))
                return VM_FAULT_NOPAGE;
 
        flush_icache_page(vma, page);
@@ -3030,21 +3068,53 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
        /* copy-on-write page */
        if (write && !(vma->vm_flags & VM_SHARED)) {
                inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
-               page_add_new_anon_rmap(page, vma, fe->address, false);
+               page_add_new_anon_rmap(page, vma, vmf->address, false);
                mem_cgroup_commit_charge(page, memcg, false, false);
                lru_cache_add_active_or_unevictable(page, vma);
        } else {
                inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
                page_add_file_rmap(page, false);
        }
-       set_pte_at(vma->vm_mm, fe->address, fe->pte, entry);
+       set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
 
        /* no need to invalidate: a not-present page won't be cached */
-       update_mmu_cache(vma, fe->address, fe->pte);
+       update_mmu_cache(vma, vmf->address, vmf->pte);
 
        return 0;
 }
 
+
+/**
+ * finish_fault - finish page fault once we have prepared the page to fault
+ *
+ * @vmf: structure describing the fault
+ *
+ * This function handles all that is needed to finish a page fault once the
+ * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
+ * given page, adds reverse page mapping, handles memcg charges and LRU
+ * addition. The function returns 0 on success, VM_FAULT_ code in case of
+ * error.
+ *
+ * The function expects the page to be locked and on success it consumes a
+ * reference of a page being mapped (for the PTE which maps it).
+ */
+int finish_fault(struct vm_fault *vmf)
+{
+       struct page *page;
+       int ret;
+
+       /* Did we COW the page? */
+       if ((vmf->flags & FAULT_FLAG_WRITE) &&
+           !(vmf->vma->vm_flags & VM_SHARED))
+               page = vmf->cow_page;
+       else
+               page = vmf->page;
+       ret = alloc_set_pte(vmf, vmf->memcg, page);
+       if (vmf->pte)
+               pte_unmap_unlock(vmf->pte, vmf->ptl);
+       return ret;
+}
+
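
finish_fault() consumes what ->fault() produced. As a hedged sketch of the producing side, here is a hypothetical driver ->fault() handler that hands back a referenced page in vmf->page and returns 0; __do_fault() then locks the page and do_read_fault()/do_shared_fault() complete the mapping through finish_fault(). struct mybuf and the handler name are invented.

#include <linux/mm.h>
#include <linux/pagemap.h>

struct mybuf {                          /* hypothetical: pages pinned at mmap time */
        unsigned long nr_pages;
        struct page **pages;
};

static int mybuf_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        struct mybuf *buf = vma->vm_private_data;

        if (vmf->pgoff >= buf->nr_pages)
                return VM_FAULT_SIGBUS;

        vmf->page = buf->pages[vmf->pgoff];
        get_page(vmf->page);    /* reference consumed when the PTE is installed */
        return 0;               /* not VM_FAULT_LOCKED: __do_fault() will lock it */
}
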
 static unsigned long fault_around_bytes __read_mostly =
        rounddown_pow_of_two(65536);
 
@@ -3109,17 +3179,18 @@ late_initcall(fault_around_debugfs);
  * fault_around_pages() value (and therefore to page order).  This way it's
  * easier to guarantee that we don't cross page table boundaries.
  */
-static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
+static int do_fault_around(struct vm_fault *vmf)
 {
-       unsigned long address = fe->address, nr_pages, mask;
+       unsigned long address = vmf->address, nr_pages, mask;
+       pgoff_t start_pgoff = vmf->pgoff;
        pgoff_t end_pgoff;
        int off, ret = 0;
 
        nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
        mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
 
-       fe->address = max(address & mask, fe->vma->vm_start);
-       off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+       vmf->address = max(address & mask, vmf->vma->vm_start);
+       off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
        start_pgoff -= off;
 
        /*
@@ -3127,50 +3198,45 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
         *  or fault_around_pages() from start_pgoff, depending what is nearest.
         */
        end_pgoff = start_pgoff -
-               ((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
+               ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
                PTRS_PER_PTE - 1;
-       end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1,
+       end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
                        start_pgoff + nr_pages - 1);
 
-       if (pmd_none(*fe->pmd)) {
-               fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address);
-               if (!fe->prealloc_pte)
+       if (pmd_none(*vmf->pmd)) {
+               vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm,
+                                                 vmf->address);
+               if (!vmf->prealloc_pte)
                        goto out;
                smp_wmb(); /* See comment in __pte_alloc() */
        }
 
-       fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff);
+       vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
 
-       /* preallocated pagetable is unused: free it */
-       if (fe->prealloc_pte) {
-               pte_free(fe->vma->vm_mm, fe->prealloc_pte);
-               fe->prealloc_pte = 0;
-       }
        /* Huge page is mapped? Page fault is solved */
-       if (pmd_trans_huge(*fe->pmd)) {
+       if (pmd_trans_huge(*vmf->pmd)) {
                ret = VM_FAULT_NOPAGE;
                goto out;
        }
 
        /* ->map_pages() hasn't done anything useful. Cold page cache? */
-       if (!fe->pte)
+       if (!vmf->pte)
                goto out;
 
        /* check if the page fault is solved */
-       fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
-       if (!pte_none(*fe->pte))
+       vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
+       if (!pte_none(*vmf->pte))
                ret = VM_FAULT_NOPAGE;
-       pte_unmap_unlock(fe->pte, fe->ptl);
+       pte_unmap_unlock(vmf->pte, vmf->ptl);
 out:
-       fe->address = address;
-       fe->pte = NULL;
+       vmf->address = address;
+       vmf->pte = NULL;
        return ret;
 }
 
-static int do_read_fault(struct fault_env *fe, pgoff_t pgoff)
+static int do_read_fault(struct vm_fault *vmf)
 {
-       struct vm_area_struct *vma = fe->vma;
-       struct page *fault_page;
+       struct vm_area_struct *vma = vmf->vma;
        int ret = 0;
 
        /*
@@ -3179,80 +3245,67 @@ static int do_read_fault(struct fault_env *fe, pgoff_t pgoff)
         * something).
         */
        if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
-               ret = do_fault_around(fe, pgoff);
+               ret = do_fault_around(vmf);
                if (ret)
                        return ret;
        }
 
-       ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL);
+       ret = __do_fault(vmf);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                return ret;
 
-       ret |= alloc_set_pte(fe, NULL, fault_page);
-       if (fe->pte)
-               pte_unmap_unlock(fe->pte, fe->ptl);
-       unlock_page(fault_page);
+       ret |= finish_fault(vmf);
+       unlock_page(vmf->page);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
-               put_page(fault_page);
+               put_page(vmf->page);
        return ret;
 }
 
-static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff)
+static int do_cow_fault(struct vm_fault *vmf)
 {
-       struct vm_area_struct *vma = fe->vma;
-       struct page *fault_page, *new_page;
-       void *fault_entry;
-       struct mem_cgroup *memcg;
+       struct vm_area_struct *vma = vmf->vma;
        int ret;
 
        if (unlikely(anon_vma_prepare(vma)))
                return VM_FAULT_OOM;
 
-       new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address);
-       if (!new_page)
+       vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
+       if (!vmf->cow_page)
                return VM_FAULT_OOM;
 
-       if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL,
-                               &memcg, false)) {
-               put_page(new_page);
+       if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
+                               &vmf->memcg, false)) {
+               put_page(vmf->cow_page);
                return VM_FAULT_OOM;
        }
 
-       ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry);
+       ret = __do_fault(vmf);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                goto uncharge_out;
+       if (ret & VM_FAULT_DONE_COW)
+               return ret;
 
-       if (!(ret & VM_FAULT_DAX_LOCKED))
-               copy_user_highpage(new_page, fault_page, fe->address, vma);
-       __SetPageUptodate(new_page);
+       copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
+       __SetPageUptodate(vmf->cow_page);
 
-       ret |= alloc_set_pte(fe, memcg, new_page);
-       if (fe->pte)
-               pte_unmap_unlock(fe->pte, fe->ptl);
-       if (!(ret & VM_FAULT_DAX_LOCKED)) {
-               unlock_page(fault_page);
-               put_page(fault_page);
-       } else {
-               dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff);
-       }
+       ret |= finish_fault(vmf);
+       unlock_page(vmf->page);
+       put_page(vmf->page);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                goto uncharge_out;
        return ret;
 uncharge_out:
-       mem_cgroup_cancel_charge(new_page, memcg, false);
-       put_page(new_page);
+       mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false);
+       put_page(vmf->cow_page);
        return ret;
 }
 
-static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
+static int do_shared_fault(struct vm_fault *vmf)
 {
-       struct vm_area_struct *vma = fe->vma;
-       struct page *fault_page;
-       struct address_space *mapping;
-       int dirtied = 0;
+       struct vm_area_struct *vma = vmf->vma;
        int ret, tmp;
 
-       ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL);
+       ret = __do_fault(vmf);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                return ret;
 
@@ -3261,46 +3314,24 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
         * about to become writable
         */
        if (vma->vm_ops->page_mkwrite) {
-               unlock_page(fault_page);
-               tmp = do_page_mkwrite(vma, fault_page, fe->address);
+               unlock_page(vmf->page);
+               tmp = do_page_mkwrite(vmf);
                if (unlikely(!tmp ||
                                (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
-                       put_page(fault_page);
+                       put_page(vmf->page);
                        return tmp;
                }
        }
 
-       ret |= alloc_set_pte(fe, NULL, fault_page);
-       if (fe->pte)
-               pte_unmap_unlock(fe->pte, fe->ptl);
+       ret |= finish_fault(vmf);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
                                        VM_FAULT_RETRY))) {
-               unlock_page(fault_page);
-               put_page(fault_page);
+               unlock_page(vmf->page);
+               put_page(vmf->page);
                return ret;
        }
 
-       if (set_page_dirty(fault_page))
-               dirtied = 1;
-       /*
-        * Take a local copy of the address_space - page.mapping may be zeroed
-        * by truncate after unlock_page().   The address_space itself remains
-        * pinned by vma->vm_file's reference.  We rely on unlock_page()'s
-        * release semantics to prevent the compiler from undoing this copying.
-        */
-       mapping = page_rmapping(fault_page);
-       unlock_page(fault_page);
-       if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
-               /*
-                * Some device drivers do not set page.mapping but still
-                * dirty their pages
-                */
-               balance_dirty_pages_ratelimited(mapping);
-       }
-
-       if (!vma->vm_ops->page_mkwrite)
-               file_update_time(vma->vm_file);
-
+       fault_dirty_shared_page(vma, vmf->page);
        return ret;
 }
 
@@ -3310,19 +3341,27 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
  * The mmap_sem may have been released depending on flags and our
  * return value.  See filemap_fault() and __lock_page_or_retry().
  */
-static int do_fault(struct fault_env *fe)
+static int do_fault(struct vm_fault *vmf)
 {
-       struct vm_area_struct *vma = fe->vma;
-       pgoff_t pgoff = linear_page_index(vma, fe->address);
+       struct vm_area_struct *vma = vmf->vma;
+       int ret;
 
        /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
        if (!vma->vm_ops->fault)
-               return VM_FAULT_SIGBUS;
-       if (!(fe->flags & FAULT_FLAG_WRITE))
-               return do_read_fault(fe, pgoff);
-       if (!(vma->vm_flags & VM_SHARED))
-               return do_cow_fault(fe, pgoff);
-       return do_shared_fault(fe, pgoff);
+               ret = VM_FAULT_SIGBUS;
+       else if (!(vmf->flags & FAULT_FLAG_WRITE))
+               ret = do_read_fault(vmf);
+       else if (!(vma->vm_flags & VM_SHARED))
+               ret = do_cow_fault(vmf);
+       else
+               ret = do_shared_fault(vmf);
+
+       /* preallocated pagetable is unused: free it */
+       if (vmf->prealloc_pte) {
+               pte_free(vma->vm_mm, vmf->prealloc_pte);
+       vmf->prealloc_pte = NULL;
+       }
+       return ret;
 }
 
 static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
@@ -3340,14 +3379,15 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
        return mpol_misplaced(page, vma, addr);
 }
 
-static int do_numa_page(struct fault_env *fe, pte_t pte)
+static int do_numa_page(struct vm_fault *vmf)
 {
-       struct vm_area_struct *vma = fe->vma;
+       struct vm_area_struct *vma = vmf->vma;
        struct page *page = NULL;
        int page_nid = -1;
        int last_cpupid;
        int target_nid;
        bool migrated = false;
+       pte_t pte = vmf->orig_pte;
        bool was_writable = pte_write(pte);
        int flags = 0;
 
@@ -3360,10 +3400,10 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
        * page table entry is not accessible, so there would be no
        * concurrent hardware modifications to the PTE.
        */
-       fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd);
-       spin_lock(fe->ptl);
-       if (unlikely(!pte_same(*fe->pte, pte))) {
-               pte_unmap_unlock(fe->pte, fe->ptl);
+       vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
+       spin_lock(vmf->ptl);
+       if (unlikely(!pte_same(*vmf->pte, pte))) {
+               pte_unmap_unlock(vmf->pte, vmf->ptl);
                goto out;
        }
 
@@ -3372,18 +3412,18 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
        pte = pte_mkyoung(pte);
        if (was_writable)
                pte = pte_mkwrite(pte);
-       set_pte_at(vma->vm_mm, fe->address, fe->pte, pte);
-       update_mmu_cache(vma, fe->address, fe->pte);
+       set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
+       update_mmu_cache(vma, vmf->address, vmf->pte);
 
-       page = vm_normal_page(vma, fe->address, pte);
+       page = vm_normal_page(vma, vmf->address, pte);
        if (!page) {
-               pte_unmap_unlock(fe->pte, fe->ptl);
+               pte_unmap_unlock(vmf->pte, vmf->ptl);
                return 0;
        }
 
        /* TODO: handle PTE-mapped THP */
        if (PageCompound(page)) {
-               pte_unmap_unlock(fe->pte, fe->ptl);
+               pte_unmap_unlock(vmf->pte, vmf->ptl);
                return 0;
        }
 
@@ -3395,7 +3435,7 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
         * pte_dirty has unpredictable behaviour between PTE scan updates,
         * background writeback, dirty balancing and application behaviour.
         */
-       if (!(vma->vm_flags & VM_WRITE))
+       if (!pte_write(pte))
                flags |= TNF_NO_GROUP;
 
        /*
@@ -3407,9 +3447,9 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
 
        last_cpupid = page_cpupid_last(page);
        page_nid = page_to_nid(page);
-       target_nid = numa_migrate_prep(page, vma, fe->address, page_nid,
+       target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
                        &flags);
-       pte_unmap_unlock(fe->pte, fe->ptl);
+       pte_unmap_unlock(vmf->pte, vmf->ptl);
        if (target_nid == -1) {
                put_page(page);
                goto out;
@@ -3429,28 +3469,28 @@ out:
        return 0;
 }
 
-static int create_huge_pmd(struct fault_env *fe)
+static int create_huge_pmd(struct vm_fault *vmf)
 {
-       struct vm_area_struct *vma = fe->vma;
+       struct vm_area_struct *vma = vmf->vma;
        if (vma_is_anonymous(vma))
-               return do_huge_pmd_anonymous_page(fe);
+               return do_huge_pmd_anonymous_page(vmf);
        if (vma->vm_ops->pmd_fault)
-               return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd,
-                               fe->flags);
+               return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd,
+                               vmf->flags);
        return VM_FAULT_FALLBACK;
 }
 
-static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd)
+static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
 {
-       if (vma_is_anonymous(fe->vma))
-               return do_huge_pmd_wp_page(fe, orig_pmd);
-       if (fe->vma->vm_ops->pmd_fault)
-               return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd,
-                               fe->flags);
+       if (vma_is_anonymous(vmf->vma))
+               return do_huge_pmd_wp_page(vmf, orig_pmd);
+       if (vmf->vma->vm_ops->pmd_fault)
+               return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address,
+                                                  vmf->pmd, vmf->flags);
 
        /* COW handled on pte level: split pmd */
-       VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma);
-       split_huge_pmd(fe->vma, fe->pmd, fe->address);
+       VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
+       __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
 
        return VM_FAULT_FALLBACK;
 }
@@ -3475,21 +3515,21 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
  * The mmap_sem may have been released depending on flags and our return value.
  * See filemap_fault() and __lock_page_or_retry().
  */
-static int handle_pte_fault(struct fault_env *fe)
+static int handle_pte_fault(struct vm_fault *vmf)
 {
        pte_t entry;
 
-       if (unlikely(pmd_none(*fe->pmd))) {
+       if (unlikely(pmd_none(*vmf->pmd))) {
                /*
                 * Leave __pte_alloc() until later: because vm_ops->fault may
                 * want to allocate huge page, and if we expose page table
                 * for an instant, it will be difficult to retract from
                 * concurrent faults and from rmap lookups.
                 */
-               fe->pte = NULL;
+               vmf->pte = NULL;
        } else {
                /* See comment in pte_alloc_one_map() */
-               if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+               if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd))
                        return 0;
                /*
                 * A regular pmd is established and it can't morph into a huge
@@ -3497,9 +3537,8 @@ static int handle_pte_fault(struct fault_env *fe)
                 * mmap_sem read mode and khugepaged takes it in write mode.
                 * So now it's safe to run pte_offset_map().
                 */
-               fe->pte = pte_offset_map(fe->pmd, fe->address);
-
-               entry = *fe->pte;
+               vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
+               vmf->orig_pte = *vmf->pte;
 
                /*
                 * some architectures can have larger ptes than wordsize,
@@ -3510,38 +3549,39 @@ static int handle_pte_fault(struct fault_env *fe)
                 * ptl lock held. So here a barrier will do.
                 */
                barrier();
-               if (pte_none(entry)) {
-                       pte_unmap(fe->pte);
-                       fe->pte = NULL;
+               if (pte_none(vmf->orig_pte)) {
+                       pte_unmap(vmf->pte);
+                       vmf->pte = NULL;
                }
        }
 
-       if (!fe->pte) {
-               if (vma_is_anonymous(fe->vma))
-                       return do_anonymous_page(fe);
+       if (!vmf->pte) {
+               if (vma_is_anonymous(vmf->vma))
+                       return do_anonymous_page(vmf);
                else
-                       return do_fault(fe);
+                       return do_fault(vmf);
        }
 
-       if (!pte_present(entry))
-               return do_swap_page(fe, entry);
+       if (!pte_present(vmf->orig_pte))
+               return do_swap_page(vmf);
 
-       if (pte_protnone(entry) && vma_is_accessible(fe->vma))
-               return do_numa_page(fe, entry);
+       if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
+               return do_numa_page(vmf);
 
-       fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd);
-       spin_lock(fe->ptl);
-       if (unlikely(!pte_same(*fe->pte, entry)))
+       vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+       spin_lock(vmf->ptl);
+       entry = vmf->orig_pte;
+       if (unlikely(!pte_same(*vmf->pte, entry)))
                goto unlock;
-       if (fe->flags & FAULT_FLAG_WRITE) {
+       if (vmf->flags & FAULT_FLAG_WRITE) {
                if (!pte_write(entry))
-                       return do_wp_page(fe, entry);
+                       return do_wp_page(vmf);
                entry = pte_mkdirty(entry);
        }
        entry = pte_mkyoung(entry);
-       if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry,
-                               fe->flags & FAULT_FLAG_WRITE)) {
-               update_mmu_cache(fe->vma, fe->address, fe->pte);
+       if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
+                               vmf->flags & FAULT_FLAG_WRITE)) {
+               update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
        } else {
                /*
                 * This is needed only for protection faults but the arch code
@@ -3549,11 +3589,11 @@ static int handle_pte_fault(struct fault_env *fe)
                 * This still avoids useless tlb flushes for .text page faults
                 * with threads.
                 */
-               if (fe->flags & FAULT_FLAG_WRITE)
-                       flush_tlb_fix_spurious_fault(fe->vma, fe->address);
+               if (vmf->flags & FAULT_FLAG_WRITE)
+                       flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
        }
 unlock:
-       pte_unmap_unlock(fe->pte, fe->ptl);
+       pte_unmap_unlock(vmf->pte, vmf->ptl);
        return 0;
 }
 
@@ -3566,10 +3606,12 @@ unlock:
 static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                unsigned int flags)
 {
-       struct fault_env fe = {
+       struct vm_fault vmf = {
                .vma = vma,
-               .address = address,
+               .address = address & PAGE_MASK,
                .flags = flags,
+               .pgoff = linear_page_index(vma, address),
+               .gfp_mask = __get_fault_gfp_mask(vma),
        };
        struct mm_struct *mm = vma->vm_mm;
        pgd_t *pgd;
@@ -3579,35 +3621,35 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
        pud = pud_alloc(mm, pgd, address);
        if (!pud)
                return VM_FAULT_OOM;
-       fe.pmd = pmd_alloc(mm, pud, address);
-       if (!fe.pmd)
+       vmf.pmd = pmd_alloc(mm, pud, address);
+       if (!vmf.pmd)
                return VM_FAULT_OOM;
-       if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) {
-               int ret = create_huge_pmd(&fe);
+       if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
+               int ret = create_huge_pmd(&vmf);
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
        } else {
-               pmd_t orig_pmd = *fe.pmd;
+               pmd_t orig_pmd = *vmf.pmd;
                int ret;
 
                barrier();
                if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
                        if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
-                               return do_huge_pmd_numa_page(&fe, orig_pmd);
+                               return do_huge_pmd_numa_page(&vmf, orig_pmd);
 
-                       if ((fe.flags & FAULT_FLAG_WRITE) &&
+                       if ((vmf.flags & FAULT_FLAG_WRITE) &&
                                        !pmd_write(orig_pmd)) {
-                               ret = wp_huge_pmd(&fe, orig_pmd);
+                               ret = wp_huge_pmd(&vmf, orig_pmd);
                                if (!(ret & VM_FAULT_FALLBACK))
                                        return ret;
                        } else {
-                               huge_pmd_set_accessed(&fe, orig_pmd);
+                               huge_pmd_set_accessed(&vmf, orig_pmd);
                                return 0;
                        }
                }
        }
 
-       return handle_pte_fault(&fe);
+       return handle_pte_fault(&vmf);
 }
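
/*
 * Rough sketch of the struct vm_fault fields the converted paths above
 * rely on; the authoritative definition lives in include/linux/mm.h, so
 * treat the exact member set and ordering here as an approximation.
 */
struct vm_fault {
        struct vm_area_struct *vma;     /* target VMA */
        unsigned int flags;             /* FAULT_FLAG_xxx flags */
        gfp_t gfp_mask;                 /* gfp mask for allocations */
        pgoff_t pgoff;                  /* logical page offset in the mapping */
        unsigned long address;          /* faulting address, masked to PAGE_MASK */
        pmd_t *pmd;                     /* pmd covering address */
        pte_t orig_pte;                 /* pte value at fault time */
        struct page *cow_page;          /* page allocated for a COW fault */
        struct mem_cgroup *memcg;       /* memcg charged for cow_page */
        struct page *page;              /* page returned by ->fault */
        pte_t *pte;                     /* mapped pte, if any */
        spinlock_t *ptl;                /* page table lock for pte */
        pgtable_t prealloc_pte;         /* preallocated page table, freed in do_fault() */
};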
 
 /*
@@ -3658,6 +3700,19 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                         mem_cgroup_oom_synchronize(false);
        }
 
+       /*
+        * This mm has already been reaped by the oom reaper, so a
+        * refault cannot be trusted in general. Anonymous refaults
+        * would lose data and give back a zero page instead, for
+        * example. This is especially a problem for use_mm(): regular
+        * tasks will just die and the corrupted data will not be
+        * visible anywhere, while a kthread will outlive the oom
+        * victim and potentially propagate the corrupted data further.
+        */
+       if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR)
+                               && test_bit(MMF_UNSTABLE, &vma->vm_mm->flags)))
+               ret = VM_FAULT_SIGBUS;
+
        return ret;
 }
 EXPORT_SYMBOL_GPL(handle_mm_fault);
@@ -3717,8 +3772,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 }
 #endif /* __PAGETABLE_PMD_FOLDED */
 
-static int __follow_pte(struct mm_struct *mm, unsigned long address,
-               pte_t **ptepp, spinlock_t **ptlp)
+static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
+               pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
 {
        pgd_t *pgd;
        pud_t *pud;
@@ -3735,11 +3790,20 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address,
 
        pmd = pmd_offset(pud, address);
        VM_BUG_ON(pmd_trans_huge(*pmd));
-       if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
-               goto out;
 
-       /* We cannot handle huge page PFN maps. Luckily they don't exist. */
-       if (pmd_huge(*pmd))
+       if (pmd_huge(*pmd)) {
+               if (!pmdpp)
+                       goto out;
+
+               *ptlp = pmd_lock(mm, pmd);
+               if (pmd_huge(*pmd)) {
+                       *pmdpp = pmd;
+                       return 0;
+               }
+               spin_unlock(*ptlp);
+       }
+
+       if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
                goto out;
 
        ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
@@ -3762,9 +3826,23 @@ static inline int follow_pte(struct mm_struct *mm, unsigned long address,
 
        /* (void) is needed to make gcc happy */
        (void) __cond_lock(*ptlp,
-                          !(res = __follow_pte(mm, address, ptepp, ptlp)));
+                          !(res = __follow_pte_pmd(mm, address, ptepp, NULL,
+                                          ptlp)));
+       return res;
+}
+
+int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
+                            pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
+{
+       int res;
+
+       /* (void) is needed to make gcc happy */
+       (void) __cond_lock(*ptlp,
+                          !(res = __follow_pte_pmd(mm, address, ptepp, pmdpp,
+                                          ptlp)));
        return res;
 }
+EXPORT_SYMBOL(follow_pte_pmd);
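
/*
 * Calling convention for follow_pte_pmd() as exported above: on success
 * either *pmdpp (huge pmd) or *ptepp (pte) is filled in and *ptlp is held,
 * so the caller must drop the lock it was handed. The caller below is a
 * hypothetical sketch, not taken from this patch.
 */
static bool addr_is_mapped(struct mm_struct *mm, unsigned long address)
{
        pte_t *ptep = NULL;
        pmd_t *pmdp = NULL;
        spinlock_t *ptl;

        if (follow_pte_pmd(mm, address, &ptep, &pmdp, &ptl))
                return false;                   /* nothing mapped here */

        if (pmdp)                               /* huge pmd, pmd lock held */
                spin_unlock(ptl);
        else                                    /* pte mapped and locked */
                pte_unmap_unlock(ptep, ptl);
        return true;
}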
 
 /**
  * follow_pfn - look up PFN at a user virtual address
@@ -3851,11 +3929,12 @@ EXPORT_SYMBOL_GPL(generic_access_phys);
  * Access another process' address space as given in mm.  If non-NULL, use the
  * given task for page fault accounting.
  */
-static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
-               unsigned long addr, void *buf, int len, int write)
+int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
+               unsigned long addr, void *buf, int len, unsigned int gup_flags)
 {
        struct vm_area_struct *vma;
        void *old_buf = buf;
+       int write = gup_flags & FOLL_WRITE;
 
        down_read(&mm->mmap_sem);
        /* ignore errors, just check how much was successfully transferred */
@@ -3865,7 +3944,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
                struct page *page = NULL;
 
                ret = get_user_pages_remote(tsk, mm, addr, 1,
-                               write, 1, &page, &vma);
+                               gup_flags, &page, &vma, NULL);
                if (ret <= 0) {
 #ifndef CONFIG_HAVE_IOREMAP_PROT
                        break;
@@ -3917,14 +3996,14 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
  * @addr:      start address to access
  * @buf:       source or destination buffer
  * @len:       number of bytes to transfer
- * @write:     whether the access is a write
+ * @gup_flags: flags modifying lookup behaviour
  *
  * The caller must hold a reference on @mm.
  */
 int access_remote_vm(struct mm_struct *mm, unsigned long addr,
-               void *buf, int len, int write)
+               void *buf, int len, unsigned int gup_flags)
 {
-       return __access_remote_vm(NULL, mm, addr, buf, len, write);
+       return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);
 }
 
 /*
@@ -3933,7 +4012,7 @@ int access_remote_vm(struct mm_struct *mm, unsigned long addr,
  * Do not walk the page table directly, use get_user_pages
  */
 int access_process_vm(struct task_struct *tsk, unsigned long addr,
-               void *buf, int len, int write)
+               void *buf, int len, unsigned int gup_flags)
 {
        struct mm_struct *mm;
        int ret;
@@ -3942,11 +4021,13 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr,
        if (!mm)
                return 0;
 
-       ret = __access_remote_vm(tsk, mm, addr, buf, len, write);
+       ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
+
        mmput(mm);
 
        return ret;
 }
+EXPORT_SYMBOL_GPL(access_process_vm);
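
/*
 * With the switch from an int 'write' argument to gup_flags above, a caller
 * that used to pass write = 1 now passes FOLL_WRITE, and must add FOLL_FORCE
 * itself if it depended on the force = 1 that the old helper hard-coded in
 * its get_user_pages_remote() call. The wrapper below is a hypothetical
 * example, not part of this patch.
 */
static int poke_remote_example(struct task_struct *tsk, unsigned long addr,
                               void *buf, int len)
{
        return access_process_vm(tsk, addr, buf, len,
                                 FOLL_FORCE | FOLL_WRITE);
}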
 
 /*
  * Print the name of a VMA.