userfaultfd: non-cooperative: add event for memory unmaps
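
When mremap() moves or shrinks a range that a non-cooperative
userfaultfd monitor is tracking, the monitor has to be told which
pages went away.  Thread a struct vm_userfaultfd_ctx pointer and an
unmap list through mremap_to() and move_vma():
mremap_userfaultfd_prep() stashes the userfaultfd context of the
moved VMA, do_munmap() records each range it removes on the uf_unmap
list, and only after mmap_sem is dropped does the syscall deliver the
notifications via mremap_userfaultfd_complete() and
userfaultfd_unmap_complete() - which is why move_vma() merely records
the context and ranges instead of notifying inline.  The blob range
shown here also carries the earlier move_ptes() fix that force-flushes
the TLB before the old page-table lock is dropped whenever a dirty PTE
is moved, to avoid racing with page_mkclean().
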
diff --git a/mm/mremap.c b/mm/mremap.c
index da22ad2a5678265ea9f2d0aa5ece9e14c519a494..8233b0105c8258ec5757c42c0a65e34b2908272c 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -22,6 +22,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/uaccess.h>
 #include <linux/mm-arch-hooks.h>
+#include <linux/userfaultfd_k.h>
 
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
@@ -104,11 +105,13 @@ static pte_t move_soft_dirty_pte(pte_t pte)
 static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
                unsigned long old_addr, unsigned long old_end,
                struct vm_area_struct *new_vma, pmd_t *new_pmd,
-               unsigned long new_addr, bool need_rmap_locks)
+               unsigned long new_addr, bool need_rmap_locks, bool *need_flush)
 {
        struct mm_struct *mm = vma->vm_mm;
        pte_t *old_pte, *new_pte, pte;
        spinlock_t *old_ptl, *new_ptl;
+       bool force_flush = false;
+       unsigned long len = old_end - old_addr;
 
        /*
         * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
@@ -146,7 +149,19 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
                                   new_pte++, new_addr += PAGE_SIZE) {
                if (pte_none(*old_pte))
                        continue;
+
                pte = ptep_get_and_clear(mm, old_addr, old_pte);
+               /*
+                * If we are remapping a dirty PTE, make sure
+                * to flush TLB before we drop the PTL for the
+                * old PTE or we may race with page_mkclean().
+                *
+                * This check has to be done after we removed the
+                * old PTE from page tables or another thread may
+                * dirty it after the check and before the removal.
+                */
+               if (pte_present(pte) && pte_dirty(pte))
+                       force_flush = true;
                pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
                pte = move_soft_dirty_pte(pte);
                set_pte_at(mm, new_addr, new_pte, pte);
@@ -156,6 +171,10 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
        if (new_ptl != old_ptl)
                spin_unlock(new_ptl);
        pte_unmap(new_pte - 1);
+       if (force_flush)
+               flush_tlb_range(vma, old_end - len, old_end);
+       else
+               *need_flush = true;
        pte_unmap_unlock(old_pte - 1, old_ptl);
        if (need_rmap_locks)
                drop_rmap_locks(vma);
@@ -201,13 +220,12 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
                                if (need_rmap_locks)
                                        take_rmap_locks(vma);
                                moved = move_huge_pmd(vma, old_addr, new_addr,
-                                                   old_end, old_pmd, new_pmd);
+                                                   old_end, old_pmd, new_pmd,
+                                                   &need_flush);
                                if (need_rmap_locks)
                                        drop_rmap_locks(vma);
-                               if (moved) {
-                                       need_flush = true;
+                               if (moved)
                                        continue;
-                               }
                        }
                        split_huge_pmd(vma, old_pmd, old_addr);
                        if (pmd_trans_unstable(old_pmd))
@@ -220,11 +238,10 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
                        extent = next - new_addr;
                if (extent > LATENCY_LIMIT)
                        extent = LATENCY_LIMIT;
-               move_ptes(vma, old_pmd, old_addr, old_addr + extent,
-                         new_vma, new_pmd, new_addr, need_rmap_locks);
-               need_flush = true;
+               move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
+                         new_pmd, new_addr, need_rmap_locks, &need_flush);
        }
-       if (likely(need_flush))
+       if (need_flush)
                flush_tlb_range(vma, old_end-len, old_addr);
 
        mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
@@ -234,7 +251,9 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 
 static unsigned long move_vma(struct vm_area_struct *vma,
                unsigned long old_addr, unsigned long old_len,
-               unsigned long new_len, unsigned long new_addr, bool *locked)
+               unsigned long new_len, unsigned long new_addr,
+               bool *locked, struct vm_userfaultfd_ctx *uf,
+               struct list_head *uf_unmap)
 {
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *new_vma;
@@ -293,6 +312,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
                old_addr = new_addr;
                new_addr = err;
        } else {
+               mremap_userfaultfd_prep(new_vma, uf);
                arch_remap(mm, old_addr, old_addr + old_len,
                           new_addr, new_addr + new_len);
        }
@@ -322,7 +342,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
        if (unlikely(vma->vm_flags & VM_PFNMAP))
                untrack_pfn_moved(vma);
 
-       if (do_munmap(mm, old_addr, old_len) < 0) {
+       if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
                /* OOM: unable to split vma, just get accounts right */
                vm_unacct_memory(excess >> PAGE_SHIFT);
                excess = 0;
@@ -397,7 +417,9 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
 }
 
 static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
-               unsigned long new_addr, unsigned long new_len, bool *locked)
+               unsigned long new_addr, unsigned long new_len, bool *locked,
+               struct vm_userfaultfd_ctx *uf,
+               struct list_head *uf_unmap)
 {
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
@@ -415,12 +437,12 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
        if (addr + old_len > new_addr && new_addr + new_len > addr)
                goto out;
 
-       ret = do_munmap(mm, new_addr, new_len);
+       ret = do_munmap(mm, new_addr, new_len, NULL);
        if (ret)
                goto out;
 
        if (old_len >= new_len) {
-               ret = do_munmap(mm, addr+new_len, old_len - new_len);
+               ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
                if (ret && old_len != new_len)
                        goto out;
                old_len = new_len;
@@ -442,7 +464,8 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
        if (offset_in_page(ret))
                goto out1;
 
-       ret = move_vma(vma, addr, old_len, new_len, new_addr, locked);
+       ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf,
+                      uf_unmap);
        if (!(offset_in_page(ret)))
                goto out;
 out1:
@@ -481,6 +504,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
        unsigned long ret = -EINVAL;
        unsigned long charged = 0;
        bool locked = false;
+       struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
+       LIST_HEAD(uf_unmap);
 
        if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
                return ret;
@@ -507,7 +532,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 
        if (flags & MREMAP_FIXED) {
                ret = mremap_to(addr, old_len, new_addr, new_len,
-                               &locked);
+                               &locked, &uf, &uf_unmap);
                goto out;
        }
 
@@ -517,7 +542,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
         * do_munmap does all the needed commit accounting
         */
        if (old_len >= new_len) {
-               ret = do_munmap(mm, addr+new_len, old_len - new_len);
+               ret = do_munmap(mm, addr+new_len, old_len - new_len, &uf_unmap);
                if (ret && old_len != new_len)
                        goto out;
                ret = addr;
@@ -576,7 +601,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
                        goto out;
                }
 
-               ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
+               ret = move_vma(vma, addr, old_len, new_len, new_addr,
+                              &locked, &uf, &uf_unmap);
        }
 out:
        if (offset_in_page(ret)) {
@@ -586,5 +612,7 @@ out:
        up_write(&current->mm->mmap_sem);
        if (locked && new_len > old_len)
                mm_populate(new_addr + old_len, new_len - old_len);
+       mremap_userfaultfd_complete(&uf, addr, new_addr, old_len);
+       userfaultfd_unmap_complete(mm, &uf_unmap);
        return ret;
 }
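
On the userspace side, a non-cooperative monitor picks these
notifications up by reading struct uffd_msg records from the
userfaultfd descriptor.  What follows is a minimal sketch, not part of
this patch: it assumes a uffd file descriptor that was already opened
and put through the UFFD_API handshake requesting
UFFD_FEATURE_EVENT_REMAP and UFFD_FEATURE_EVENT_UNMAP, and handle_events
is an illustrative name.  The message fields follow
include/uapi/linux/userfaultfd.h, where UFFD_EVENT_UNMAP reuses the
"remove" argument layout:

#include <stdio.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

/* Drain pending events from an already-registered userfaultfd. */
static void handle_events(int uffd)
{
        struct uffd_msg msg;

        while (read(uffd, &msg, sizeof(msg)) == sizeof(msg)) {
                switch (msg.event) {
                case UFFD_EVENT_REMAP:
                        /* mremap() moved a tracked range */
                        printf("remap %llx -> %llx len %llx\n",
                               (unsigned long long)msg.arg.remap.from,
                               (unsigned long long)msg.arg.remap.to,
                               (unsigned long long)msg.arg.remap.len);
                        break;
                case UFFD_EVENT_UNMAP:
                        /*
                         * A tracked range was unmapped, e.g. by the
                         * do_munmap() calls this patch instruments.
                         */
                        printf("unmap %llx-%llx\n",
                               (unsigned long long)msg.arg.remove.start,
                               (unsigned long long)msg.arg.remove.end);
                        break;
                default:
                        break;
                }
        }
}

A real monitor would typically poll() the descriptor alongside its
other fds and use these events to keep its view of the registered
ranges consistent before issuing further UFFDIO_COPY operations.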