userfaultfd: non-cooperative: add event for memory unmaps
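
When mremap() moves or shrinks a range that a non-cooperative
userfaultfd monitor is tracking, the monitor has to be told which
pages went away.  Thread a struct vm_userfaultfd_ctx pointer and an
unmap list through mremap_to() and move_vma():
mremap_userfaultfd_prep() stashes the userfaultfd context of the
moved VMA, do_munmap() records each range it removes on the uf_unmap
list, and only after mmap_sem is dropped does the syscall deliver the
notifications via mremap_userfaultfd_complete() and
userfaultfd_unmap_complete() - which is why move_vma() merely records
the context and ranges instead of notifying inline.  The blob range
shown here also carries the earlier move_ptes() fix that force-flushes
the TLB before the old page-table lock is dropped whenever a dirty PTE
is moved, to avoid racing with page_mkclean().
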
diff --git a/mm/mremap.c b/mm/mremap.c
index da22ad2a5678265ea9f2d0aa5ece9e14c519a494..8233b0105c8258ec5757c42c0a65e34b2908272c 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -22,6 +22,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/uaccess.h>
 #include <linux/mm-arch-hooks.h>
+#include <linux/userfaultfd_k.h>
 
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
@@ -104,11 +105,13 @@ static pte_t move_soft_dirty_pte(pte_t pte)
 static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
                unsigned long old_addr, unsigned long old_end,
                struct vm_area_struct *new_vma, pmd_t *new_pmd,
-               unsigned long new_addr, bool need_rmap_locks)
+               unsigned long new_addr, bool need_rmap_locks, bool *need_flush)
 {
        struct mm_struct *mm = vma->vm_mm;
        pte_t *old_pte, *new_pte, pte;
        spinlock_t *old_ptl, *new_ptl;
+       bool force_flush = false;
+       unsigned long len = old_end - old_addr;
 
        /*
         * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
@@ -146,7 +149,19 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
                                   new_pte++, new_addr += PAGE_SIZE) {
                if (pte_none(*old_pte))
                        continue;
+
                pte = ptep_get_and_clear(mm, old_addr, old_pte);
+               /*
+                * If we are remapping a dirty PTE, make sure
+                * to flush TLB before we drop the PTL for the
+                * old PTE or we may race with page_mkclean().
+                *
+                * This check has to be done after we removed the
+                * old PTE from page tables or another thread may
+                * dirty it after the check and before the removal.
+                */
+               if (pte_present(pte) && pte_dirty(pte))
+                       force_flush = true;
                pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
                pte = move_soft_dirty_pte(pte);
                set_pte_at(mm, new_addr, new_pte, pte);
@@ -156,6 +171,10 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
        if (new_ptl != old_ptl)
                spin_unlock(new_ptl);
        pte_unmap(new_pte - 1);
+       if (force_flush)
+               flush_tlb_range(vma, old_end - len, old_end);
+       else
+               *need_flush = true;
        pte_unmap_unlock(old_pte - 1, old_ptl);
        if (need_rmap_locks)
                drop_rmap_locks(vma);
@@ -201,13 +220,12 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
                                if (need_rmap_locks)
                                        take_rmap_locks(vma);
                                moved = move_huge_pmd(vma, old_addr, new_addr,
-                                                   old_end, old_pmd, new_pmd);
+                                                   old_end, old_pmd, new_pmd,
+                                                   &need_flush);
                                if (need_rmap_locks)
                                        drop_rmap_locks(vma);
-                               if (moved) {
-                                       need_flush = true;
+                               if (moved)
                                        continue;
-                               }
                        }
                        split_huge_pmd(vma, old_pmd, old_addr);
                        if (pmd_trans_unstable(old_pmd))
@@ -220,11 +238,10 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
                        extent = next - new_addr;
                if (extent > LATENCY_LIMIT)
                        extent = LATENCY_LIMIT;
-               move_ptes(vma, old_pmd, old_addr, old_addr + extent,
-                         new_vma, new_pmd, new_addr, need_rmap_locks);
-               need_flush = true;
+               move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
+                         new_pmd, new_addr, need_rmap_locks, &need_flush);
        }
-       if (likely(need_flush))
+       if (need_flush)
                flush_tlb_range(vma, old_end-len, old_addr);
 
        mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
@@ -234,7 +251,9 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 
 static unsigned long move_vma(struct vm_area_struct *vma,
                unsigned long old_addr, unsigned long old_len,
-               unsigned long new_len, unsigned long new_addr, bool *locked)
+               unsigned long new_len, unsigned long new_addr,
+               bool *locked, struct vm_userfaultfd_ctx *uf,
+               struct list_head *uf_unmap)
 {
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *new_vma;
@@ -293,6 +312,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
                old_addr = new_addr;
                new_addr = err;
        } else {
+               mremap_userfaultfd_prep(new_vma, uf);
                arch_remap(mm, old_addr, old_addr + old_len,
                           new_addr, new_addr + new_len);
        }
@@ -322,7 +342,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
        if (unlikely(vma->vm_flags & VM_PFNMAP))
                untrack_pfn_moved(vma);
 
-       if (do_munmap(mm, old_addr, old_len) < 0) {
+       if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
                /* OOM: unable to split vma, just get accounts right */
                vm_unacct_memory(excess >> PAGE_SHIFT);
                excess = 0;
@@ -397,7 +417,9 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
 }
 
 static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
-               unsigned long new_addr, unsigned long new_len, bool *locked)
+               unsigned long new_addr, unsigned long new_len, bool *locked,
+               struct vm_userfaultfd_ctx *uf,
+               struct list_head *uf_unmap)
 {
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
@@ -415,12 +437,12 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
        if (addr + old_len > new_addr && new_addr + new_len > addr)
                goto out;
 
-       ret = do_munmap(mm, new_addr, new_len);
+       ret = do_munmap(mm, new_addr, new_len, NULL);
        if (ret)
                goto out;
 
        if (old_len >= new_len) {
-               ret = do_munmap(mm, addr+new_len, old_len - new_len);
+               ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
                if (ret && old_len != new_len)
                        goto out;
                old_len = new_len;
@@ -442,7 +464,8 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
        if (offset_in_page(ret))
                goto out1;
 
-       ret = move_vma(vma, addr, old_len, new_len, new_addr, locked);
+       ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf,
+                      uf_unmap);
        if (!(offset_in_page(ret)))
                goto out;
 out1:
@@ -481,6 +504,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
        unsigned long ret = -EINVAL;
        unsigned long charged = 0;
        bool locked = false;
+       struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
+       LIST_HEAD(uf_unmap);
 
        if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
                return ret;
@@ -507,7 +532,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 
        if (flags & MREMAP_FIXED) {
                ret = mremap_to(addr, old_len, new_addr, new_len,
-                               &locked);
+                               &locked, &uf, &uf_unmap);
                goto out;
        }
 
@@ -517,7 +542,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
         * do_munmap does all the needed commit accounting
         */
        if (old_len >= new_len) {
-               ret = do_munmap(mm, addr+new_len, old_len - new_len);
+               ret = do_munmap(mm, addr+new_len, old_len - new_len, &uf_unmap);
                if (ret && old_len != new_len)
                        goto out;
                ret = addr;
@@ -576,7 +601,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
                        goto out;
                }
 
-               ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
+               ret = move_vma(vma, addr, old_len, new_len, new_addr,
+                              &locked, &uf, &uf_unmap);
        }
 out:
        if (offset_in_page(ret)) {
@@ -586,5 +612,7 @@ out:
        up_write(&current->mm->mmap_sem);
        if (locked && new_len > old_len)
                mm_populate(new_addr + old_len, new_len - old_len);
+       mremap_userfaultfd_complete(&uf, addr, new_addr, old_len);
+       userfaultfd_unmap_complete(mm, &uf_unmap);
        return ret;
 }
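
On the userspace side, a non-cooperative monitor picks these
notifications up by reading struct uffd_msg records from the
userfaultfd descriptor.  What follows is a minimal sketch, not part of
this patch: it assumes a uffd file descriptor that was already opened
and put through the UFFD_API handshake requesting
UFFD_FEATURE_EVENT_REMAP and UFFD_FEATURE_EVENT_UNMAP, and handle_events
is an illustrative name.  The message fields follow
include/uapi/linux/userfaultfd.h, where UFFD_EVENT_UNMAP reuses the
"remove" argument layout:

#include <stdio.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

/* Drain pending events from an already-registered userfaultfd. */
static void handle_events(int uffd)
{
        struct uffd_msg msg;

        while (read(uffd, &msg, sizeof(msg)) == sizeof(msg)) {
                switch (msg.event) {
                case UFFD_EVENT_REMAP:
                        /* mremap() moved a tracked range */
                        printf("remap %llx -> %llx len %llx\n",
                               (unsigned long long)msg.arg.remap.from,
                               (unsigned long long)msg.arg.remap.to,
                               (unsigned long long)msg.arg.remap.len);
                        break;
                case UFFD_EVENT_UNMAP:
                        /*
                         * A tracked range was unmapped, e.g. by the
                         * do_munmap() calls this patch instruments.
                         */
                        printf("unmap %llx-%llx\n",
                               (unsigned long long)msg.arg.remove.start,
                               (unsigned long long)msg.arg.remove.end);
                        break;
                default:
                        break;
                }
        }
}

A real monitor would typically poll() the descriptor alongside its
other fds and use these events to keep its view of the registered
ranges consistent before issuing further UFFDIO_COPY operations.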