mm, hugetlb: fix racy resv_huge_pages underflow on UFFDIO_COPY
author	Mina Almasry <almasrymina@google.com>
Thu, 1 Jul 2021 01:48:19 +0000 (18:48 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
Thu, 1 Jul 2021 03:47:26 +0000 (20:47 -0700)
On UFFDIO_COPY, if we fail to copy the page contents while holding the
hugetlb_fault_mutex, we drop the mutex and return to the caller after
having allocated a page that consumed a reservation.  In that window a
racing fault can consume the same reservation again, underflowing
resv_huge_pages.  To handle this, we free the allocated page, restore the
reservation, and instead allocate a temporary hugetlb page to return to
the caller.  When the caller retries the copy outside of the lock, we
check the page cache again, allocate a page that consumes the reservation,
and copy the contents over from the temporary page.
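
For reference, the new flow inside hugetlb_mcopy_atomic_pte() is roughly
the following (a simplified sketch based on the mm/hugetlb.c hunk below;
locking, page cache insertion and most error handling are elided):

    if (!*pagep) {
        /* First attempt, under the hugetlb_fault_mutex. */
        page = alloc_huge_page(dst_vma, dst_addr, 0);  /* may consume a reservation */
        ret = copy_huge_page_from_user(page, ...);     /* copy while holding the mutex */
        if (ret) {
            /* Copy failed: give the reservation back and hand the
             * caller a temporary page to fill outside the lock. */
            restore_reserve_on_error(h, dst_vma, dst_addr, page);
            put_page(page);
            page = alloc_huge_page_vma(h, dst_vma, dst_addr);  /* no reservation */
            *pagep = page;
            return -ENOENT;
        }
    } else {
        /* Retry after the out-of-lock copy: re-check the page cache,
         * then allocate a page that consumes the reservation and copy
         * the contents in from the temporary page. */
        if (vm_shared && hugetlbfs_pagecache_present(h, dst_vma, dst_addr))
            return -EEXIST;
        page = alloc_huge_page(dst_vma, dst_addr, 0);
        copy_huge_page(page, *pagep);
        put_page(*pagep);
        *pagep = NULL;
    }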

Test:
Hacked the code locally so that resv_huge_pages underflows produce a
warning and copy_huge_page_from_user() always fails, then ran:

./tools/testing/selftests/vm/userfaultfd hugetlb_shared 10 \
        2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success
./tools/testing/selftests/vm/userfaultfd hugetlb 10 \
        2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success

Both tests succeed and produce no warnings.  After the tests run, the
number of free/resv hugepages is correct.
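
The hack itself is not part of the patch; something along the following
lines is enough to exercise the new fallback path (a hypothetical sketch,
not the exact instrumentation used):

    /* At a point in mm/hugetlb.c where the reserve count is decremented: */
    WARN_ON_ONCE(h->resv_huge_pages == 0);
    h->resv_huge_pages--;

    /* In hugetlb_mcopy_atomic_pte(), right after copy_huge_page_from_user(): */
    ret = 1;  /* pretend some bytes failed to copy, forcing the -ENOENT path */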

[yuehaibing@huawei.com: remove set but not used variable 'vm_alloc_shared']
Link: https://lkml.kernel.org/r/20210601141610.28332-1-yuehaibing@huawei.com
[almasrymina@google.com: fix allocation error check and copy func name]
Link: https://lkml.kernel.org/r/20210605010626.1459873-1-almasrymina@google.com
Link: https://lkml.kernel.org/r/20210528005029.88088-1-almasrymina@google.com
Signed-off-by: Mina Almasry <almasrymina@google.com>
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
include/linux/migrate.h
mm/hugetlb.c
mm/migrate.c
mm/userfaultfd.c

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 4bb4e519e3f56db06fc0237754bf82b923a4fd87..7b7b7397727851d9ee573ba2bea79519eff8d1b0 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -51,6 +51,7 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
                                  struct page *newpage, struct page *page);
 extern int migrate_page_move_mapping(struct address_space *mapping,
                struct page *newpage, struct page *page, int extra_count);
+extern void copy_huge_page(struct page *dst, struct page *src);
 #else
 
 static inline void putback_movable_pages(struct list_head *l) {}
@@ -77,6 +78,9 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
        return -ENOSYS;
 }
 
+static inline void copy_huge_page(struct page *dst, struct page *src)
+{
+}
 #endif /* CONFIG_MIGRATION */
 
 #ifdef CONFIG_COMPACTION
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 88f2178ad7c992921aea02dd93471761d111b8ce..b14f4d1749b215fa55ec4d4bcf5b5c43acb6ed17 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -30,6 +30,7 @@
 #include <linux/numa.h>
 #include <linux/llist.h>
 #include <linux/cma.h>
+#include <linux/migrate.h>
 
 #include <asm/page.h>
 #include <asm/pgalloc.h>
@@ -5076,20 +5077,17 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                            struct page **pagep)
 {
        bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
-       struct address_space *mapping;
-       pgoff_t idx;
+       struct hstate *h = hstate_vma(dst_vma);
+       struct address_space *mapping = dst_vma->vm_file->f_mapping;
+       pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
        unsigned long size;
        int vm_shared = dst_vma->vm_flags & VM_SHARED;
-       struct hstate *h = hstate_vma(dst_vma);
        pte_t _dst_pte;
        spinlock_t *ptl;
-       int ret;
+       int ret = -ENOMEM;
        struct page *page;
        int writable;
 
-       mapping = dst_vma->vm_file->f_mapping;
-       idx = vma_hugecache_offset(h, dst_vma, dst_addr);
-
        if (is_continue) {
                ret = -EFAULT;
                page = find_lock_page(mapping, idx);
@@ -5118,12 +5116,44 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                /* fallback to copy_from_user outside mmap_lock */
                if (unlikely(ret)) {
                        ret = -ENOENT;
+                       /* Free the allocated page which may have
+                        * consumed a reservation.
+                        */
+                       restore_reserve_on_error(h, dst_vma, dst_addr, page);
+                       put_page(page);
+
+                       /* Allocate a temporary page to hold the copied
+                        * contents.
+                        */
+                       page = alloc_huge_page_vma(h, dst_vma, dst_addr);
+                       if (!page) {
+                               ret = -ENOMEM;
+                               goto out;
+                       }
                        *pagep = page;
-                       /* don't free the page */
+                       /* Set the outparam pagep and return to the caller to
+                        * copy the contents outside the lock. Don't free the
+                        * page.
+                        */
                        goto out;
                }
        } else {
-               page = *pagep;
+               if (vm_shared &&
+                   hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
+                       put_page(*pagep);
+                       ret = -EEXIST;
+                       *pagep = NULL;
+                       goto out;
+               }
+
+               page = alloc_huge_page(dst_vma, dst_addr, 0);
+               if (IS_ERR(page)) {
+                       ret = -ENOMEM;
+                       *pagep = NULL;
+                       goto out;
+               }
+               copy_huge_page(page, *pagep);
+               put_page(*pagep);
                *pagep = NULL;
        }
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 75a15f0a2698767afe9dbffdef6ab9e78af3830b..8fc766e52e527e127d17f0e1c1d6a8f54d0b15ee 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -553,7 +553,7 @@ static void __copy_gigantic_page(struct page *dst, struct page *src,
        }
 }
 
-static void copy_huge_page(struct page *dst, struct page *src)
+void copy_huge_page(struct page *dst, struct page *src)
 {
        int i;
        int nr_pages;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 63a73e164d5510fba964eec777b7486b634a1343..da5535d2d99033965df01c1c81d046be3029177d 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -209,7 +209,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
                                              unsigned long len,
                                              enum mcopy_atomic_mode mode)
 {
-       int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
        int vm_shared = dst_vma->vm_flags & VM_SHARED;
        ssize_t err;
        pte_t *dst_pte;
@@ -308,7 +307,6 @@ retry:
 
                mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                i_mmap_unlock_read(mapping);
-               vm_alloc_shared = vm_shared;
 
                cond_resched();
 
@@ -346,54 +344,8 @@ retry:
 out_unlock:
        mmap_read_unlock(dst_mm);
 out:
-       if (page) {
-               /*
-                * We encountered an error and are about to free a newly
-                * allocated huge page.
-                *
-                * Reservation handling is very subtle, and is different for
-                * private and shared mappings.  See the routine
-                * restore_reserve_on_error for details.  Unfortunately, we
-                * can not call restore_reserve_on_error now as it would
-                * require holding mmap_lock.
-                *
-                * If a reservation for the page existed in the reservation
-                * map of a private mapping, the map was modified to indicate
-                * the reservation was consumed when the page was allocated.
-                * We clear the HPageRestoreReserve flag now so that the global
-                * reserve count will not be incremented in free_huge_page.
-                * The reservation map will still indicate the reservation
-                * was consumed and possibly prevent later page allocation.
-                * This is better than leaking a global reservation.  If no
-                * reservation existed, it is still safe to clear
-                * HPageRestoreReserve as no adjustments to reservation counts
-                * were made during allocation.
-                *
-                * The reservation map for shared mappings indicates which
-                * pages have reservations.  When a huge page is allocated
-                * for an address with a reservation, no change is made to
-                * the reserve map.  In this case HPageRestoreReserve will be
-                * set to indicate that the global reservation count should be
-                * incremented when the page is freed.  This is the desired
-                * behavior.  However, when a huge page is allocated for an
-                * address without a reservation a reservation entry is added
-                * to the reservation map, and HPageRestoreReserve will not be
-                * set. When the page is freed, the global reserve count will
-                * NOT be incremented and it will appear as though we have
-                * leaked reserved page.  In this case, set HPageRestoreReserve
-                * so that the global reserve count will be incremented to
-                * match the reservation map entry which was created.
-                *
-                * Note that vm_alloc_shared is based on the flags of the vma
-                * for which the page was originally allocated.  dst_vma could
-                * be different or NULL on error.
-                */
-               if (vm_alloc_shared)
-                       SetHPageRestoreReserve(page);
-               else
-                       ClearHPageRestoreReserve(page);
+       if (page)
                put_page(page);
-       }
        BUG_ON(copied < 0);
        BUG_ON(err > 0);
        BUG_ON(!copied && !err);