mm: let mm_find_pmd fix buggy race with THP fault
authorHugh Dickins <hughd@google.com>
Mon, 23 Jun 2014 20:22:05 +0000 (13:22 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Mon, 23 Jun 2014 23:47:44 +0000 (16:47 -0700)
Trinity has reported:

    BUG: unable to handle kernel NULL pointer dereference at 0000000000000018
    IP: __lock_acquire (kernel/locking/lockdep.c:3070 (discriminator 1))
    CPU: 6 PID: 16173 Comm: trinity-c364 Tainted: G        W
                            3.15.0-rc1-next-20140415-sasha-00020-gaa90d09 #398
    lock_acquire (arch/x86/include/asm/current.h:14
                  kernel/locking/lockdep.c:3602)
    _raw_spin_lock (include/linux/spinlock_api_smp.h:143
                    kernel/locking/spinlock.c:151)
    remove_migration_pte (mm/migrate.c:137)
    rmap_walk (mm/rmap.c:1628 mm/rmap.c:1699)
    remove_migration_ptes (mm/migrate.c:224)
    migrate_pages (mm/migrate.c:922 mm/migrate.c:960 mm/migrate.c:1126)
    migrate_misplaced_page (mm/migrate.c:1733)
    __handle_mm_fault (mm/memory.c:3762 mm/memory.c:3812 mm/memory.c:3925)
    handle_mm_fault (mm/memory.c:3948)
    __get_user_pages (mm/memory.c:1851)
    __mlock_vma_pages_range (mm/mlock.c:255)
    __mm_populate (mm/mlock.c:711)
    SyS_mlockall (include/linux/mm.h:1799 mm/mlock.c:817 mm/mlock.c:791)

I believe this comes about because, whereas collapsing and splitting THP
functions take anon_vma lock in write mode (which excludes concurrent
rmap walks), faulting THP functions (write protection and misplaced
NUMA) do not - and mostly they do not need to.

But they do use a pmdp_clear_flush(), set_pmd_at() sequence which, for
an instant (indeed, for a long instant, given the inter-CPU TLB flush in
there), leaves *pmd neither present not trans_huge.

Which can confuse a concurrent rmap walk, as when removing migration
ptes, seen in the dumped trace.  Although that rmap walk has a 4k page
to insert, anon_vmas containing THPs are in no way segregated from
4k-page anon_vmas, so the 4k-intent mm_find_pmd() does need to cope with
that instant when a trans_huge pmd is temporarily absent.

I don't think we need strengthen the locking at the THP end: it's easily
handled with an ACCESS_ONCE() before testing both conditions.

And since mm_find_pmd() had only one caller who wanted a THP rather than
a pmd, let's slightly repurpose it to fail when it hits a THP or
non-present pmd, and open code split_huge_page_address() again.

Signed-off-by: Hugh Dickins <hughd@google.com>
Reported-by: Sasha Levin <sasha.levin@oracle.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Bob Liu <bob.liu@oracle.com>
Cc: Christoph Lameter <cl@gentwo.org>
Cc: Dave Jones <davej@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
mm/huge_memory.c
mm/ksm.c
mm/migrate.c
mm/rmap.c

index bade35ef563b084d7b9c38be10dc8747c168591a..33514d88fef9b041cef11c74717091eec4805f80 100644 (file)
@@ -2423,8 +2423,6 @@ static void collapse_huge_page(struct mm_struct *mm,
        pmd = mm_find_pmd(mm, address);
        if (!pmd)
                goto out;
-       if (pmd_trans_huge(*pmd))
-               goto out;
 
        anon_vma_lock_write(vma->anon_vma);
 
@@ -2523,8 +2521,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
        pmd = mm_find_pmd(mm, address);
        if (!pmd)
                goto out;
-       if (pmd_trans_huge(*pmd))
-               goto out;
 
        memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
        pte = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -2877,12 +2873,22 @@ void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
 static void split_huge_page_address(struct mm_struct *mm,
                                    unsigned long address)
 {
+       pgd_t *pgd;
+       pud_t *pud;
        pmd_t *pmd;
 
        VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
 
-       pmd = mm_find_pmd(mm, address);
-       if (!pmd)
+       pgd = pgd_offset(mm, address);
+       if (!pgd_present(*pgd))
+               return;
+
+       pud = pud_offset(pgd, address);
+       if (!pud_present(*pud))
+               return;
+
+       pmd = pmd_offset(pud, address);
+       if (!pmd_present(*pmd))
                return;
        /*
         * Caller holds the mmap_sem write mode, so a huge pmd cannot
index 68710e80994afed815c58b59a1a3cf421df8101f..346ddc9e4c0da44ed0c24b63d49cd40bc3d813de 100644 (file)
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -945,7 +945,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
        pmd = mm_find_pmd(mm, addr);
        if (!pmd)
                goto out;
-       BUG_ON(pmd_trans_huge(*pmd));
 
        mmun_start = addr;
        mmun_end   = addr + PAGE_SIZE;
index 63f0cd5599997ef8d1a42110c1fc9ccd6d920ac1..9e0beaa918454abbcd63e94ee6cefb5f108f751f 100644 (file)
@@ -120,8 +120,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
                pmd = mm_find_pmd(mm, addr);
                if (!pmd)
                        goto out;
-               if (pmd_trans_huge(*pmd))
-                       goto out;
 
                ptep = pte_offset_map(pmd, addr);
 
index bf05fc872ae822cda0b5bd7b6fa473a824870009..b7e94ebbd09e88c3b356e36fe89ed72b89e14474 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -569,6 +569,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd = NULL;
+       pmd_t pmde;
 
        pgd = pgd_offset(mm, address);
        if (!pgd_present(*pgd))
@@ -579,7 +580,13 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
                goto out;
 
        pmd = pmd_offset(pud, address);
-       if (!pmd_present(*pmd))
+       /*
+        * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at()
+        * without holding anon_vma lock for write.  So when looking for a
+        * genuine pmde (in which to find pte), test present and !THP together.
+        */
+       pmde = ACCESS_ONCE(*pmd);
+       if (!pmd_present(pmde) || pmd_trans_huge(pmde))
                pmd = NULL;
 out:
        return pmd;
@@ -615,9 +622,6 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
        if (!pmd)
                return NULL;
 
-       if (pmd_trans_huge(*pmd))
-               return NULL;
-
        pte = pte_offset_map(pmd, address);
        /* Make a quick check before getting the lock */
        if (!sync && !pte_present(*pte)) {