mm/rmap, migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable
diff --git a/mm/huge_memory.c b/mm/huge_memory.c (linux-2.6-block.git)
index 199b261a257e77811a38e9ec9015068ed6d5f733..a24c9cb9c83eb7c82b0c6aabd28eeefacb7cfb75 100644
@@ -600,7 +600,7 @@ out:
 }
 __setup("transparent_hugepage=", setup_transparent_hugepage);
 
-static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
+pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 {
        if (likely(vma->vm_flags & VM_WRITE))
                pmd = pmd_mkwrite(pmd);
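Dropping the "static" here exports maybe_pmd_mkwrite() so the new THP migration path in mm/migrate.c can reuse it when constructing the huge PMD for the destination page. A hedged sketch of such a call site (the migrate-side code is not part of this hunk, so new_page and the exact line order are illustrative):

	entry = mk_pmd(new_page, vma->vm_page_prot);	/* huge PMD for the copied page */
	entry = pmd_mknonnuma(entry);			/* drop the NUMA hinting bit */
	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); /* writable only if VM_WRITE */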
@@ -1023,10 +1023,12 @@ out:
 int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                                unsigned long addr, pmd_t pmd, pmd_t *pmdp)
 {
-       struct page *page = NULL;
+       struct page *page;
        unsigned long haddr = addr & HPAGE_PMD_MASK;
        int target_nid;
        int current_nid = -1;
+       bool migrated = false;
+       bool page_locked = false;
 
        spin_lock(&mm->page_table_lock);
        if (unlikely(!pmd_same(pmd, *pmdp)))
@@ -1034,42 +1036,61 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
        page = pmd_page(pmd);
        get_page(page);
-       spin_unlock(&mm->page_table_lock);
        current_nid = page_to_nid(page);
        count_vm_numa_event(NUMA_HINT_FAULTS);
        if (current_nid == numa_node_id())
                count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
 
        target_nid = mpol_misplaced(page, vma, haddr);
-       if (target_nid == -1)
+       if (target_nid == -1) {
+               put_page(page);
                goto clear_pmdnuma;
+       }
 
-       /*
-        * Due to lacking code to migrate thp pages, we'll split
-        * (which preserves the special PROT_NONE) and re-take the
-        * fault on the normal pages.
-        */
-       split_huge_page(page);
-       put_page(page);
-
-       return 0;
+       /* Acquire the page lock to serialise THP migrations */
+       spin_unlock(&mm->page_table_lock);
+       lock_page(page);
+       page_locked = true;
 
-clear_pmdnuma:
+       /* Confirm the PMD did not change while the page_table_lock was dropped */
        spin_lock(&mm->page_table_lock);
-       if (unlikely(!pmd_same(pmd, *pmdp)))
+       if (unlikely(!pmd_same(pmd, *pmdp))) {
+               unlock_page(page);
+               put_page(page);
                goto out_unlock;
+       }
+       spin_unlock(&mm->page_table_lock);
+
+       /* Migrate the THP to the requested node */
+       migrated = migrate_misplaced_transhuge_page(mm, vma,
+                               pmdp, pmd, addr,
+                               page, target_nid);
+       if (migrated) {
+               current_nid = target_nid;
+       } else {
+               spin_lock(&mm->page_table_lock);
+               if (unlikely(!pmd_same(pmd, *pmdp))) {
+                       unlock_page(page);
+                       goto out_unlock;
+               }
+               goto clear_pmdnuma;
+       }
+
+       task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
+       return 0;
 
+clear_pmdnuma:
        pmd = pmd_mknonnuma(pmd);
        set_pmd_at(mm, haddr, pmdp, pmd);
        VM_BUG_ON(pmd_numa(*pmdp));
        update_mmu_cache_pmd(vma, addr, pmdp);
+       if (page_locked)
+               unlock_page(page);
 
 out_unlock:
        spin_unlock(&mm->page_table_lock);
-       if (page) {
-               put_page(page);
-               task_numa_fault(numa_node_id(), HPAGE_PMD_NR, false);
-       }
+       if (current_nid != -1)
+               task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
        return 0;
 }
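The locking is the subtle part of this hunk. page_table_lock is a spinlock and lock_page() may sleep, so the fault handler drops the spinlock before taking the page lock, then re-takes it and revalidates the PMD with pmd_same(), since another thread may have changed the mapping in the window. A minimal sketch of the pattern as a hypothetical helper (not code from this patch):

	/*
	 * Drop the PTL, take the sleeping page lock, re-take the PTL and
	 * confirm the PMD is unchanged. Returns false (page unlocked) if
	 * the mapping changed while we slept.
	 */
	static bool thp_lock_page_revalidate(struct mm_struct *mm, struct page *page,
					     pmd_t orig_pmd, pmd_t *pmdp)
	{
		spin_unlock(&mm->page_table_lock);	/* cannot sleep under a spinlock */
		lock_page(page);			/* serialises against THP migration */
		spin_lock(&mm->page_table_lock);
		if (unlikely(!pmd_same(orig_pmd, *pmdp))) {
			unlock_page(page);
			return false;
		}
		return true;
	}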
 
@@ -1271,7 +1292,7 @@ static int __split_huge_page_splitting(struct page *page,
                 * We can't temporarily set the pmd to null in order
                 * to split it, the pmd must remain marked huge at all
                 * times or the VM won't take the pmd_trans_huge paths
-                * and it won't wait on the anon_vma->root->mutex to
+                * and it won't wait on the anon_vma->root->rwsem to
                 * serialize against split_huge_page*.
                 */
                pmdp_splitting_flush(vma, address, pmd);
@@ -1474,7 +1495,7 @@ static int __split_huge_page_map(struct page *page,
        return ret;
 }
 
-/* must be called with anon_vma->root->mutex hold */
+/* must be called with anon_vma->root->rwsem held */
 static void __split_huge_page(struct page *page,
                              struct anon_vma *anon_vma)
 {
@@ -1528,7 +1549,7 @@ int split_huge_page(struct page *page)
        int ret = 1;
 
        BUG_ON(!PageAnon(page));
-       anon_vma = page_lock_anon_vma(page);
+       anon_vma = page_lock_anon_vma_read(page);
        if (!anon_vma)
                goto out;
        ret = 0;
@@ -1541,7 +1562,7 @@ int split_huge_page(struct page *page)
 
        BUG_ON(PageCompound(page));
 out_unlock:
-       page_unlock_anon_vma(anon_vma);
+       page_unlock_anon_vma_read(anon_vma);
 out:
        return ret;
 }
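page_lock_anon_vma_read()/page_unlock_anon_vma_read() come from the anon_vma mutex-to-rwsem conversion this series is built on: rmap walkers that only read the anon_vma chain can now run in parallel. Stripped of the RCU and refcounting the real helpers need to keep the anon_vma alive, the read side reduces to roughly this (a simplified sketch, not the actual implementation):

	/* Simplified: the real helper pins the anon_vma with RCU + a refcount */
	static struct anon_vma *anon_vma_read_lock_sketch(struct page *page)
	{
		struct anon_vma *anon_vma = page_anon_vma(page);

		if (anon_vma)
			down_read(&anon_vma->root->rwsem);	/* shared: walkers run concurrently */
		return anon_vma;
	}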
@@ -2053,7 +2074,7 @@ static void collapse_huge_page(struct mm_struct *mm,
        if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
                goto out;
 
-       anon_vma_lock(vma->anon_vma);
+       anon_vma_lock_write(vma->anon_vma);
 
        pte = pte_offset_map(pmd, address);
        ptl = pte_lockptr(mm, pmd);
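anon_vma_lock() becomes anon_vma_lock_write() because collapse_huge_page() modifies the rmap and must exclude all the new read-side walkers. After the rwsem conversion, the lock helpers amount to the following (simplified from this series; the exact definitions in include/linux/rmap.h may carry extra annotations):

	static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
	{
		down_write(&anon_vma->root->rwsem);	/* exclusive: rmap is being changed */
	}

	static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
	{
		down_read(&anon_vma->root->rwsem);	/* shared: read-only rmap walk */
	}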