mm: THP page cache support for ppc64
[linux-2.6-block.git] mm/memory.c
index e18c57bdc75c4c96e3ef79546afe11fab5a3c07a..32e9b7aec36680ce9c7c9492e05e0359487e8998 100644 (file)
@@ -300,15 +300,14 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
        struct mmu_gather_batch *batch;
 
        VM_BUG_ON(!tlb->end);
-
-       if (!tlb->page_size)
-               tlb->page_size = page_size;
-       else {
-               if (page_size != tlb->page_size)
-                       return true;
-       }
+       VM_WARN_ON(tlb->page_size != page_size);
 
        batch = tlb->active;
+       /*
+        * Add the page first, then check whether the batch is
+        * full. If so, force a flush.
+        */
+       batch->pages[batch->nr++] = page;
        if (batch->nr == batch->max) {
                if (!tlb_next_batch(tlb))
                        return true;
@@ -316,7 +315,6 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
        }
        VM_BUG_ON_PAGE(batch->nr > batch->max, page);
 
-       batch->pages[batch->nr++] = page;
        return false;
 }
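The page-size bookkeeping moves out of this function: callers now announce a size change up front via tlb_remove_check_page_size_change() (see free_pgd_range() and zap_pte_range() below), and the VM_WARN_ON merely catches callers that forget. A sketch of that helper's intended semantics, assuming the generic mmu_gather version added elsewhere in this series (the exact body in include/asm-generic/tlb.h may differ):

	static inline void
	tlb_remove_check_page_size_change(struct mmu_gather *tlb,
					  unsigned int page_size)
	{
		/* A different size is already batched: flush it out first */
		if (tlb->page_size && tlb->page_size != page_size) {
			if (!tlb->fullmm)
				tlb_flush_mmu(tlb);
		}
		tlb->page_size = page_size;
	}

Note also that the page is now stored in the batch before the full-batch check, so a true return means the page is already queued and the caller only has to flush.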
 
@@ -528,7 +526,11 @@ void free_pgd_range(struct mmu_gather *tlb,
                end -= PMD_SIZE;
        if (addr > end - 1)
                return;
-
+       /*
+        * We add page table cache pages with PAGE_SIZE (see
+        * pte_free_tlb()), so flush the TLB first if a different
+        * page size was being gathered.
+        */
+       tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
        pgd = pgd_offset(tlb->mm, addr);
        do {
                next = pgd_addr_end(addr, end);
@@ -1118,8 +1120,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
        pte_t *start_pte;
        pte_t *pte;
        swp_entry_t entry;
-       struct page *pending_page = NULL;
 
+       tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
 again:
        init_rss_vec(rss);
        start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
@@ -1172,7 +1174,6 @@ again:
                                print_bad_pte(vma, addr, ptent, page);
                        if (unlikely(__tlb_remove_page(tlb, page))) {
                                force_flush = 1;
-                               pending_page = page;
                                addr += PAGE_SIZE;
                                break;
                        }
@@ -1213,11 +1214,6 @@ again:
        if (force_flush) {
                force_flush = 0;
                tlb_flush_mmu_free(tlb);
-               if (pending_page) {
-                       /* remove the page with new size */
-                       __tlb_remove_pte_page(tlb, pending_page);
-                       pending_page = NULL;
-               }
                if (addr != end)
                        goto again;
        }
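With the page size announced once at the top of zap_pte_range(), and with __tlb_remove_page_size() queueing the page before it reports a full batch, a true return from __tlb_remove_page() no longer leaves one page unbatched. The pending_page dance existed solely to re-add that leftover page after the flush, so it is removed.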
@@ -1240,7 +1236,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
                        if (next - addr != HPAGE_PMD_SIZE) {
                                VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
                                    !rwsem_is_locked(&tlb->mm->mmap_sem), vma);
-                               split_huge_pmd(vma, pmd, addr);
+                               __split_huge_pmd(vma, pmd, addr, false, NULL);
                        } else if (zap_huge_pmd(tlb, vma, pmd, addr))
                                goto next;
                        /* fall through */
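For reference, split_huge_pmd() is a wrapper that re-checks the pmd with an unlocked read before calling __split_huge_pmd(); quoted roughly from include/linux/huge_mm.h of this era:

	#define split_huge_pmd(__vma, __pmd, __address)			\
		do {							\
			pmd_t *____pmd = (__pmd);			\
			if (pmd_trans_huge(*____pmd)			\
					|| pmd_devmap(*____pmd))	\
				__split_huge_pmd(__vma, __pmd,		\
						__address, false, NULL); \
		} while (0)

Calling __split_huge_pmd() directly skips that racy re-check. The likely motivation (an inference, not stated in this hunk) is that with arch_needs_pgtable_deposit() a file-backed huge pmd carries a deposited page table that the split path must still release even if a concurrent operation has already changed the pmd. The same substitution is made in wp_huge_pmd() at the end of this patch.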
@@ -1637,8 +1633,8 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
 
        if (addr < vma->vm_start || addr >= vma->vm_end)
                return -EFAULT;
-       if (track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)))
-               return -EINVAL;
+
+       track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
 
        ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);
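This hunk comes from a companion x86/PAT change that makes track_pfn_insert() void: the function only derives the cache mode for the pgprot and cannot fail, so this caller and vm_insert_mixed() below stop checking a return value. A rough sketch of the x86 side in arch/x86/mm/pat.c, quoted from memory and possibly differing in detail:

	void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
			      pfn_t pfn)
	{
		enum page_cache_mode pcm;

		if (!pat_enabled())
			return;

		/* Fold the tracked memtype into the protection bits */
		pcm = lookup_memtype(pfn_t_to_phys(pfn));
		*prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
				 cachemode2protval(pcm));
	}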
 
@@ -1655,8 +1651,8 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
 
        if (addr < vma->vm_start || addr >= vma->vm_end)
                return -EFAULT;
-       if (track_pfn_insert(vma, &pgprot, pfn))
-               return -EINVAL;
+
+       track_pfn_insert(vma, &pgprot, pfn);
 
        /*
         * If we don't have pte special, then we have to use the pfn_valid()
@@ -2939,6 +2935,19 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
        return true;
 }
 
+static void deposit_prealloc_pte(struct fault_env *fe)
+{
+       struct vm_area_struct *vma = fe->vma;
+
+       pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte);
+       /*
+        * We are about to consume the preallocated table;
+        * account for it in nr_ptes.
+        */
+       atomic_long_inc(&vma->vm_mm->nr_ptes);
+       fe->prealloc_pte = 0;
+}
+
 static int do_set_pmd(struct fault_env *fe, struct page *page)
 {
        struct vm_area_struct *vma = fe->vma;
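deposit_prealloc_pte() transfers ownership of the preallocated pte page to the huge pmd, which is why it bumps nr_ptes and clears fe->prealloc_pte: otherwise the common cleanup added to alloc_set_pte() below would free a table that is still deposited. For reference, the generic deposit in mm/pgtable-generic.c keeps a per-pmd FIFO of deposited tables under the pmd lock (ppc64 supplies its own variant that also keeps hash PTE slot information in the table):

	void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
					pgtable_t pgtable)
	{
		assert_spin_locked(pmd_lockptr(mm, pmdp));

		/* FIFO */
		if (!pmd_huge_pte(mm, pmdp))
			INIT_LIST_HEAD(&pgtable->lru);
		else
			list_add(&pgtable->lru,
				 &pmd_huge_pte(mm, pmdp)->lru);
		pmd_huge_pte(mm, pmdp) = pgtable;
	}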
@@ -2953,6 +2962,17 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
        ret = VM_FAULT_FALLBACK;
        page = compound_head(page);
 
+       /*
+        * Archs like ppc64 need additional space to store information
+        * related to pte entry. Use the preallocated table for that.
+        */
+       if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) {
+               fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address);
+               if (!fe->prealloc_pte)
+                       return VM_FAULT_OOM;
+               smp_wmb(); /* See comment in __pte_alloc() */
+       }
+
        fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
        if (unlikely(!pmd_none(*fe->pmd)))
                goto out;
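arch_needs_pgtable_deposit() is introduced by this series. The generic fallback is false; ppc64 book3s overrides it so that the hash MMU, which stores hash PTE slot details in the deposited table, returns true. A sketch under that assumption (the exact book3s64 definition lives in the powerpc part of the series):

	/* include/asm-generic/pgtable.h: default for all architectures */
	#ifndef arch_needs_pgtable_deposit
	#define arch_needs_pgtable_deposit() (false)
	#endif

	/* arch/powerpc/include/asm/book3s/64/pgtable.h (sketch) */
	#define arch_needs_pgtable_deposit arch_needs_pgtable_deposit
	static inline bool arch_needs_pgtable_deposit(void)
	{
		/* Radix has no use for the extra hash-slot storage */
		if (radix_enabled())
			return false;
		return true;
	}

The smp_wmb() mirrors the one in __pte_alloc(): the zeroed contents of the new table must be visible to other CPUs before the table can become reachable through a pmd.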
@@ -2966,6 +2986,11 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
 
        add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR);
        page_add_file_rmap(page, true);
+       /*
+        * Deposit and withdraw must both happen with the pmd lock held.
+        */
+       if (arch_needs_pgtable_deposit())
+               deposit_prealloc_pte(fe);
 
        set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
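Depositing before set_pmd_at(), under the same pmd lock, guarantees that by the time the huge pmd is visible, any zap or split that clears it will find a table to withdraw.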
 
@@ -2975,6 +3000,13 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
        ret = 0;
        count_vm_event(THP_FILE_MAPPED);
 out:
+       /*
+        * If we are going to fall back to a pte mapping, do the
+        * withdraw with the pmd lock held.
+        */
+       if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK)
+               fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm,
+                                                              fe->pmd);
        spin_unlock(fe->ptl);
        return ret;
 }
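On the fallback path the withdrawn table lands back in fe->prealloc_pte, where it is either consumed by pte_alloc_one_map() as alloc_set_pte() retries with a pte mapping, or freed by the cleanup alloc_set_pte() gains below.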
@@ -3014,18 +3046,20 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
 
                ret = do_set_pmd(fe, page);
                if (ret != VM_FAULT_FALLBACK)
-                       return ret;
+                       goto fault_handled;
        }
 
        if (!fe->pte) {
                ret = pte_alloc_one_map(fe);
                if (ret)
-                       return ret;
+                       goto fault_handled;
        }
 
        /* Re-check under ptl */
-       if (unlikely(!pte_none(*fe->pte)))
-               return VM_FAULT_NOPAGE;
+       if (unlikely(!pte_none(*fe->pte))) {
+               ret = VM_FAULT_NOPAGE;
+               goto fault_handled;
+       }
 
        flush_icache_page(vma, page);
        entry = mk_pte(page, vma->vm_page_prot);
@@ -3045,8 +3079,15 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
 
        /* no need to invalidate: a not-present page won't be cached */
        update_mmu_cache(vma, fe->address, fe->pte);
+       ret = 0;
 
-       return 0;
+fault_handled:
+       /* preallocated pagetable is unused: free it */
+       if (fe->prealloc_pte) {
+               pte_free(fe->vma->vm_mm, fe->prealloc_pte);
+               fe->prealloc_pte = 0;
+       }
+       return ret;
 }
 
 static unsigned long fault_around_bytes __read_mostly =
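The cleanup moves from do_fault_around() into alloc_set_pte() itself so that every exit path releases an unused preallocated table, covering both the table do_fault_around() preallocates and the one do_set_pmd() now allocates for the deposit.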
@@ -3145,11 +3186,6 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
 
        fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff);
 
-       /* preallocated pagetable is unused: free it */
-       if (fe->prealloc_pte) {
-               pte_free(fe->vma->vm_mm, fe->prealloc_pte);
-               fe->prealloc_pte = 0;
-       }
        /* Huge page is mapped? Page fault is solved */
        if (pmd_trans_huge(*fe->pmd)) {
                ret = VM_FAULT_NOPAGE;
@@ -3454,7 +3490,7 @@ static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd)
 
        /* COW handled on pte level: split pmd */
        VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma);
-       split_huge_pmd(fe->vma, fe->pmd, fe->address);
+       __split_huge_pmd(fe->vma, fe->pmd, fe->address, false, NULL);
 
        return VM_FAULT_FALLBACK;
 }