Merge branch 'entropy'
[linux-2.6-block.git] / mm / khugepaged.c
index ccede2425c3f88da0529a1040025771539748e8b..0a1b4b484ac5b4a0eed5e5148f04849d9e09607b 100644 (file)
@@ -48,6 +48,7 @@ enum scan_result {
        SCAN_CGROUP_CHARGE_FAIL,
        SCAN_EXCEED_SWAP_PTE,
        SCAN_TRUNCATED,
+       SCAN_PAGE_HAS_PRIVATE,
 };
 
 #define CREATE_TRACE_POINTS
@@ -76,6 +77,8 @@ static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
 
 static struct kmem_cache *mm_slot_cache __read_mostly;
 
+#define MAX_PTE_MAPPED_THP 8
+
 /**
  * struct mm_slot - hash lookup from mm to mm_slot
  * @hash: hash collision list
@@ -86,6 +89,10 @@ struct mm_slot {
        struct hlist_node hash;
        struct list_head mm_node;
        struct mm_struct *mm;
+
+       /* pte-mapped THP in this mm */
+       int nr_pte_mapped_thp;
+       unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP];
 };
 
 /**
@@ -404,7 +411,11 @@ static bool hugepage_vma_check(struct vm_area_struct *vma,
            (vm_flags & VM_NOHUGEPAGE) ||
            test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
                return false;
-       if (shmem_file(vma->vm_file)) {
+
+       if (shmem_file(vma->vm_file) ||
+           (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
+            vma->vm_file &&
+            (vm_flags & VM_DENYWRITE))) {
                if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
                        return false;
                return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
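The IS_ALIGNED() test above encodes the basic eligibility rule for file THP: a PMD can only map a range whose starting virtual page and starting file page are congruent modulo the huge page size. A minimal userspace sketch, assuming x86-64 constants (PAGE_SHIFT = 12, HPAGE_PMD_NR = 512); hugepage_aligned() is a hypothetical helper mirroring the expression, not kernel API:

```c
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT   12		/* assumed: 4KB base pages */
#define HPAGE_PMD_NR 512	/* assumed: 2MB huge page / 4KB */

/* hypothetical helper mirroring the IS_ALIGNED() expression above */
static bool hugepage_aligned(unsigned long vm_start, unsigned long vm_pgoff)
{
	return (((vm_start >> PAGE_SHIFT) - vm_pgoff) % HPAGE_PMD_NR) == 0;
}

int main(void)
{
	/* 2MB-aligned mapping of a file from offset 0: eligible */
	printf("%d\n", hugepage_aligned(0x200000, 0));	/* prints 1 */
	/* same address, but the mapping starts one page into the file */
	printf("%d\n", hugepage_aligned(0x200000, 1));	/* prints 0 */
	return 0;
}
```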
@@ -456,8 +467,9 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
        unsigned long hstart, hend;
 
        /*
-        * khugepaged does not yet work on non-shmem files or special
-        * mappings. And file-private shmem THP is not supported.
+        * For non-shmem files, khugepaged only supports read-only
+        * mappings. khugepaged does not yet work on special mappings.
+        * And file-private shmem THP is not supported.
         */
        if (!hugepage_vma_check(vma, vm_flags))
                return 0;
@@ -1248,6 +1260,159 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
 }
 
 #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
+/*
+ * Notify khugepaged that the given address of the mm is backed by a
+ * pte-mapped THP. khugepaged should then try to collapse the page table.
+ */
+static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
+                                        unsigned long addr)
+{
+       struct mm_slot *mm_slot;
+
+       VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
+
+       spin_lock(&khugepaged_mm_lock);
+       mm_slot = get_mm_slot(mm);
+       if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP))
+               mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
+       spin_unlock(&khugepaged_mm_lock);
+       return 0;
+}
+
+/**
+ * collapse_pte_mapped_thp - try to collapse a pte-mapped THP for mm at
+ * address @addr
+ *
+ * This function checks whether all the PTEs in the PMD are pointing to the
+ * right THP. If so, retract the page table so the THP can be refaulted in
+ * as pmd-mapped.
+void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
+{
+       unsigned long haddr = addr & HPAGE_PMD_MASK;
+       struct vm_area_struct *vma = find_vma(mm, haddr);
+       struct page *hpage = NULL;
+       pte_t *start_pte, *pte;
+       pmd_t *pmd, _pmd;
+       spinlock_t *ptl;
+       int count = 0;
+       int i;
+
+       if (!vma || !vma->vm_file ||
+           vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE)
+               return;
+
+       /*
+        * This vma's vm_flags may not have VM_HUGEPAGE if the page was
+        * not collapsed by this mm. But we can still collapse if the
+        * page is a valid THP. Add extra VM_HUGEPAGE so that
+        * hugepage_vma_check() will not fail the vma for missing
+        * VM_HUGEPAGE.
+        */
+       if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
+               return;
+
+       pmd = mm_find_pmd(mm, haddr);
+       if (!pmd)
+               return;
+
+       start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+
+       /* step 1: check that all mapped PTEs point to the right huge page */
+       for (i = 0, addr = haddr, pte = start_pte;
+            i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+               struct page *page;
+
+               /* empty pte, skip */
+               if (pte_none(*pte))
+                       continue;
+
+               /* page swapped out, abort */
+               if (!pte_present(*pte))
+                       goto abort;
+
+               page = vm_normal_page(vma, addr, *pte);
+
+               if (!page || !PageCompound(page))
+                       goto abort;
+
+               if (!hpage) {
+                       hpage = compound_head(page);
+                       /*
+                        * The mapping of the THP should not change.
+                        *
+                        * Note that uprobe, debugger, or MAP_PRIVATE may
+                        * change the page table, but the new page will
+                        * not pass PageCompound() check.
+                        */
+                       if (WARN_ON(hpage->mapping != vma->vm_file->f_mapping))
+                               goto abort;
+               }
+
+               /*
+                * Confirm the page maps to the correct subpage (the same
+                * note above about uprobe, debugger, and MAP_PRIVATE
+                * applies here).
+                */
+               if (WARN_ON(hpage + i != page))
+                       goto abort;
+               count++;
+       }
+
+       /* step 2: adjust rmap */
+       for (i = 0, addr = haddr, pte = start_pte;
+            i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+               struct page *page;
+
+               if (pte_none(*pte))
+                       continue;
+               page = vm_normal_page(vma, addr, *pte);
+               page_remove_rmap(page, false);
+       }
+
+       pte_unmap_unlock(start_pte, ptl);
+
+       /* step 3: set proper refcount and mm_counters. */
+       if (hpage) {
+               page_ref_sub(hpage, count);
+               add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
+       }
+
+       /* step 4: collapse pmd */
+       ptl = pmd_lock(vma->vm_mm, pmd);
+       /* addr was advanced by the loops above: flush at haddr */
+       _pmd = pmdp_collapse_flush(vma, haddr, pmd);
+       spin_unlock(ptl);
+       mm_dec_nr_ptes(mm);
+       pte_free(mm, pmd_pgtable(_pmd));
+       return;
+
+abort:
+       pte_unmap_unlock(start_pte, ptl);
+}
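Note the shape of collapse_pte_mapped_thp(): step 1 only validates, so the abort path never has to undo anything, and steps 2-4 run only once every PTE has been checked. A minimal userspace sketch of that validate-then-mutate pattern (types and names are illustrative, not kernel code):

```c
#include <stdbool.h>
#include <stddef.h>

struct slot {
	bool present;		/* like a non-none pte */
	const void *target;	/* like the page a pte points at */
};

static bool collapse_slots(struct slot *slots, size_t n, const void *expected)
{
	size_t i;

	/* pass 1: validate; bailing out here leaves every slot intact */
	for (i = 0; i < n; i++) {
		if (!slots[i].present)
			continue;	/* holes are fine, like pte_none() */
		if (slots[i].target != expected)
			return false;	/* like "goto abort" above */
	}

	/* pass 2: mutate; every slot already passed validation */
	for (i = 0; i < n; i++)
		slots[i].present = false;
	return true;
}
```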
+
+static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+{
+       struct mm_struct *mm = mm_slot->mm;
+       int i;
+
+       if (likely(mm_slot->nr_pte_mapped_thp == 0))
+               return 0;
+
+       if (!down_write_trylock(&mm->mmap_sem))
+               return -EBUSY;
+
+       if (unlikely(khugepaged_test_exit(mm)))
+               goto out;
+
+       for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++)
+               collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]);
+
+out:
+       mm_slot->nr_pte_mapped_thp = 0;
+       up_write(&mm->mmap_sem);
+       return 0;
+}
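Together with khugepaged_add_pte_mapped_thp() above, this gives a record-then-drain scheme: addresses that could not be collapsed right away are stashed in the bounded pte_mapped_thp[] array and retried on the next pass over the mm_slot; entries that do not fit are silently dropped, which only means the THP stays pte-mapped a while longer. A userspace sketch of the pattern, with a pthread mutex standing in for khugepaged_mm_lock (names are illustrative, and the kernel additionally drains under down_write(mmap_sem)):

```c
#include <pthread.h>

#define MAX_DEFERRED 8	/* mirrors MAX_PTE_MAPPED_THP */

static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long deferred[MAX_DEFERRED];
static int nr_deferred;

static void record_deferred(unsigned long addr)
{
	pthread_mutex_lock(&slot_lock);
	if (nr_deferred < MAX_DEFERRED)	/* silently drop on overflow */
		deferred[nr_deferred++] = addr;
	pthread_mutex_unlock(&slot_lock);
}

static void drain_deferred(void (*collapse)(unsigned long addr))
{
	int i;

	pthread_mutex_lock(&slot_lock);
	for (i = 0; i < nr_deferred; i++)
		collapse(deferred[i]);
	nr_deferred = 0;	/* reset even if an attempt bailed out */
	pthread_mutex_unlock(&slot_lock);
}
```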
+
 static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 {
        struct vm_area_struct *vma;
@@ -1256,7 +1421,22 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 
        i_mmap_lock_write(mapping);
        vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
-               /* probably overkill */
+               /*
+                * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
+                * got written to. These VMAs are likely not worth investing
+                * down_write(mmap_sem) in, as the PMD mapping is likely to
+                * be split again later.
+                *
+                * Note that the vma->anon_vma check is racy: it can be set
+                * up after the check but before we take mmap_sem, by the
+                * fault path. But the page lock prevents establishing any
+                * new ptes of the page, so we are safe.
+                *
+                * An alternative would be to drop the check and instead
+                * check that the page table is clear before calling
+                * pmdp_collapse_flush() under ptl. That has a higher chance
+                * of recovering a THP for the VMA, but also a higher cost.
+                */
                if (vma->anon_vma)
                        continue;
                addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
@@ -1269,9 +1449,10 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
                        continue;
                /*
                 * We need exclusive mmap_sem to retract page table.
-                * If trylock fails we would end up with pte-mapped THP after
-                * re-fault. Not ideal, but it's more important to not disturb
-                * the system too much.
+                *
+                * We use trylock due to lock inversion: we need to acquire
+                * mmap_sem while holding the page lock. The fault path takes
+                * them in the reverse order, so trylock avoids the deadlock.
                 */
                if (down_write_trylock(&vma->vm_mm->mmap_sem)) {
                        spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
@@ -1281,18 +1462,21 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
                        up_write(&vma->vm_mm->mmap_sem);
                        mm_dec_nr_ptes(vma->vm_mm);
                        pte_free(vma->vm_mm, pmd_pgtable(_pmd));
+               } else {
+                       /* Try again later */
+                       khugepaged_add_pte_mapped_thp(vma->vm_mm, addr);
                }
        }
        i_mmap_unlock_write(mapping);
 }
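The trylock in retract_page_tables() is classic ABBA avoidance: khugepaged arrives holding the (new) page lock and needs mmap_sem, while the fault path acquires mmap_sem first and the page lock second. A minimal pthreads sketch of why an unconditional lock could deadlock and a trylock cannot (the two mutexes stand in for the page lock and mmap_sem; names are illustrative):

```c
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t mmap_lock = PTHREAD_MUTEX_INITIALIZER;

/* khugepaged side: entered with page_lock already held */
static bool retract_under_page_lock(void)
{
	if (pthread_mutex_trylock(&mmap_lock) != 0)
		return false;	/* contended: record and retry later */
	/* ... retract the page table ... */
	pthread_mutex_unlock(&mmap_lock);
	return true;
}

/* fault-path side: the opposite order, mmap_lock then page_lock */
static void fault_path(void)
{
	pthread_mutex_lock(&mmap_lock);
	pthread_mutex_lock(&page_lock);
	/* ... install the pte ... */
	pthread_mutex_unlock(&page_lock);
	pthread_mutex_unlock(&mmap_lock);
}
```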
 
 /**
- * collapse_shmem - collapse small tmpfs/shmem pages into huge one.
+ * collapse_file - collapse filemap/tmpfs/shmem pages into a huge one.
  *
  * Basic scheme is simple, details are more complex:
  *  - allocate and lock a new huge page;
  *  - scan page cache replacing old pages with the new one
- *    + swap in pages if necessary;
+ *    + swap in pages (shmem) or read them in (file) if necessary;
  *    + fill in gaps;
  *    + keep old pages around in case rollback is required;
  *  - if replacing succeeds:
@@ -1304,10 +1488,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
  *    + restore gaps in the page cache;
  *    + unlock and free huge page;
  */
-static void collapse_shmem(struct mm_struct *mm,
-               struct address_space *mapping, pgoff_t start,
+static void collapse_file(struct mm_struct *mm,
+               struct file *file, pgoff_t start,
                struct page **hpage, int node)
 {
+       struct address_space *mapping = file->f_mapping;
        gfp_t gfp;
        struct page *new_page;
        struct mem_cgroup *memcg;
@@ -1315,7 +1500,9 @@ static void collapse_shmem(struct mm_struct *mm,
        LIST_HEAD(pagelist);
        XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
        int nr_none = 0, result = SCAN_SUCCEED;
+       bool is_shmem = shmem_file(file);
 
+       VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
        VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
 
        /* Only allocate from the target node */
@@ -1347,7 +1534,8 @@ static void collapse_shmem(struct mm_struct *mm,
        } while (1);
 
        __SetPageLocked(new_page);
-       __SetPageSwapBacked(new_page);
+       if (is_shmem)
+               __SetPageSwapBacked(new_page);
        new_page->index = start;
        new_page->mapping = mapping;
 
@@ -1362,41 +1550,75 @@ static void collapse_shmem(struct mm_struct *mm,
                struct page *page = xas_next(&xas);
 
                VM_BUG_ON(index != xas.xa_index);
-               if (!page) {
-                       /*
-                        * Stop if extent has been truncated or hole-punched,
-                        * and is now completely empty.
-                        */
-                       if (index == start) {
-                               if (!xas_next_entry(&xas, end - 1)) {
-                                       result = SCAN_TRUNCATED;
+               if (is_shmem) {
+                       if (!page) {
+                               /*
+                                * Stop if extent has been truncated or
+                                * hole-punched, and is now completely
+                                * empty.
+                                */
+                               if (index == start) {
+                                       if (!xas_next_entry(&xas, end - 1)) {
+                                               result = SCAN_TRUNCATED;
+                                               goto xa_locked;
+                                       }
+                                       xas_set(&xas, index);
+                               }
+                               if (!shmem_charge(mapping->host, 1)) {
+                                       result = SCAN_FAIL;
                                        goto xa_locked;
                                }
-                               xas_set(&xas, index);
+                               xas_store(&xas, new_page);
+                               nr_none++;
+                               continue;
                        }
-                       if (!shmem_charge(mapping->host, 1)) {
-                               result = SCAN_FAIL;
+
+                       if (xa_is_value(page) || !PageUptodate(page)) {
+                               xas_unlock_irq(&xas);
+                               /* swap in or instantiate fallocated page */
+                               if (shmem_getpage(mapping->host, index, &page,
+                                                 SGP_NOHUGE)) {
+                                       result = SCAN_FAIL;
+                                       goto xa_unlocked;
+                               }
+                       } else if (trylock_page(page)) {
+                               get_page(page);
+                               xas_unlock_irq(&xas);
+                       } else {
+                               result = SCAN_PAGE_LOCK;
                                goto xa_locked;
                        }
-                       xas_store(&xas, new_page + (index % HPAGE_PMD_NR));
-                       nr_none++;
-                       continue;
-               }
-
-               if (xa_is_value(page) || !PageUptodate(page)) {
-                       xas_unlock_irq(&xas);
-                       /* swap in or instantiate fallocated page */
-                       if (shmem_getpage(mapping->host, index, &page,
-                                               SGP_NOHUGE)) {
+               } else {        /* !is_shmem */
+                       if (!page || xa_is_value(page)) {
+                               xas_unlock_irq(&xas);
+                               /* readahead size is in pages, not bytes */
+                               page_cache_sync_readahead(mapping, &file->f_ra,
+                                                         file, index,
+                                                         end - index);
+                               /* drain pagevecs to help isolate_lru_page() */
+                               lru_add_drain();
+                               page = find_lock_page(mapping, index);
+                               if (unlikely(page == NULL)) {
+                                       result = SCAN_FAIL;
+                                       goto xa_unlocked;
+                               }
+                       } else if (!PageUptodate(page)) {
+                               xas_unlock_irq(&xas);
+                               wait_on_page_locked(page);
+                               if (!trylock_page(page)) {
+                                       result = SCAN_PAGE_LOCK;
+                                       goto xa_unlocked;
+                               }
+                               get_page(page);
+                       } else if (PageDirty(page)) {
                                result = SCAN_FAIL;
-                               goto xa_unlocked;
+                               goto xa_locked;
+                       } else if (trylock_page(page)) {
+                               get_page(page);
+                               xas_unlock_irq(&xas);
+                       } else {
+                               result = SCAN_PAGE_LOCK;
+                               goto xa_locked;
                        }
-               } else if (trylock_page(page)) {
-                       get_page(page);
-                       xas_unlock_irq(&xas);
-               } else {
-                       result = SCAN_PAGE_LOCK;
-                       goto xa_locked;
                }
 
                /*
@@ -1425,6 +1647,12 @@ static void collapse_shmem(struct mm_struct *mm,
                        goto out_unlock;
                }
 
+               if (page_has_private(page) &&
+                   !try_to_release_page(page, GFP_KERNEL)) {
+                       result = SCAN_PAGE_HAS_PRIVATE;
+                       goto out_unlock;
+               }
+
                if (page_mapped(page))
                        unmap_mapping_pages(mapping, index, 1, false);
 
@@ -1454,7 +1682,7 @@ static void collapse_shmem(struct mm_struct *mm,
                list_add_tail(&page->lru, &pagelist);
 
                /* Finally, replace with the new page. */
-               xas_store(&xas, new_page + (index % HPAGE_PMD_NR));
+               xas_store(&xas, new_page);
                continue;
 out_unlock:
                unlock_page(page);
@@ -1462,12 +1690,20 @@ out_unlock:
                goto xa_unlocked;
        }
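The xas_store(&xas, new_page) above (replacing new_page + (index % HPAGE_PMD_NR)) leans on the XA_STATE_ORDER() set up earlier: one store covers the whole HPAGE_PMD_ORDER-aligned range, so a lookup at any subpage index resolves to the head page. A toy flat-array model of that multi-index semantic, assuming that behavior (illustrative only, not the XArray implementation):

```c
#include <assert.h>
#include <stddef.h>

#define HPAGE_PMD_ORDER 9
#define HPAGE_PMD_NR	(1 << HPAGE_PMD_ORDER)	/* 512 */

struct toy_xarray { void *slots[HPAGE_PMD_NR]; };

/* store one entry over a naturally aligned 2^order index range */
static void toy_store_order(struct toy_xarray *xa, size_t index,
			    unsigned int order, void *entry)
{
	size_t first = index & ~((1UL << order) - 1);
	size_t i;

	for (i = 0; i < (1UL << order); i++)
		xa->slots[first + i] = entry;
}

int main(void)
{
	static struct toy_xarray xa;
	int head_page;	/* stands in for the huge page */

	toy_store_order(&xa, 0, HPAGE_PMD_ORDER, &head_page);
	/* any subpage index now yields the head entry */
	assert(xa.slots[0] == &head_page);
	assert(xa.slots[HPAGE_PMD_NR - 1] == &head_page);
	return 0;
}
```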
 
-       __inc_node_page_state(new_page, NR_SHMEM_THPS);
+       if (is_shmem) {
+               __inc_node_page_state(new_page, NR_SHMEM_THPS);
+       } else {
+               __inc_node_page_state(new_page, NR_FILE_THPS);
+               filemap_nr_thps_inc(mapping);
+       }
+
        if (nr_none) {
                struct zone *zone = page_zone(new_page);
 
                __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none);
-               __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none);
+               if (is_shmem)
+                       __mod_node_page_state(zone->zone_pgdat,
+                                             NR_SHMEM, nr_none);
        }
 
 xa_locked:
@@ -1505,10 +1741,15 @@ xa_unlocked:
 
                SetPageUptodate(new_page);
                page_ref_add(new_page, HPAGE_PMD_NR - 1);
-               set_page_dirty(new_page);
                mem_cgroup_commit_charge(new_page, memcg, false, true);
+
+               if (is_shmem) {
+                       set_page_dirty(new_page);
+                       lru_cache_add_anon(new_page);
+               } else {
+                       lru_cache_add_file(new_page);
+               }
                count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1);
-               lru_cache_add_anon(new_page);
 
                /*
                 * Remove pte page tables, so we can re-fault the page as huge.
@@ -1523,7 +1764,9 @@ xa_unlocked:
                /* Something went wrong: roll back page cache changes */
                xas_lock_irq(&xas);
                mapping->nrpages -= nr_none;
-               shmem_uncharge(mapping->host, nr_none);
+
+               if (is_shmem)
+                       shmem_uncharge(mapping->host, nr_none);
 
                xas_set(&xas, start);
                xas_for_each(&xas, page, end - 1) {
@@ -1563,11 +1806,11 @@ out:
        /* TODO: tracepoints */
 }
 
-static void khugepaged_scan_shmem(struct mm_struct *mm,
-               struct address_space *mapping,
-               pgoff_t start, struct page **hpage)
+static void khugepaged_scan_file(struct mm_struct *mm,
+               struct file *file, pgoff_t start, struct page **hpage)
 {
        struct page *page = NULL;
+       struct address_space *mapping = file->f_mapping;
        XA_STATE(xas, &mapping->i_pages, start);
        int present, swap;
        int node = NUMA_NO_NODE;
@@ -1606,7 +1849,8 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
                        break;
                }
 
-               if (page_count(page) != 1 + page_mapcount(page)) {
+               if (page_count(page) !=
+                   1 + page_mapcount(page) + page_has_private(page)) {
                        result = SCAN_PAGE_COUNT;
                        break;
                }
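The reference check above is an exactness test: the page cache holds one reference, each PTE mapping holds one, and attached private data (buffer heads) pins one more; any surplus means an unknown user, so collapsing would be unsafe. Spelled out as a hypothetical helper (not kernel API), with made-up numbers:

```c
#include <stdbool.h>

static bool page_refs_expected(int page_count, int mapcount, bool has_private)
{
	/* one ref from the page cache, one per PTE map, one for private data */
	return page_count == 1 + mapcount + (has_private ? 1 : 0);
}

/*
 * e.g. a clean file page mapped by two tasks, with buffer heads attached:
 * page_refs_expected(4, 2, true) holds; a count of 5 would mean an extra,
 * unknown user, and the scan bails out with SCAN_PAGE_COUNT.
 */
```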
@@ -1631,19 +1875,23 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
                        result = SCAN_EXCEED_NONE_PTE;
                } else {
                        node = khugepaged_find_target_node();
-                       collapse_shmem(mm, mapping, start, hpage, node);
+                       collapse_file(mm, file, start, hpage, node);
                }
        }
 
        /* TODO: tracepoints */
 }
 #else
-static void khugepaged_scan_shmem(struct mm_struct *mm,
-               struct address_space *mapping,
-               pgoff_t start, struct page **hpage)
+static void khugepaged_scan_file(struct mm_struct *mm,
+               struct file *file, pgoff_t start, struct page **hpage)
 {
        BUILD_BUG();
 }
+
+static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+{
+       return 0;
+}
 #endif
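The BUILD_BUG() in the khugepaged_scan_file() stub is safe because its only caller is guarded by IS_ENABLED(CONFIG_SHMEM): the constant condition folds away and the assertion is never instantiated. A userspace sketch of the same idiom using GCC/Clang's error attribute, the mechanism behind the kernel's __compiletime_error() (the names here are stand-ins; it relies on dead-code elimination, so build with optimization):

```c
#define MY_CONFIG_SHMEM 1	/* stand-in for IS_ENABLED(CONFIG_SHMEM) */

/* compile error if any call to this survives dead-code elimination */
extern void my_build_bug(void)
	__attribute__((error("stub compiled into a live path")));

static void scan_file_real(void)
{
	/* the real implementation would go here */
}

static void scan_file_stub(void)
{
	my_build_bug();
}

int main(void)
{
	if (MY_CONFIG_SHMEM)
		scan_file_real();
	else
		scan_file_stub();	/* folded away: never instantiated */
	return 0;
}
```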
 
 static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
@@ -1668,6 +1916,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
                khugepaged_scan.mm_slot = mm_slot;
        }
        spin_unlock(&khugepaged_mm_lock);
+       khugepaged_collapse_pte_mapped_thps(mm_slot);
 
        mm = mm_slot->mm;
        /*
@@ -1713,17 +1962,18 @@ skip:
                        VM_BUG_ON(khugepaged_scan.address < hstart ||
                                  khugepaged_scan.address + HPAGE_PMD_SIZE >
                                  hend);
-                       if (shmem_file(vma->vm_file)) {
+                       if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
                                struct file *file;
                                pgoff_t pgoff = linear_page_index(vma,
                                                khugepaged_scan.address);
-                               if (!shmem_huge_enabled(vma))
+
+                               if (shmem_file(vma->vm_file) &&
+                                   !shmem_huge_enabled(vma))
                                        goto skip;
                                file = get_file(vma->vm_file);
                                up_read(&mm->mmap_sem);
                                ret = 1;
-                               khugepaged_scan_shmem(mm, file->f_mapping,
-                                               pgoff, hpage);
+                               khugepaged_scan_file(mm, file, pgoff, hpage);
                                fput(file);
                        } else {
                                ret = khugepaged_scan_pmd(mm, vma,