Merge tag 'x86_mm_for_6.2_v2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
[linux-block.git] / mm / khugepaged.c
index 52237a777ebd5c3ce6f6bef0af9deb477b55b5ac..5cb401aa2b9d8b424462f4bded8511491d1c7045 100644 (file)
@@ -97,8 +97,8 @@ struct collapse_control {
        /* Num pages scanned per node */
        u32 node_load[MAX_NUMNODES];
 
-       /* Last target selected in hpage_collapse_find_target_node() */
-       int last_target_node;
+       /* nodemask for allocation fallback */
+       nodemask_t alloc_nmask;
 };
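For orientation, a minimal sketch (assuming only the standard <linux/nodemask.h> helpers) of the lifecycle the new field gets in the hunks below: it is cleared at the start of every scan, populated with candidate nodes while pages are scanned, and finally handed to the page allocator as a fallback mask.

/* Illustrative only: lifecycle of the per-scan fallback nodemask. */
static void alloc_nmask_lifecycle_sketch(struct collapse_control *cc)
{
        nodes_clear(cc->alloc_nmask);           /* reset at the start of each scan */
        node_set(0, cc->alloc_nmask);           /* most-loaded node                */
        node_set(1, cc->alloc_nmask);           /* a node with an equal hit count  */
        /* hpage_collapse_alloc_page() later passes &cc->alloc_nmask to the
         * page allocator so the allocation may fall back to any node in
         * the mask. */
}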
 
 /**
@@ -734,7 +734,6 @@ static void khugepaged_alloc_sleep(void)
 
 struct collapse_control khugepaged_collapse_control = {
        .is_khugepaged = true,
-       .last_target_node = NUMA_NO_NODE,
 };
 
 static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc)
@@ -783,16 +782,11 @@ static int hpage_collapse_find_target_node(struct collapse_control *cc)
                        target_node = nid;
                }
 
-       /* do some balance if several nodes have the same hit record */
-       if (target_node <= cc->last_target_node)
-               for (nid = cc->last_target_node + 1; nid < MAX_NUMNODES;
-                    nid++)
-                       if (max_value == cc->node_load[nid]) {
-                               target_node = nid;
-                               break;
-                       }
+       for_each_online_node(nid) {
+               if (max_value == cc->node_load[nid])
+                       node_set(nid, cc->alloc_nmask);
+       }
 
-       cc->last_target_node = target_node;
        return target_node;
 }
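Read the rewritten selection as: pick the most-loaded node as the preferred node and mark every node with the same hit count in the fallback mask, instead of round-robining through last_target_node. A condensed sketch of that logic (not the exact upstream body), assuming the node_load[] accounting shown above:

/* Condensed sketch of the selection logic. */
static int find_target_node_sketch(struct collapse_control *cc)
{
        int nid, target_node = 0, max_value = 0;

        /* find the node that contributed the most scanned pages */
        for_each_online_node(nid) {
                if (cc->node_load[nid] > max_value) {
                        max_value = cc->node_load[nid];
                        target_node = nid;
                }
        }

        /* every node tied with the maximum is an acceptable fallback */
        for_each_online_node(nid) {
                if (cc->node_load[nid] == max_value)
                        node_set(nid, cc->alloc_nmask);
        }

        return target_node;
}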
 #else
@@ -802,9 +796,10 @@ static int hpage_collapse_find_target_node(struct collapse_control *cc)
 }
 #endif
 
-static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node)
+static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node,
+                                     nodemask_t *nmask)
 {
-       *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
+       *hpage = __alloc_pages(gfp, HPAGE_PMD_ORDER, node, nmask);
        if (unlikely(!*hpage)) {
                count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
                return false;
@@ -955,12 +950,11 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
 static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm,
                              struct collapse_control *cc)
 {
-       /* Only allocate from the target node */
        gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
-                    GFP_TRANSHUGE) | __GFP_THISNODE;
+                    GFP_TRANSHUGE);
        int node = hpage_collapse_find_target_node(cc);
 
-       if (!hpage_collapse_alloc_page(hpage, gfp, node))
+       if (!hpage_collapse_alloc_page(hpage, gfp, node, &cc->alloc_nmask))
                return SCAN_ALLOC_HUGE_PAGE_FAIL;
        if (unlikely(mem_cgroup_charge(page_folio(*hpage), mm, gfp)))
                return SCAN_CGROUP_CHARGE_FAIL;
@@ -1057,6 +1051,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
        _pmd = pmdp_collapse_flush(vma, address, pmd);
        spin_unlock(pmd_ptl);
        mmu_notifier_invalidate_range_end(&range);
+       tlb_remove_table_sync_one();
 
        spin_lock(pte_ptl);
        result =  __collapse_huge_page_isolate(vma, address, pte, cc,
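The tlb_remove_table_sync_one() call added here closes a race with GUP-fast: a lockless walker may have loaded the old PMD just before pmdp_collapse_flush() and still be dereferencing the PTE table. A hedged sketch of the clear-then-synchronize pattern (an illustrative helper, not code from the tree):

/* Illustrative helper: the pattern used here and in collapse_and_free_pmd(). */
static pmd_t collapse_flush_and_sync_sketch(struct vm_area_struct *vma,
                                            unsigned long addr, pmd_t *pmdp)
{
        /* Clear the PMD and flush the TLB: no new walk can reach the old
         * PTE table through this entry afterwards. */
        pmd_t pmd = pmdp_collapse_flush(vma, addr, pmdp);

        /* Wait for an IPI round trip.  GUP-fast (lockless_pages_from_mm())
         * runs with IRQs disabled, so any walker that already loaded the old
         * PMD value has finished by the time this returns. */
        tlb_remove_table_sync_one();

        return pmd;
}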
@@ -1144,6 +1139,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
                goto out;
 
        memset(cc->node_load, 0, sizeof(cc->node_load));
+       nodes_clear(cc->alloc_nmask);
        pte = pte_offset_map_lock(mm, pmd, address, &ptl);
        for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
             _pte++, _address += PAGE_SIZE) {
@@ -1242,15 +1238,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
                /*
                 * Check if the page has any GUP (or other external) pins.
                 *
-                * Here the check is racy it may see total_mapcount > refcount
-                * in some cases.
-                * For example, one process with one forked child process.
-                * The parent has the PMD split due to MADV_DONTNEED, then
-                * the child is trying unmap the whole PMD, but khugepaged
-                * may be scanning the parent between the child has
-                * PageDoubleMap flag cleared and dec the mapcount.  So
-                * khugepaged may see total_mapcount > refcount.
-                *
+                * Here the check may be racy: it may see
+                * total_mapcount > refcount in some cases.
                 * But such case is ephemeral we could always retry collapse
                 * later.  However it may report false positive if the page
                 * has excessive GUP pins (i.e. 512).  Anyway the same check
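For reference, the pin check this comment hedges about boils down to comparing the page's refcount against the references that can be accounted for; a sketch in the spirit of the is_refcount_suitable() helper used elsewhere in this file (reproduced from memory, details may differ):

/* Sketch: a page whose refcount is fully explained by its mappings (plus
 * swapcache) has no GUP or other external pins.  The comparison is racy,
 * which is what the comment above is pointing out. */
static bool page_has_no_extra_pins_sketch(struct page *page)
{
        int expected_refcount = total_mapcount(page);

        if (PageSwapCache(page))
                expected_refcount += compound_nr(page);

        return page_count(page) == expected_refcount;
}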
@@ -1384,16 +1373,43 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
        return SCAN_SUCCEED;
 }
 
+/*
+ * A note about locking:
+ * Trying to take the page table spinlocks would be useless here because those
+ * are only used to synchronize:
+ *
+ *  - modifying terminal entries (ones that point to a data page, not to another
+ *    page table)
+ *  - installing *new* non-terminal entries
+ *
+ * Instead, we need roughly the same kind of protection as free_pgtables() or
+ * mm_take_all_locks() (but only for a single VMA):
+ * The mmap lock together with this VMA's rmap locks covers all paths towards
+ * the page table entries we're messing with here, except for hardware page
+ * table walks and lockless_pages_from_mm().
+ */
 static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
                                  unsigned long addr, pmd_t *pmdp)
 {
-       spinlock_t *ptl;
        pmd_t pmd;
+       struct mmu_notifier_range range;
 
        mmap_assert_write_locked(mm);
-       ptl = pmd_lock(vma->vm_mm, pmdp);
+       if (vma->vm_file)
+               lockdep_assert_held_write(&vma->vm_file->f_mapping->i_mmap_rwsem);
+       /*
+        * All anon_vmas attached to the VMA have the same root and are
+        * therefore locked by the same lock.
+        */
+       if (vma->anon_vma)
+               lockdep_assert_held_write(&vma->anon_vma->root->rwsem);
+
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, addr,
+                               addr + HPAGE_PMD_SIZE);
+       mmu_notifier_invalidate_range_start(&range);
        pmd = pmdp_collapse_flush(vma, addr, pmdp);
-       spin_unlock(ptl);
+       tlb_remove_table_sync_one();
+       mmu_notifier_invalidate_range_end(&range);
        mm_dec_nr_ptes(mm);
        page_table_check_pte_clear_range(mm, addr, pmd);
        pte_free(mm, pmd_pgtable(pmd));
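A hedged caller-side sketch (hypothetical wrapper, mirroring what collapse_pte_mapped_thp() does further down): the function above may only be reached with the mmap lock held for write and the relevant rmap lock held for write, leaving GUP-fast and hardware walks as the only concurrent accessors.

/* Hypothetical wrapper showing the locking the assertions above expect
 * for a file-backed VMA with no anon_vma. */
static void retract_one_pmd_sketch(struct mm_struct *mm,
                                   struct vm_area_struct *vma,
                                   unsigned long haddr, pmd_t *pmd)
{
        mmap_assert_write_locked(mm);

        i_mmap_lock_write(vma->vm_file->f_mapping);     /* block file rmap walks */
        collapse_and_free_pmd(mm, vma, haddr, pmd);
        i_mmap_unlock_write(vma->vm_file->f_mapping);
}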
@@ -1444,6 +1460,14 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
        if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
                return SCAN_VMA_CHECK;
 
+       /*
+        * Symmetry with retract_page_tables(): Exclude MAP_PRIVATE mappings
+        * that got written to. Without this, we'd have to also lock the
+        * anon_vma if one exists.
+        */
+       if (vma->anon_vma)
+               return SCAN_VMA_CHECK;
+
        /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
        if (userfaultfd_wp(vma))
                return SCAN_PTE_UFFD_WP;
@@ -1477,6 +1501,20 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
                goto drop_hpage;
        }
 
+       /*
+        * We need to lock the mapping so that from here on, only GUP-fast and
+        * hardware page walks can access the parts of the page tables that
+        * we're operating on.
+        * See collapse_and_free_pmd().
+        */
+       i_mmap_lock_write(vma->vm_file->f_mapping);
+
+       /*
+        * This spinlock should be unnecessary: nobody else should be accessing
+        * the page tables under spinlock protection here; only
+        * lockless_pages_from_mm() and the hardware page walker can access page
+        * tables while all the high-level locks are held in write mode.
+        */
        start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
        result = SCAN_FAIL;
 
@@ -1531,6 +1569,8 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
        /* step 4: remove pte entries */
        collapse_and_free_pmd(mm, vma, haddr, pmd);
 
+       i_mmap_unlock_write(vma->vm_file->f_mapping);
+
 maybe_install_pmd:
        /* step 5: install pmd entry */
        result = install_pmd
@@ -1544,6 +1584,7 @@ drop_hpage:
 
 abort:
        pte_unmap_unlock(start_pte, ptl);
+       i_mmap_unlock_write(vma->vm_file->f_mapping);
        goto drop_hpage;
 }
 
@@ -1600,7 +1641,8 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
                 * An alternative would be drop the check, but check that page
                 * table is clear before calling pmdp_collapse_flush() under
                 * ptl. It has higher chance to recover THP for the VMA, but
-                * has higher cost too.
+                * has higher cost too. It would also probably require locking
+                * the anon_vma.
                 */
                if (vma->anon_vma) {
                        result = SCAN_PAGE_ANON;
@@ -1702,12 +1744,12 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
 {
        struct address_space *mapping = file->f_mapping;
        struct page *hpage;
-       pgoff_t index, end = start + HPAGE_PMD_NR;
+       pgoff_t index = 0, end = start + HPAGE_PMD_NR;
        LIST_HEAD(pagelist);
        XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
        int nr_none = 0, result = SCAN_SUCCEED;
        bool is_shmem = shmem_file(file);
-       int nr;
+       int nr = 0;
 
        VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
        VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
@@ -1747,6 +1789,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
        xas_set(&xas, start);
        for (index = start; index < end; index++) {
                struct page *page = xas_next(&xas);
+               struct folio *folio;
 
                VM_BUG_ON(index != xas.xa_index);
                if (is_shmem) {
@@ -1773,8 +1816,6 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
                        }
 
                        if (xa_is_value(page) || !PageUptodate(page)) {
-                               struct folio *folio;
-
                                xas_unlock_irq(&xas);
                                /* swap in or instantiate fallocated page */
                                if (shmem_get_folio(mapping->host, index,
@@ -1862,13 +1903,15 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
                        goto out_unlock;
                }
 
-               if (page_mapping(page) != mapping) {
+               folio = page_folio(page);
+
+               if (folio_mapping(folio) != mapping) {
                        result = SCAN_TRUNCATED;
                        goto out_unlock;
                }
 
-               if (!is_shmem && (PageDirty(page) ||
-                                 PageWriteback(page))) {
+               if (!is_shmem && (folio_test_dirty(folio) ||
+                                 folio_test_writeback(folio))) {
                        /*
                         * khugepaged only works on read-only fd, so this
                         * page is dirty because it hasn't been flushed
@@ -1878,20 +1921,20 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
                        goto out_unlock;
                }
 
-               if (isolate_lru_page(page)) {
+               if (folio_isolate_lru(folio)) {
                        result = SCAN_DEL_PAGE_LRU;
                        goto out_unlock;
                }
 
-               if (page_has_private(page) &&
-                   !try_to_release_page(page, GFP_KERNEL)) {
+               if (folio_has_private(folio) &&
+                   !filemap_release_folio(folio, GFP_KERNEL)) {
                        result = SCAN_PAGE_HAS_PRIVATE;
-                       putback_lru_page(page);
+                       folio_putback_lru(folio);
                        goto out_unlock;
                }
 
-               if (page_mapped(page))
-                       try_to_unmap(page_folio(page),
+               if (folio_mapped(folio))
+                       try_to_unmap(folio,
                                        TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH);
 
                xas_lock_irq(&xas);
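As a reading aid for the folio conversion in this hunk, a minimal sketch of the page-to-folio equivalences being used (illustrative only; a folio always refers to the head page, so no compound_head() calls are needed):

        struct folio *folio = page_folio(page);                  /* head-page view         */
        struct address_space *m = folio_mapping(folio);          /* page_mapping(page)     */
        bool dirty = folio_test_dirty(folio);                    /* PageDirty(page)        */
        bool wb = folio_test_writeback(folio);                   /* PageWriteback(page)    */
        bool priv = folio_has_private(folio);                    /* page_has_private(page) */
        bool freed = filemap_release_folio(folio, GFP_KERNEL);   /* try_to_release_page()  */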
@@ -1970,6 +2013,7 @@ xa_unlocked:
 
        if (result == SCAN_SUCCEED) {
                struct page *page, *tmp;
+               struct folio *folio;
 
                /*
                 * Replacing old pages with new one has succeeded, now we
@@ -1997,11 +2041,13 @@ xa_unlocked:
                        index++;
                }
 
-               SetPageUptodate(hpage);
-               page_ref_add(hpage, HPAGE_PMD_NR - 1);
+               folio = page_folio(hpage);
+               folio_mark_uptodate(folio);
+               folio_ref_add(folio, HPAGE_PMD_NR - 1);
+
                if (is_shmem)
-                       set_page_dirty(hpage);
-               lru_cache_add(hpage);
+                       folio_mark_dirty(folio);
+               folio_add_lru(folio);
 
                /*
                 * Remove pte page tables, so we can re-fault the page as huge.
@@ -2059,7 +2105,8 @@ out:
                mem_cgroup_uncharge(page_folio(hpage));
                put_page(hpage);
        }
-       /* TODO: tracepoints */
+
+       trace_mm_khugepaged_collapse_file(mm, hpage, index, is_shmem, addr, file, nr, result);
        return result;
 }
 
@@ -2077,6 +2124,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
        present = 0;
        swap = 0;
        memset(cc->node_load, 0, sizeof(cc->node_load));
+       nodes_clear(cc->alloc_nmask);
        rcu_read_lock();
        xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
                if (xas_retry(&xas, page))
@@ -2157,8 +2205,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
                }
        }
 
-       trace_mm_khugepaged_scan_file(mm, page, file->f_path.dentry->d_iname,
-                                     present, swap, result);
+       trace_mm_khugepaged_scan_file(mm, page, file, present, swap, result);
        return result;
 }
 #else
@@ -2528,6 +2575,11 @@ void khugepaged_min_free_kbytes_update(void)
        mutex_unlock(&khugepaged_mutex);
 }
 
+bool current_is_khugepaged(void)
+{
+       return kthread_func(current) == khugepaged;
+}
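A short note on the new helper: kthread_func() reports the entry point a kernel thread was created with, so comparing it against khugepaged() distinguishes the daemon from a task reaching the same collapse code via MADV_COLLAPSE. The call site below is purely hypothetical:

        /* Hypothetical call site: branch on which context is running the
         * shared collapse paths. */
        if (current_is_khugepaged())
                pr_debug("collapse driven by the khugepaged kthread\n");
        else
                pr_debug("collapse driven from task context (MADV_COLLAPSE)\n");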
+
 static int madvise_collapse_errno(enum scan_result r)
 {
        /*
@@ -2576,7 +2628,6 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
        if (!cc)
                return -ENOMEM;
        cc->is_khugepaged = false;
-       cc->last_target_node = NUMA_NO_NODE;
 
        mmgrab(mm);
        lru_add_drain_all();
@@ -2602,6 +2653,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
                }
                mmap_assert_locked(mm);
                memset(cc->node_load, 0, sizeof(cc->node_load));
+               nodes_clear(cc->alloc_nmask);
                if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
                        struct file *file = get_file(vma->vm_file);
                        pgoff_t pgoff = linear_page_index(vma, addr);