mm: remember exclusively mapped anonymous pages with PG_anon_exclusive
diff --git a/mm/rmap.c b/mm/rmap.c
index 90f92c53476f9caf9b9c68101440c6926d96c7d0..0d63e7ce35cc2dc9757e4873bfab24a13072a518 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1088,6 +1088,7 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
 void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
 {
        struct anon_vma *anon_vma = vma->anon_vma;
+       struct page *subpage = page;
 
        page = compound_head(page);
 
@@ -1101,6 +1102,7 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
         * folio_test_anon()) will not see one without the other.
         */
        WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
+       SetPageAnonExclusive(subpage);
 }
 
 /**
@@ -1118,7 +1120,7 @@ static void __page_set_anon_rmap(struct page *page,
        BUG_ON(!anon_vma);
 
        if (PageAnon(page))
-               return;
+               goto out;
 
        /*
         * If the page isn't exclusively mapped into this vma,
@@ -1137,6 +1139,9 @@ static void __page_set_anon_rmap(struct page *page,
        anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
        WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
        page->index = linear_page_index(vma, address);
+out:
+       if (exclusive)
+               SetPageAnonExclusive(page);
 }
 
 /**
@@ -1198,6 +1203,8 @@ void page_add_anon_rmap(struct page *page,
        } else {
                first = atomic_inc_and_test(&page->_mapcount);
        }
+       VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page);
+       VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page);
 
        if (first) {
                int nr = compound ? thp_nr_pages(page) : 1;
@@ -1459,7 +1466,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
        pte_t pteval;
        struct page *subpage;
-       bool ret = true;
+       bool anon_exclusive, ret = true;
        struct mmu_notifier_range range;
        enum ttu_flags flags = (enum ttu_flags)(long)arg;
 
@@ -1515,6 +1522,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
                subpage = folio_page(folio,
                                        pte_pfn(*pvmw.pte) - folio_pfn(folio));
                address = pvmw.address;
+               anon_exclusive = folio_test_anon(folio) &&
+                                PageAnonExclusive(subpage);
 
                if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
                        /*
@@ -1550,9 +1559,12 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
                        }
                }
 
-               /* Nuke the page table entry. */
+               /*
+                * Nuke the page table entry. When having to clear
+                * PageAnonExclusive(), we always have to flush.
+                */
                flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
-               if (should_defer_flush(mm, flags)) {
+               if (should_defer_flush(mm, flags) && !anon_exclusive) {
                        /*
                         * We clear the PTE but do not flush so potentially
                         * a remote CPU could still be writing to the folio.
@@ -1677,6 +1689,24 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }
+                       if (anon_exclusive &&
+                           page_try_share_anon_rmap(subpage)) {
+                               swap_free(entry);
+                               set_pte_at(mm, address, pvmw.pte, pteval);
+                               ret = false;
+                               page_vma_mapped_walk_done(&pvmw);
+                               break;
+                       }
+                       /*
+                        * Note: We *don't* remember yet if the page was mapped
+                        * exclusively in the swap entry, so swapin code has
+                        * to re-determine that manually and might detect the
+                        * page as possibly shared, for example, if there are
+                        * other references on the page or if the page is under
+                        * writeback. We made sure that there are no GUP pins
+                        * on the page that would rely on it, so for GUP pins
+                        * this is fine.
+                        */
                        if (list_empty(&mm->mmlist)) {
                                spin_lock(&mmlist_lock);
                                if (list_empty(&mm->mmlist))
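The abort path above relies on the contract of page_try_share_anon_rmap(): it fails when the page may still be pinned via FOLL_PIN (so the swap entry is given back, the PTE restored and the page left mapped), and it clears PG_anon_exclusive otherwise. The helper itself lives in include/linux/mm.h and is not part of this hunk; as a rough sketch of that contract (not the exact upstream definition), it can be thought of as:

	/*
	 * Sketch only: refuse to drop exclusivity if the page may be DMA-pinned,
	 * because pins taken with FOLL_PIN rely on PG_anon_exclusive staying
	 * set; otherwise clear the flag so the page may become shared.
	 */
	static inline int page_try_share_anon_rmap(struct page *page)
	{
		VM_BUG_ON_PAGE(!PageAnon(page) || !PageAnonExclusive(page), page);

		if (unlikely(page_maybe_dma_pinned(page)))
			return -EBUSY;

		ClearPageAnonExclusive(page);
		return 0;
	}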
@@ -1776,7 +1806,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
        pte_t pteval;
        struct page *subpage;
-       bool ret = true;
+       bool anon_exclusive, ret = true;
        struct mmu_notifier_range range;
        enum ttu_flags flags = (enum ttu_flags)(long)arg;
 
@@ -1837,6 +1867,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                subpage = folio_page(folio,
                                pte_pfn(*pvmw.pte) - folio_pfn(folio));
                address = pvmw.address;
+               anon_exclusive = folio_test_anon(folio) &&
+                                PageAnonExclusive(subpage);
 
                if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
                        /*
@@ -1888,6 +1920,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                        swp_entry_t entry;
                        pte_t swp_pte;
 
+                       if (anon_exclusive)
+                               BUG_ON(page_try_share_anon_rmap(subpage));
+
                        /*
                         * Store the pfn of the page in a special migration
                         * pte. do_swap_page() will wait until the migration
@@ -1896,6 +1931,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                        entry = pte_to_swp_entry(pteval);
                        if (is_writable_device_private_entry(entry))
                                entry = make_writable_migration_entry(pfn);
+                       else if (anon_exclusive)
+                               entry = make_readable_exclusive_migration_entry(pfn);
                        else
                                entry = make_readable_migration_entry(pfn);
                        swp_pte = swp_entry_to_pte(entry);
@@ -1960,6 +1997,15 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }
+                       VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
+                                      !anon_exclusive, subpage);
+                       if (anon_exclusive &&
+                           page_try_share_anon_rmap(subpage)) {
+                               set_pte_at(mm, address, pvmw.pte, pteval);
+                               ret = false;
+                               page_vma_mapped_walk_done(&pvmw);
+                               break;
+                       }
 
                        /*
                         * Store the pfn of the page in a special migration
@@ -1969,6 +2015,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                        if (pte_write(pteval))
                                entry = make_writable_migration_entry(
                                                        page_to_pfn(subpage));
+                       else if (anon_exclusive)
+                               entry = make_readable_exclusive_migration_entry(
+                                                       page_to_pfn(subpage));
                        else
                                entry = make_readable_migration_entry(
                                                        page_to_pfn(subpage));
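Only the producer side of the new readable-exclusive migration entries is visible in this file; the consumer sits in mm/migrate.c when the migration entry is turned back into a proper mapping. Presumably that side maps the entry type back onto RMAP_EXCLUSIVE so the re-mapped page is marked exclusive again; the following is a sketch under that assumption (not the exact remove_migration_pte() code, with "new", "entry" and "pvmw" standing in for the local variables there, and assuming an is_readable_migration_entry() check for the plain-readable type):

	rmap_t rmap_flags = RMAP_NONE;

	/*
	 * Writable and readable-exclusive migration entries both imply the
	 * page was exclusive when it got unmapped, so exclusivity can be
	 * re-established when mapping it again; plain readable entries cannot.
	 */
	if (folio_test_anon(folio) && !is_readable_migration_entry(entry))
		rmap_flags |= RMAP_EXCLUSIVE;

	if (folio_test_anon(folio))
		page_add_anon_rmap(new, vma, pvmw.address, rmap_flags);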
@@ -2405,6 +2454,8 @@ void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma,
        BUG_ON(!anon_vma);
        /* address might be in next vma when migration races vma_adjust */
        first = atomic_inc_and_test(compound_mapcount_ptr(page));
+       VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page);
+       VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page);
        if (first)
                __page_set_anon_rmap(page, vma, address,
                                     !!(flags & RMAP_EXCLUSIVE));
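The two new assertions here, like the matching pair in page_add_anon_rmap() above, encode the rule that RMAP_EXCLUSIVE, and therefore PG_anon_exclusive, may only be handed in together with the first mapping of a page. The one other way a page becomes exclusive again is the page_move_anon_rmap() change at the top of this patch, used when a write fault re-owns an anonymous page for a single VMA. As a sketch of such a COW-reuse caller (do_wp_page()-like; the reuse check shown is simplified, not the exact mm/memory.c logic):

	/*
	 * Sketch of a COW-reuse path: with the page locked and known to have
	 * no other references, the fault handler can re-own it for this VMA.
	 * page_move_anon_rmap() now also sets PG_anon_exclusive, so GUP,
	 * unmap and migration code treat the page as exclusive again.
	 */
	if (!folio_test_ksm(folio) && folio_ref_count(folio) == 1) {
		page_move_anon_rmap(page, vma);
		/* ... proceed to map the PTE writable (wp_page_reuse()) ... */
	}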