Merge tag 'pci-v5.19-fixes-1' of git://git.kernel.org/pub/scm/linux/kernel/git/helgaa...
index fedb82371efed2de7aadd5066e5fcbc4600a494b..5bcb334cd6f216f037dcf52771f8dedfb16b7e8e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -73,6 +73,7 @@
 #include <linux/page_idle.h>
 #include <linux/memremap.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/mm_inline.h>
 
 #include <asm/tlbflush.h>
 
@@ -298,7 +299,7 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
                 * Reuse existing anon_vma if its degree is lower than two,
                 * which means it has no vma and only one anon_vma child.
                 *
-                * Do not chose parent anon_vma, otherwise first child
+                * Do not choose parent anon_vma, otherwise first child
                 * will always reuse it. Root anon_vma is never reused:
                 * it has self-parent reference and at least one child.
                 */
@@ -526,9 +527,11 @@ out:
  *
  * It's a little more complex, as it tries to keep the fast path to a single
  * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
- * reference like with page_get_anon_vma() and then block on the mutex.
+ * reference like with page_get_anon_vma() and then block on the mutex
+ * in the !rwc->try_lock case.
  */
-struct anon_vma *folio_lock_anon_vma_read(struct folio *folio)
+struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
+                                         struct rmap_walk_control *rwc)
 {
        struct anon_vma *anon_vma = NULL;
        struct anon_vma *root_anon_vma;
@@ -556,6 +559,12 @@ struct anon_vma *folio_lock_anon_vma_read(struct folio *folio)
                goto out;
        }
 
+       if (rwc && rwc->try_lock) {
+               anon_vma = NULL;
+               rwc->contended = true;
+               goto out;
+       }
+
        /* trylock failed, we got to sleep */
        if (!atomic_inc_not_zero(&anon_vma->refcount)) {
                anon_vma = NULL;
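
For illustration, a minimal sketch of a caller using the new try_lock
contract. The walker function and its bail-out policy are hypothetical;
folio_lock_anon_vma_read(), struct rmap_walk_control and
anon_vma_unlock_read() are the kernel interfaces touched above.

	#include <linux/rmap.h>

	/* Hypothetical non-blocking walker: bail out instead of sleeping. */
	static void walk_anon_nonblocking(struct folio *folio)
	{
		struct rmap_walk_control rwc = {
			.try_lock = true,
		};
		struct anon_vma *anon_vma;

		anon_vma = folio_lock_anon_vma_read(folio, &rwc);
		if (!anon_vma) {
			if (rwc.contended)
				return;	/* lock contended; retry later */
			return;		/* folio is not anon mapped (anymore) */
		}

		/* ... inspect mappings under the anon_vma read lock ... */
		anon_vma_unlock_read(anon_vma);
	}
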
@@ -882,7 +891,8 @@ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
  *
  * Quick test_and_clear_referenced for all mappings of a folio,
  *
- * Return: The number of mappings which referenced the folio.
+ * Return: The number of mappings which referenced the folio. Return -1 if
+ * the function bailed out due to rmap lock contention.
  */
 int folio_referenced(struct folio *folio, int is_locked,
                     struct mem_cgroup *memcg, unsigned long *vm_flags)
@@ -896,6 +906,7 @@ int folio_referenced(struct folio *folio, int is_locked,
                .rmap_one = folio_referenced_one,
                .arg = (void *)&pra,
                .anon_lock = folio_lock_anon_vma_read,
+               .try_lock = true,
        };
 
        *vm_flags = 0;
@@ -926,15 +937,15 @@ int folio_referenced(struct folio *folio, int is_locked,
        if (we_locked)
                folio_unlock(folio);
 
-       return pra.referenced;
+       return rwc.contended ? -1 : pra.referenced;
 }
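
With the -1 return, callers must distinguish "lock contended" from "not
referenced". A hedged sketch of a reclaim-style check; the helper and its
keep-the-folio policy are illustrative, only folio_referenced() is from
this patch.

	static bool folio_was_referenced(struct folio *folio,
					 struct mem_cgroup *memcg)
	{
		unsigned long vm_flags;
		int referenced = folio_referenced(folio, 0, memcg, &vm_flags);

		if (referenced == -1)
			return true;	/* contended: assume referenced */
		return referenced != 0;
	}
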
 
-static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
-                           unsigned long address, void *arg)
+static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
 {
-       DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
+       int cleaned = 0;
+       struct vm_area_struct *vma = pvmw->vma;
        struct mmu_notifier_range range;
-       int *cleaned = arg;
+       unsigned long address = pvmw->address;
 
        /*
         * We have to assume the worst case, i.e. pmd, for invalidation. Note that
@@ -942,16 +953,16 @@ static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
         */
        mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
                                0, vma, vma->vm_mm, address,
-                               vma_address_end(&pvmw));
+                               vma_address_end(pvmw));
        mmu_notifier_invalidate_range_start(&range);
 
-       while (page_vma_mapped_walk(&pvmw)) {
+       while (page_vma_mapped_walk(pvmw)) {
                int ret = 0;
 
-               address = pvmw.address;
-               if (pvmw.pte) {
+               address = pvmw->address;
+               if (pvmw->pte) {
                        pte_t entry;
-                       pte_t *pte = pvmw.pte;
+                       pte_t *pte = pvmw->pte;
 
                        if (!pte_dirty(*pte) && !pte_write(*pte))
                                continue;
@@ -964,13 +975,14 @@ static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
                        ret = 1;
                } else {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-                       pmd_t *pmd = pvmw.pmd;
+                       pmd_t *pmd = pvmw->pmd;
                        pmd_t entry;
 
                        if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
                                continue;
 
-                       flush_cache_page(vma, address, folio_pfn(folio));
+                       flush_cache_range(vma, address,
+                                         address + HPAGE_PMD_SIZE);
                        entry = pmdp_invalidate(vma, address, pmd);
                        entry = pmd_wrprotect(entry);
                        entry = pmd_mkclean(entry);
@@ -990,11 +1002,22 @@ static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
                 * See Documentation/vm/mmu_notifier.rst
                 */
                if (ret)
-                       (*cleaned)++;
+                       cleaned++;
        }
 
        mmu_notifier_invalidate_range_end(&range);
 
+       return cleaned;
+}
+
+static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
+                            unsigned long address, void *arg)
+{
+       DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
+       int *cleaned = arg;
+
+       *cleaned += page_vma_mkclean_one(&pvmw);
+
        return true;
 }
 
@@ -1031,6 +1054,38 @@ int folio_mkclean(struct folio *folio)
 }
 EXPORT_SYMBOL_GPL(folio_mkclean);
 
+/**
+ * pfn_mkclean_range - Clean the PTEs (including PMDs) that map the range
+ *                     [@pfn, @pfn + @nr_pages) at page offset @pgoff within
+ *                     @vma, for shared mappings. Since clean PTEs should
+ *                     also be read-only, write protect them too.
+ * @pfn: start pfn.
+ * @nr_pages: number of physically contiguous pages starting with @pfn.
+ * @pgoff: page offset at which @pfn is mapped within @vma.
+ * @vma: vma that @pfn is mapped within.
+ *
+ * Returns the number of cleaned PTEs (including PMDs).
+ */
+int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
+                     struct vm_area_struct *vma)
+{
+       struct page_vma_mapped_walk pvmw = {
+               .pfn            = pfn,
+               .nr_pages       = nr_pages,
+               .pgoff          = pgoff,
+               .vma            = vma,
+               .flags          = PVMW_SYNC,
+       };
+
+       if (invalid_mkclean_vma(vma, NULL))
+               return 0;
+
+       pvmw.address = vma_pgoff_address(pgoff, nr_pages, vma);
+       VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma);
+
+       return page_vma_mkclean_one(&pvmw);
+}
+
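
A hedged usage sketch for pfn_mkclean_range(): cleaning every shared
mapping of a physically contiguous file range, in the style of an fsdax
writeback path. The helper name and calling context are illustrative;
the i_mmap locking and interval-tree iteration follow the usual rmap
conventions.

	static void clean_shared_mappings(struct address_space *mapping,
					  unsigned long pfn,
					  unsigned long nr_pages, pgoff_t pgoff)
	{
		struct vm_area_struct *vma;

		i_mmap_lock_read(mapping);
		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
					  pgoff + nr_pages - 1) {
			/* invalid_mkclean_vma() skips non-shared vmas */
			pfn_mkclean_range(pfn, nr_pages, pgoff, vma);
		}
		i_mmap_unlock_read(mapping);
	}
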
 /**
  * page_move_anon_rmap - move a page to our anon_vma
  * @page:      the page to move to our anon_vma
@@ -1044,6 +1099,7 @@ EXPORT_SYMBOL_GPL(folio_mkclean);
 void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
 {
        struct anon_vma *anon_vma = vma->anon_vma;
+       struct page *subpage = page;
 
        page = compound_head(page);
 
@@ -1057,6 +1113,7 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
         * folio_test_anon()) will not see one without the other.
         */
        WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
+       SetPageAnonExclusive(subpage);
 }
 
 /**
@@ -1074,7 +1131,7 @@ static void __page_set_anon_rmap(struct page *page,
        BUG_ON(!anon_vma);
 
        if (PageAnon(page))
-               return;
+               goto out;
 
        /*
         * If the page isn't exclusively mapped into this vma,
@@ -1093,6 +1150,9 @@ static void __page_set_anon_rmap(struct page *page,
        anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
        WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
        page->index = linear_page_index(vma, address);
+out:
+       if (exclusive)
+               SetPageAnonExclusive(page);
 }
 
 /**
@@ -1127,7 +1187,7 @@ static void __page_check_anon_rmap(struct page *page,
  * @page:      the page to add the mapping to
  * @vma:       the vm area in which the mapping is added
  * @address:   the user virtual address mapped
- * @compound:  charge the page as compound or small page
+ * @flags:     the rmap flags
  *
  * The caller needs to hold the pte lock, and the page must be locked in
  * the anon_vma case: to serialize mapping,index checking after setting,
@@ -1135,18 +1195,7 @@ static void __page_check_anon_rmap(struct page *page,
  * (but PageKsm is never downgraded to PageAnon).
  */
 void page_add_anon_rmap(struct page *page,
-       struct vm_area_struct *vma, unsigned long address, bool compound)
-{
-       do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
-}
-
-/*
- * Special version of the above for do_swap_page, which often runs
- * into pages that are exclusively owned by the current process.
- * Everybody else should continue to use page_add_anon_rmap above.
- */
-void do_page_add_anon_rmap(struct page *page,
-       struct vm_area_struct *vma, unsigned long address, int flags)
+       struct vm_area_struct *vma, unsigned long address, rmap_t flags)
 {
        bool compound = flags & RMAP_COMPOUND;
        bool first;
@@ -1165,6 +1214,8 @@ void do_page_add_anon_rmap(struct page *page,
        } else {
                first = atomic_inc_and_test(&page->_mapcount);
        }
+       VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page);
+       VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page);
 
        if (first) {
                int nr = compound ? thp_nr_pages(page) : 1;
@@ -1185,7 +1236,7 @@ void do_page_add_anon_rmap(struct page *page,
        /* address might be in next vma when migration races vma_adjust */
        else if (first)
                __page_set_anon_rmap(page, vma, address,
-                               flags & RMAP_EXCLUSIVE);
+                                    !!(flags & RMAP_EXCLUSIVE));
        else
                __page_check_anon_rmap(page, vma, address);
 
@@ -1193,19 +1244,22 @@ void do_page_add_anon_rmap(struct page *page,
 }
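
Former do_page_add_anon_rmap() callers now pass rmap_t flags straight to
page_add_anon_rmap(). Two simplified call-site fragments (the local
variable names are illustrative):

	/* do_swap_page()-style: small page owned exclusively by this mm */
	page_add_anon_rmap(page, vma, vmf->address, RMAP_EXCLUSIVE);

	/* huge_memory-style: PMD-mapped THP that may be shared */
	page_add_anon_rmap(page, vma, haddr, RMAP_COMPOUND);
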
 
 /**
- * page_add_new_anon_rmap - add pte mapping to a new anonymous page
+ * page_add_new_anon_rmap - add mapping to a new anonymous page
  * @page:      the page to add the mapping to
  * @vma:       the vm area in which the mapping is added
  * @address:   the user virtual address mapped
- * @compound:  charge the page as compound or small page
+ *
+ * If it's a compound page, it is accounted as a compound page. As the page
+ * is new, it is assumed to be mapped exclusively by a single process.
  *
  * Same as page_add_anon_rmap but must only be called on *new* pages.
  * This means the inc-and-test can be bypassed.
  * Page does not have to be locked.
  */
 void page_add_new_anon_rmap(struct page *page,
-       struct vm_area_struct *vma, unsigned long address, bool compound)
+       struct vm_area_struct *vma, unsigned long address)
 {
+       const bool compound = PageCompound(page);
        int nr = compound ? thp_nr_pages(page) : 1;
 
        VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
@@ -1218,8 +1272,6 @@ void page_add_new_anon_rmap(struct page *page,
 
                __mod_lruvec_page_state(page, NR_ANON_THPS, nr);
        } else {
-               /* Anon THP always mapped first with PMD */
-               VM_BUG_ON_PAGE(PageTransCompound(page), page);
                /* increment count (starts at -1) */
                atomic_set(&page->_mapcount, 0);
        }
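
Since compound-ness is now derived from the page itself, a fault-path
caller simply drops the old boolean. A sketch following the usual
do_anonymous_page() sequence:

	/* new anonymous page from a fault; no 'compound' argument anymore */
	__SetPageUptodate(page);
	page_add_new_anon_rmap(page, vma, vmf->address);
	lru_cache_add_inactive_or_unevictable(page, vma);
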
@@ -1425,7 +1477,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
        pte_t pteval;
        struct page *subpage;
-       bool ret = true;
+       bool anon_exclusive, ret = true;
        struct mmu_notifier_range range;
        enum ttu_flags flags = (enum ttu_flags)(long)arg;
 
@@ -1481,59 +1533,81 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
                subpage = folio_page(folio,
                                        pte_pfn(*pvmw.pte) - folio_pfn(folio));
                address = pvmw.address;
+               anon_exclusive = folio_test_anon(folio) &&
+                                PageAnonExclusive(subpage);
 
-               if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
+               if (folio_test_hugetlb(folio)) {
                        /*
-                        * To call huge_pmd_unshare, i_mmap_rwsem must be
-                        * held in write mode.  Caller needs to explicitly
-                        * do this outside rmap routines.
+                        * try_to_unmap() is only passed a hugetlb page
+                        * when the hugetlb page is poisoned.
                         */
-                       VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
-                       if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
-                               /*
-                                * huge_pmd_unshare unmapped an entire PMD
-                                * page.  There is no way of knowing exactly
-                                * which PMDs may be cached for this mm, so
-                                * we must flush them all.  start/end were
-                                * already adjusted above to cover this range.
-                                */
-                               flush_cache_range(vma, range.start, range.end);
-                               flush_tlb_range(vma, range.start, range.end);
-                               mmu_notifier_invalidate_range(mm, range.start,
-                                                             range.end);
+                       VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage);
+                       /*
+                        * huge_pmd_unshare may unmap an entire PMD page.
+                        * There is no way of knowing exactly which PMDs may
+                        * be cached for this mm, so we must flush them all.
+                        * start/end were already adjusted above to cover this
+                        * range.
+                        */
+                       flush_cache_range(vma, range.start, range.end);
 
+                       if (!folio_test_anon(folio)) {
                                /*
-                                * The ref count of the PMD page was dropped
-                                * which is part of the way map counting
-                                * is done for shared PMDs.  Return 'true'
-                                * here.  When there is no other sharing,
-                                * huge_pmd_unshare returns false and we will
-                                * unmap the actual page and drop map count
-                                * to zero.
+                                * To call huge_pmd_unshare, i_mmap_rwsem must be
+                                * held in write mode.  Caller needs to explicitly
+                                * do this outside rmap routines.
                                 */
-                               page_vma_mapped_walk_done(&pvmw);
-                               break;
+                               VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+
+                               if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
+                                       flush_tlb_range(vma, range.start, range.end);
+                                       mmu_notifier_invalidate_range(mm, range.start,
+                                                                     range.end);
+
+                                       /*
+                                        * The ref count of the PMD page was dropped
+                                        * which is part of the way map counting
+                                        * is done for shared PMDs.  Return 'true'
+                                        * here.  When there is no other sharing,
+                                        * huge_pmd_unshare returns false and we will
+                                        * unmap the actual page and drop map count
+                                        * to zero.
+                                        */
+                                       page_vma_mapped_walk_done(&pvmw);
+                                       break;
+                               }
                        }
-               }
-
-               /* Nuke the page table entry. */
-               flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
-               if (should_defer_flush(mm, flags)) {
+                       pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
+               } else {
+                       flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
                        /*
-                        * We clear the PTE but do not flush so potentially
-                        * a remote CPU could still be writing to the folio.
-                        * If the entry was previously clean then the
-                        * architecture must guarantee that a clear->dirty
-                        * transition on a cached TLB entry is written through
-                        * and traps if the PTE is unmapped.
+                        * Nuke the page table entry. When having to clear
+                        * PageAnonExclusive(), we always have to flush.
                         */
-                       pteval = ptep_get_and_clear(mm, address, pvmw.pte);
+                       if (should_defer_flush(mm, flags) && !anon_exclusive) {
+                               /*
+                                * We clear the PTE but do not flush so potentially
+                                * a remote CPU could still be writing to the folio.
+                                * If the entry was previously clean then the
+                                * architecture must guarantee that a clear->dirty
+                                * transition on a cached TLB entry is written through
+                                * and traps if the PTE is unmapped.
+                                */
+                               pteval = ptep_get_and_clear(mm, address, pvmw.pte);
 
-                       set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
-               } else {
-                       pteval = ptep_clear_flush(vma, address, pvmw.pte);
+                               set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
+                       } else {
+                               pteval = ptep_clear_flush(vma, address, pvmw.pte);
+                       }
                }
 
+               /*
+                * Now the pte is cleared. If this pte was uffd-wp armed,
+                * we may want to replace a none pte with a marker pte if
+                * it's file-backed, so we don't lose the tracking info.
+                */
+               pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);
+
                /* Set the dirty flag on the folio now the pte is gone. */
                if (pte_dirty(pteval))
                        folio_mark_dirty(folio);
@@ -1637,11 +1711,31 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
                                break;
                        }
                        if (arch_unmap_one(mm, vma, address, pteval) < 0) {
+                               swap_free(entry);
+                               set_pte_at(mm, address, pvmw.pte, pteval);
+                               ret = false;
+                               page_vma_mapped_walk_done(&pvmw);
+                               break;
+                       }
+                       if (anon_exclusive &&
+                           page_try_share_anon_rmap(subpage)) {
+                               swap_free(entry);
                                set_pte_at(mm, address, pvmw.pte, pteval);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }
+                       /*
+                        * Note: We *don't* remember if the page was mapped
+                        * exclusively in the swap pte if the architecture
+                        * doesn't support __HAVE_ARCH_PTE_SWP_EXCLUSIVE. In
+                        * that case, swapin code has to re-determine that
+                        * manually and might detect the page as possibly
+                        * shared, for example, if there are other references on
+                        * the page or if the page is under writeback. We made
+                        * sure that there are no GUP pins on the page that
+                        * would rely on it, so for GUP pins this is fine.
+                        */
                        if (list_empty(&mm->mmlist)) {
                                spin_lock(&mmlist_lock);
                                if (list_empty(&mm->mmlist))
@@ -1651,6 +1745,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
                        dec_mm_counter(mm, MM_ANONPAGES);
                        inc_mm_counter(mm, MM_SWAPENTS);
                        swp_pte = swp_entry_to_pte(entry);
+                       if (anon_exclusive)
+                               swp_pte = pte_swp_mkexclusive(swp_pte);
                        if (pte_soft_dirty(pteval))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
                        if (pte_uffd_wp(pteval))
@@ -1741,7 +1837,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
        pte_t pteval;
        struct page *subpage;
-       bool ret = true;
+       bool anon_exclusive, ret = true;
        struct mmu_notifier_range range;
        enum ttu_flags flags = (enum ttu_flags)(long)arg;
 
@@ -1791,7 +1887,11 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                        VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
                                        !folio_test_pmd_mappable(folio), folio);
 
-                       set_pmd_migration_entry(&pvmw, subpage);
+                       if (set_pmd_migration_entry(&pvmw, subpage)) {
+                               ret = false;
+                               page_vma_mapped_walk_done(&pvmw);
+                               break;
+                       }
                        continue;
                }
 #endif
@@ -1802,44 +1902,53 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                subpage = folio_page(folio,
                                pte_pfn(*pvmw.pte) - folio_pfn(folio));
                address = pvmw.address;
+               anon_exclusive = folio_test_anon(folio) &&
+                                PageAnonExclusive(subpage);
 
-               if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
+               if (folio_test_hugetlb(folio)) {
                        /*
-                        * To call huge_pmd_unshare, i_mmap_rwsem must be
-                        * held in write mode.  Caller needs to explicitly
-                        * do this outside rmap routines.
+                        * huge_pmd_unshare may unmap an entire PMD page.
+                        * There is no way of knowing exactly which PMDs may
+                        * be cached for this mm, so we must flush them all.
+                        * start/end were already adjusted above to cover this
+                        * range.
                         */
-                       VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
-                       if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
-                               /*
-                                * huge_pmd_unshare unmapped an entire PMD
-                                * page.  There is no way of knowing exactly
-                                * which PMDs may be cached for this mm, so
-                                * we must flush them all.  start/end were
-                                * already adjusted above to cover this range.
-                                */
-                               flush_cache_range(vma, range.start, range.end);
-                               flush_tlb_range(vma, range.start, range.end);
-                               mmu_notifier_invalidate_range(mm, range.start,
-                                                             range.end);
+                       flush_cache_range(vma, range.start, range.end);
 
+                       if (!folio_test_anon(folio)) {
                                /*
-                                * The ref count of the PMD page was dropped
-                                * which is part of the way map counting
-                                * is done for shared PMDs.  Return 'true'
-                                * here.  When there is no other sharing,
-                                * huge_pmd_unshare returns false and we will
-                                * unmap the actual page and drop map count
-                                * to zero.
+                                * To call huge_pmd_unshare, i_mmap_rwsem must be
+                                * held in write mode.  Caller needs to explicitly
+                                * do this outside rmap routines.
                                 */
-                               page_vma_mapped_walk_done(&pvmw);
-                               break;
+                               VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+
+                               if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
+                                       flush_tlb_range(vma, range.start, range.end);
+                                       mmu_notifier_invalidate_range(mm, range.start,
+                                                                     range.end);
+
+                                       /*
+                                        * The ref count of the PMD page was dropped
+                                        * which is part of the way map counting
+                                        * is done for shared PMDs.  Return 'true'
+                                        * here.  When there is no other sharing,
+                                        * huge_pmd_unshare returns false and we will
+                                        * unmap the actual page and drop map count
+                                        * to zero.
+                                        */
+                                       page_vma_mapped_walk_done(&pvmw);
+                                       break;
+                               }
                        }
-               }
 
-               /* Nuke the page table entry. */
-               flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
-               pteval = ptep_clear_flush(vma, address, pvmw.pte);
+                       /* Nuke the hugetlb page table entry */
+                       pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
+               } else {
+                       flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
+                       /* Nuke the page table entry. */
+                       pteval = ptep_clear_flush(vma, address, pvmw.pte);
+               }
 
                /* Set the dirty flag on the folio now the pte is gone. */
                if (pte_dirty(pteval))
@@ -1853,6 +1962,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                        swp_entry_t entry;
                        pte_t swp_pte;
 
+                       if (anon_exclusive)
+                               BUG_ON(page_try_share_anon_rmap(subpage));
+
                        /*
                         * Store the pfn of the page in a special migration
                         * pte. do_swap_page() will wait until the migration
@@ -1861,6 +1973,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                        entry = pte_to_swp_entry(pteval);
                        if (is_writable_device_private_entry(entry))
                                entry = make_writable_migration_entry(pfn);
+                       else if (anon_exclusive)
+                               entry = make_readable_exclusive_migration_entry(pfn);
                        else
                                entry = make_readable_migration_entry(pfn);
                        swp_pte = swp_entry_to_pte(entry);
@@ -1920,7 +2034,22 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                        pte_t swp_pte;
 
                        if (arch_unmap_one(mm, vma, address, pteval) < 0) {
-                               set_pte_at(mm, address, pvmw.pte, pteval);
+                               if (folio_test_hugetlb(folio))
+                                       set_huge_pte_at(mm, address, pvmw.pte, pteval);
+                               else
+                                       set_pte_at(mm, address, pvmw.pte, pteval);
+                               ret = false;
+                               page_vma_mapped_walk_done(&pvmw);
+                               break;
+                       }
+                       VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
+                                      !anon_exclusive, subpage);
+                       if (anon_exclusive &&
+                           page_try_share_anon_rmap(subpage)) {
+                               if (folio_test_hugetlb(folio))
+                                       set_huge_pte_at(mm, address, pvmw.pte, pteval);
+                               else
+                                       set_pte_at(mm, address, pvmw.pte, pteval);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
@@ -1934,6 +2063,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                        if (pte_write(pteval))
                                entry = make_writable_migration_entry(
                                                        page_to_pfn(subpage));
+                       else if (anon_exclusive)
+                               entry = make_readable_exclusive_migration_entry(
+                                                       page_to_pfn(subpage));
                        else
                                entry = make_readable_migration_entry(
                                                        page_to_pfn(subpage));
@@ -1943,7 +2075,11 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
                        if (pte_uffd_wp(pteval))
                                swp_pte = pte_swp_mkuffd_wp(swp_pte);
-                       set_pte_at(mm, address, pvmw.pte, swp_pte);
+                       if (folio_test_hugetlb(folio))
+                               set_huge_swap_pte_at(mm, address, pvmw.pte,
+                                                    swp_pte, vma_mmu_pagesize(vma));
+                       else
+                               set_pte_at(mm, address, pvmw.pte, swp_pte);
                        trace_set_migration_pte(address, pte_val(swp_pte),
                                                compound_order(&folio->page));
                        /*
@@ -2148,7 +2284,7 @@ static bool folio_make_device_exclusive(struct folio *folio,
 
 /**
  * make_device_exclusive_range() - Mark a range for exclusive use by a device
- * @mm: mm_struct of assoicated target process
+ * @mm: mm_struct of associated target process
  * @start: start of the region to mark for exclusive device access
  * @end: end address of region
  * @pages: returns the pages which were successfully marked for exclusive access
@@ -2210,12 +2346,12 @@ void __put_anon_vma(struct anon_vma *anon_vma)
 }
 
 static struct anon_vma *rmap_walk_anon_lock(struct folio *folio,
-                                       const struct rmap_walk_control *rwc)
+                                           struct rmap_walk_control *rwc)
 {
        struct anon_vma *anon_vma;
 
        if (rwc->anon_lock)
-               return rwc->anon_lock(folio);
+               return rwc->anon_lock(folio, rwc);
 
        /*
         * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read()
@@ -2227,7 +2363,17 @@ static struct anon_vma *rmap_walk_anon_lock(struct folio *folio,
        if (!anon_vma)
                return NULL;
 
+       if (anon_vma_trylock_read(anon_vma))
+               goto out;
+
+       if (rwc->try_lock) {
+               anon_vma = NULL;
+               rwc->contended = true;
+               goto out;
+       }
+
        anon_vma_lock_read(anon_vma);
+out:
        return anon_vma;
 }
 
@@ -2241,7 +2387,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct folio *folio,
  * contained in the anon_vma struct it points to.
  */
 static void rmap_walk_anon(struct folio *folio,
-               const struct rmap_walk_control *rwc, bool locked)
+               struct rmap_walk_control *rwc, bool locked)
 {
        struct anon_vma *anon_vma;
        pgoff_t pgoff_start, pgoff_end;
@@ -2289,7 +2435,7 @@ static void rmap_walk_anon(struct folio *folio,
  * contained in the address_space struct it points to.
  */
 static void rmap_walk_file(struct folio *folio,
-               const struct rmap_walk_control *rwc, bool locked)
+               struct rmap_walk_control *rwc, bool locked)
 {
        struct address_space *mapping = folio_mapping(folio);
        pgoff_t pgoff_start, pgoff_end;
@@ -2308,8 +2454,18 @@ static void rmap_walk_file(struct folio *folio,
 
        pgoff_start = folio_pgoff(folio);
        pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
-       if (!locked)
+       if (!locked) {
+               if (i_mmap_trylock_read(mapping))
+                       goto lookup;
+
+               if (rwc->try_lock) {
+                       rwc->contended = true;
+                       return;
+               }
+
                i_mmap_lock_read(mapping);
+       }
+lookup:
        vma_interval_tree_foreach(vma, &mapping->i_mmap,
                        pgoff_start, pgoff_end) {
                unsigned long address = vma_address(&folio->page, vma);
@@ -2331,7 +2487,7 @@ done:
                i_mmap_unlock_read(mapping);
 }
 
-void rmap_walk(struct folio *folio, const struct rmap_walk_control *rwc)
+void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc)
 {
        if (unlikely(folio_test_ksm(folio)))
                rmap_walk_ksm(folio, rwc);
@@ -2342,7 +2498,7 @@ void rmap_walk(struct folio *folio, const struct rmap_walk_control *rwc)
 }
 
 /* Like rmap_walk, but caller holds relevant rmap lock */
-void rmap_walk_locked(struct folio *folio, const struct rmap_walk_control *rwc)
+void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc)
 {
        /* no ksm support for now */
        VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio);
@@ -2357,9 +2513,11 @@ void rmap_walk_locked(struct folio *folio, const struct rmap_walk_control *rwc)
  * The following two functions are for anonymous (private mapped) hugepages.
  * Unlike common anonymous pages, anonymous hugepages have no accounting code
  * and no lru code, because we handle hugepages differently from common pages.
+ *
+ * RMAP_COMPOUND is ignored.
  */
-void hugepage_add_anon_rmap(struct page *page,
-                           struct vm_area_struct *vma, unsigned long address)
+void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma,
+                           unsigned long address, rmap_t flags)
 {
        struct anon_vma *anon_vma = vma->anon_vma;
        int first;
@@ -2368,8 +2526,11 @@ void hugepage_add_anon_rmap(struct page *page,
        BUG_ON(!anon_vma);
        /* address might be in next vma when migration races vma_adjust */
        first = atomic_inc_and_test(compound_mapcount_ptr(page));
+       VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page);
+       VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page);
        if (first)
-               __page_set_anon_rmap(page, vma, address, 0);
+               __page_set_anon_rmap(page, vma, address,
+                                    !!(flags & RMAP_EXCLUSIVE));
 }
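
A migration-side sketch of the new flags argument, following the
remove_migration_pte() pattern elsewhere in this series (locals are
illustrative; the readable-exclusive migration entry type is the one
created above):

	rmap_t rmap_flags = RMAP_NONE;

	if (is_readable_exclusive_migration_entry(entry))
		rmap_flags |= RMAP_EXCLUSIVE;

	hugepage_add_anon_rmap(new, vma, pvmw.address, rmap_flags);
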
 
 void hugepage_add_new_anon_rmap(struct page *page,