diff --git a/mm/migrate.c b/mm/migrate.c
index 1852d787e6ab697d35c9e5a6992d2634f8695720..18ce840914f0d9b1b5bee825a0a0bd260899e583 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -50,6 +50,7 @@
 #include <linux/ptrace.h>
 #include <linux/oom.h>
 #include <linux/memory.h>
+#include <linux/random.h>
 
 #include <asm/tlbflush.h>
 
@@ -236,20 +237,19 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
 
                        pte = pte_mkhuge(pte);
                        pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
-                       set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
                        if (PageAnon(new))
                                hugepage_add_anon_rmap(new, vma, pvmw.address);
                        else
                                page_dup_rmap(new, true);
+                       set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
                } else
 #endif
                {
-                       set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
-
                        if (PageAnon(new))
                                page_add_anon_rmap(new, vma, pvmw.address, false);
                        else
                                page_add_file_rmap(new, false);
+                       set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
                }
                if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
                        mlock_vma_page(new);
@@ -291,7 +291,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
 {
        pte_t pte;
        swp_entry_t entry;
-       struct page *page;
+       struct folio *folio;
 
        spin_lock(ptl);
        pte = *ptep;
@@ -302,18 +302,17 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
        if (!is_migration_entry(entry))
                goto out;
 
-       page = pfn_swap_entry_to_page(entry);
-       page = compound_head(page);
+       folio = page_folio(pfn_swap_entry_to_page(entry));
 
        /*
         * Once page cache replacement of page migration started, page_count
-        * is zero; but we must not call put_and_wait_on_page_locked() without
-        * a ref. Use get_page_unless_zero(), and just fault again if it fails.
+        * is zero; but we must not call folio_put_wait_locked() without
+        * a ref. Use folio_try_get(), and just fault again if it fails.
         */
-       if (!get_page_unless_zero(page))
+       if (!folio_try_get(folio))
                goto out;
        pte_unmap_unlock(ptep, ptl);
-       put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
+       folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE);
        return;
 out:
        pte_unmap_unlock(ptep, ptl);
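For context, the unchanged caller of __migration_entry_wait() in this file looks roughly like the simplified sketch below; it only maps the PTE and finds its lock, everything else happens in the helper converted above:

	void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
				  unsigned long address)
	{
		spinlock_t *ptl = pte_lockptr(mm, pmd);		/* PTE page lock */
		pte_t *ptep = pte_offset_map(pmd, address);	/* map the PTE */

		/* Takes ptl, revalidates the pte and waits on the folio. */
		__migration_entry_wait(mm, ptep, ptl);
	}
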
@@ -338,16 +337,16 @@ void migration_entry_wait_huge(struct vm_area_struct *vma,
 void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
 {
        spinlock_t *ptl;
-       struct page *page;
+       struct folio *folio;
 
        ptl = pmd_lock(mm, pmd);
        if (!is_pmd_migration_entry(*pmd))
                goto unlock;
-       page = pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd));
-       if (!get_page_unless_zero(page))
+       folio = page_folio(pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd)));
+       if (!folio_try_get(folio))
                goto unlock;
        spin_unlock(ptl);
-       put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
+       folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE);
        return;
 unlock:
        spin_unlock(ptl);
@@ -364,7 +363,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
         */
        expected_count += is_device_private_page(page);
        if (mapping)
-               expected_count += thp_nr_pages(page) + page_has_private(page);
+               expected_count += compound_nr(page) + page_has_private(page);
 
        return expected_count;
 }
@@ -377,83 +376,70 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
  * 2 for pages with a mapping
  * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
  */
-int migrate_page_move_mapping(struct address_space *mapping,
-               struct page *newpage, struct page *page, int extra_count)
+int folio_migrate_mapping(struct address_space *mapping,
+               struct folio *newfolio, struct folio *folio, int extra_count)
 {
-       XA_STATE(xas, &mapping->i_pages, page_index(page));
+       XA_STATE(xas, &mapping->i_pages, folio_index(folio));
        struct zone *oldzone, *newzone;
        int dirty;
-       int expected_count = expected_page_refs(mapping, page) + extra_count;
-       int nr = thp_nr_pages(page);
+       int expected_count = expected_page_refs(mapping, &folio->page) + extra_count;
+       long nr = folio_nr_pages(folio);
 
        if (!mapping) {
                /* Anonymous page without mapping */
-               if (page_count(page) != expected_count)
+               if (folio_ref_count(folio) != expected_count)
                        return -EAGAIN;
 
                /* No turning back from here */
-               newpage->index = page->index;
-               newpage->mapping = page->mapping;
-               if (PageSwapBacked(page))
-                       __SetPageSwapBacked(newpage);
+               newfolio->index = folio->index;
+               newfolio->mapping = folio->mapping;
+               if (folio_test_swapbacked(folio))
+                       __folio_set_swapbacked(newfolio);
 
                return MIGRATEPAGE_SUCCESS;
        }
 
-       oldzone = page_zone(page);
-       newzone = page_zone(newpage);
+       oldzone = folio_zone(folio);
+       newzone = folio_zone(newfolio);
 
        xas_lock_irq(&xas);
-       if (page_count(page) != expected_count || xas_load(&xas) != page) {
-               xas_unlock_irq(&xas);
-               return -EAGAIN;
-       }
-
-       if (!page_ref_freeze(page, expected_count)) {
+       if (!folio_ref_freeze(folio, expected_count)) {
                xas_unlock_irq(&xas);
                return -EAGAIN;
        }
 
        /*
-        * Now we know that no one else is looking at the page:
+        * Now we know that no one else is looking at the folio:
         * no turning back from here.
         */
-       newpage->index = page->index;
-       newpage->mapping = page->mapping;
-       page_ref_add(newpage, nr); /* add cache reference */
-       if (PageSwapBacked(page)) {
-               __SetPageSwapBacked(newpage);
-               if (PageSwapCache(page)) {
-                       SetPageSwapCache(newpage);
-                       set_page_private(newpage, page_private(page));
+       newfolio->index = folio->index;
+       newfolio->mapping = folio->mapping;
+       folio_ref_add(newfolio, nr); /* add cache reference */
+       if (folio_test_swapbacked(folio)) {
+               __folio_set_swapbacked(newfolio);
+               if (folio_test_swapcache(folio)) {
+                       folio_set_swapcache(newfolio);
+                       newfolio->private = folio_get_private(folio);
                }
        } else {
-               VM_BUG_ON_PAGE(PageSwapCache(page), page);
+               VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
        }
 
        /* Move dirty while page refs frozen and newpage not yet exposed */
-       dirty = PageDirty(page);
+       dirty = folio_test_dirty(folio);
        if (dirty) {
-               ClearPageDirty(page);
-               SetPageDirty(newpage);
+               folio_clear_dirty(folio);
+               folio_set_dirty(newfolio);
        }
 
-       xas_store(&xas, newpage);
-       if (PageTransHuge(page)) {
-               int i;
-
-               for (i = 1; i < nr; i++) {
-                       xas_next(&xas);
-                       xas_store(&xas, newpage);
-               }
-       }
+       xas_store(&xas, newfolio);
 
        /*
         * Drop cache reference from old page by unfreezing
         * to one less reference.
         * We know this isn't the last reference.
         */
-       page_ref_unfreeze(page, expected_count - nr);
+       folio_ref_unfreeze(folio, expected_count - nr);
 
        xas_unlock(&xas);
        /* Leave irq disabled to prevent preemption while updating stats */
@@ -472,18 +458,18 @@ int migrate_page_move_mapping(struct address_space *mapping,
                struct lruvec *old_lruvec, *new_lruvec;
                struct mem_cgroup *memcg;
 
-               memcg = page_memcg(page);
+               memcg = folio_memcg(folio);
                old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
                new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
 
                __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
                __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
-               if (PageSwapBacked(page) && !PageSwapCache(page)) {
+               if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) {
                        __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
                        __mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
                }
 #ifdef CONFIG_SWAP
-               if (PageSwapCache(page)) {
+               if (folio_test_swapcache(folio)) {
                        __mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
                        __mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
                }
@@ -499,11 +485,11 @@ int migrate_page_move_mapping(struct address_space *mapping,
 
        return MIGRATEPAGE_SUCCESS;
 }
-EXPORT_SYMBOL(migrate_page_move_mapping);
+EXPORT_SYMBOL(folio_migrate_mapping);
 
 /*
  * The expected number of remaining references is the same as that
- * of migrate_page_move_mapping().
+ * of folio_migrate_mapping().
  */
 int migrate_huge_page_move_mapping(struct address_space *mapping,
                                   struct page *newpage, struct page *page)
@@ -538,91 +524,87 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
 }
 
 /*
- * Copy the page to its new location
+ * Copy the flags and some other ancillary information
  */
-void migrate_page_states(struct page *newpage, struct page *page)
+void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
 {
        int cpupid;
 
-       if (PageError(page))
-               SetPageError(newpage);
-       if (PageReferenced(page))
-               SetPageReferenced(newpage);
-       if (PageUptodate(page))
-               SetPageUptodate(newpage);
-       if (TestClearPageActive(page)) {
-               VM_BUG_ON_PAGE(PageUnevictable(page), page);
-               SetPageActive(newpage);
-       } else if (TestClearPageUnevictable(page))
-               SetPageUnevictable(newpage);
-       if (PageWorkingset(page))
-               SetPageWorkingset(newpage);
-       if (PageChecked(page))
-               SetPageChecked(newpage);
-       if (PageMappedToDisk(page))
-               SetPageMappedToDisk(newpage);
-
-       /* Move dirty on pages not done by migrate_page_move_mapping() */
-       if (PageDirty(page))
-               SetPageDirty(newpage);
-
-       if (page_is_young(page))
-               set_page_young(newpage);
-       if (page_is_idle(page))
-               set_page_idle(newpage);
+       if (folio_test_error(folio))
+               folio_set_error(newfolio);
+       if (folio_test_referenced(folio))
+               folio_set_referenced(newfolio);
+       if (folio_test_uptodate(folio))
+               folio_mark_uptodate(newfolio);
+       if (folio_test_clear_active(folio)) {
+               VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio);
+               folio_set_active(newfolio);
+       } else if (folio_test_clear_unevictable(folio))
+               folio_set_unevictable(newfolio);
+       if (folio_test_workingset(folio))
+               folio_set_workingset(newfolio);
+       if (folio_test_checked(folio))
+               folio_set_checked(newfolio);
+       if (folio_test_mappedtodisk(folio))
+               folio_set_mappedtodisk(newfolio);
+
+       /* Move dirty on pages not done by folio_migrate_mapping() */
+       if (folio_test_dirty(folio))
+               folio_set_dirty(newfolio);
+
+       if (folio_test_young(folio))
+               folio_set_young(newfolio);
+       if (folio_test_idle(folio))
+               folio_set_idle(newfolio);
 
        /*
         * Copy NUMA information to the new page, to prevent over-eager
         * future migrations of this same page.
         */
-       cpupid = page_cpupid_xchg_last(page, -1);
-       page_cpupid_xchg_last(newpage, cpupid);
+       cpupid = page_cpupid_xchg_last(&folio->page, -1);
+       page_cpupid_xchg_last(&newfolio->page, cpupid);
 
-       ksm_migrate_page(newpage, page);
+       folio_migrate_ksm(newfolio, folio);
        /*
         * Please do not reorder this without considering how mm/ksm.c's
         * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
         */
-       if (PageSwapCache(page))
-               ClearPageSwapCache(page);
-       ClearPagePrivate(page);
+       if (folio_test_swapcache(folio))
+               folio_clear_swapcache(folio);
+       folio_clear_private(folio);
 
        /* page->private contains hugetlb specific flags */
-       if (!PageHuge(page))
-               set_page_private(page, 0);
+       if (!folio_test_hugetlb(folio))
+               folio->private = NULL;
 
        /*
         * If any waiters have accumulated on the new page then
         * wake them up.
         */
-       if (PageWriteback(newpage))
-               end_page_writeback(newpage);
+       if (folio_test_writeback(newfolio))
+               folio_end_writeback(newfolio);
 
        /*
         * PG_readahead shares the same bit with PG_reclaim.  The above
         * end_page_writeback() may clear PG_readahead mistakenly, so set the
         * bit after that.
         */
-       if (PageReadahead(page))
-               SetPageReadahead(newpage);
+       if (folio_test_readahead(folio))
+               folio_set_readahead(newfolio);
 
-       copy_page_owner(page, newpage);
+       folio_copy_owner(newfolio, folio);
 
-       if (!PageHuge(page))
-               mem_cgroup_migrate(page, newpage);
+       if (!folio_test_hugetlb(folio))
+               mem_cgroup_migrate(folio, newfolio);
 }
-EXPORT_SYMBOL(migrate_page_states);
+EXPORT_SYMBOL(folio_migrate_flags);
 
-void migrate_page_copy(struct page *newpage, struct page *page)
+void folio_migrate_copy(struct folio *newfolio, struct folio *folio)
 {
-       if (PageHuge(page) || PageTransHuge(page))
-               copy_huge_page(newpage, page);
-       else
-               copy_highpage(newpage, page);
-
-       migrate_page_states(newpage, page);
+       folio_copy(newfolio, folio);
+       folio_migrate_flags(newfolio, folio);
 }
-EXPORT_SYMBOL(migrate_page_copy);
+EXPORT_SYMBOL(folio_migrate_copy);
 
 /************************************************************
  *                    Migration functions
@@ -638,19 +620,21 @@ int migrate_page(struct address_space *mapping,
                struct page *newpage, struct page *page,
                enum migrate_mode mode)
 {
+       struct folio *newfolio = page_folio(newpage);
+       struct folio *folio = page_folio(page);
        int rc;
 
-       BUG_ON(PageWriteback(page));    /* Writeback must be complete */
+       BUG_ON(folio_test_writeback(folio));    /* Writeback must be complete */
 
-       rc = migrate_page_move_mapping(mapping, newpage, page, 0);
+       rc = folio_migrate_mapping(mapping, newfolio, folio, 0);
 
        if (rc != MIGRATEPAGE_SUCCESS)
                return rc;
 
        if (mode != MIGRATE_SYNC_NO_COPY)
-               migrate_page_copy(newpage, page);
+               folio_migrate_copy(newfolio, folio);
        else
-               migrate_page_states(newpage, page);
+               folio_migrate_flags(newfolio, folio);
        return MIGRATEPAGE_SUCCESS;
 }
 EXPORT_SYMBOL(migrate_page);
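As a usage sketch (illustrative only, not taken from this patch): a filesystem whose pages carry no private data can point its address_space_operations at the generic helper above; example_aops is a hypothetical name.

	#include <linux/fs.h>
	#include <linux/migrate.h>

	/* Minimal sketch: other aops methods omitted for brevity. */
	static const struct address_space_operations example_aops = {
		.migratepage	= migrate_page,	/* generic path shown above */
	};

Filesystems that attach buffer heads typically use buffer_migrate_page() here instead.
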
@@ -1100,80 +1084,6 @@ out:
        return rc;
 }
 
-
-/*
- * node_demotion[] example:
- *
- * Consider a system with two sockets.  Each socket has
- * three classes of memory attached: fast, medium and slow.
- * Each memory class is placed in its own NUMA node.  The
- * CPUs are placed in the node with the "fast" memory.  The
- * 6 NUMA nodes (0-5) might be split among the sockets like
- * this:
- *
- *     Socket A: 0, 1, 2
- *     Socket B: 3, 4, 5
- *
- * When Node 0 fills up, its memory should be migrated to
- * Node 1.  When Node 1 fills up, it should be migrated to
- * Node 2.  The migration path start on the nodes with the
- * processors (since allocations default to this node) and
- * fast memory, progress through medium and end with the
- * slow memory:
- *
- *     0 -> 1 -> 2 -> stop
- *     3 -> 4 -> 5 -> stop
- *
- * This is represented in the node_demotion[] like this:
- *
- *     {  1, // Node 0 migrates to 1
- *        2, // Node 1 migrates to 2
- *       -1, // Node 2 does not migrate
- *        4, // Node 3 migrates to 4
- *        5, // Node 4 migrates to 5
- *       -1} // Node 5 does not migrate
- */
-
-/*
- * Writes to this array occur without locking.  Cycles are
- * not allowed: Node X demotes to Y which demotes to X...
- *
- * If multiple reads are performed, a single rcu_read_lock()
- * must be held over all reads to ensure that no cycles are
- * observed.
- */
-static int node_demotion[MAX_NUMNODES] __read_mostly =
-       {[0 ...  MAX_NUMNODES - 1] = NUMA_NO_NODE};
-
-/**
- * next_demotion_node() - Get the next node in the demotion path
- * @node: The starting node to lookup the next node
- *
- * Return: node id for next memory node in the demotion path hierarchy
- * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
- * @node online or guarantee that it *continues* to be the next demotion
- * target.
- */
-int next_demotion_node(int node)
-{
-       int target;
-
-       /*
-        * node_demotion[] is updated without excluding this
-        * function from running.  RCU doesn't provide any
-        * compiler barriers, so the READ_ONCE() is required
-        * to avoid compiler reordering or read merging.
-        *
-        * Make sure to use RCU over entire code blocks if
-        * node_demotion[] reads need to be consistent.
-        */
-       rcu_read_lock();
-       target = READ_ONCE(node_demotion[node]);
-       rcu_read_unlock();
-
-       return target;
-}
-
 /*
  * Obtain the lock on page, remove all ptes and migrate the page
  * to the newly allocated page in newpage.
@@ -1429,7 +1339,7 @@ static inline int try_split_thp(struct page *page, struct page **page2,
  * @mode:              The migration mode that specifies the constraints for
  *                     page migration, if any.
  * @reason:            The reason for page migration.
- * @ret_succeeded:     Set to the number of pages migrated successfully if
+ * @ret_succeeded:     Set to the number of normal pages migrated successfully if
  *                     the caller passes a non-NULL pointer.
  *
  * The function returns after 10 attempts or if no pages are movable any more
@@ -1437,7 +1347,9 @@ static inline int try_split_thp(struct page *page, struct page **page2,
  * It is caller's responsibility to call putback_movable_pages() to return pages
  * to the LRU or free list only if ret != 0.
  *
- * Returns the number of pages that were not migrated, or an error code.
+ * Returns the number of {normal pages, THPs, hugetlb pages} that were not
+ * migrated, or an error code. A split THP counts as one non-migrated THP,
+ * no matter how many of its subpages were migrated successfully.
  */
 int migrate_pages(struct list_head *from, new_page_t get_new_page,
                free_page_t put_new_page, unsigned long private,
@@ -1446,6 +1358,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
        int retry = 1;
        int thp_retry = 1;
        int nr_failed = 0;
+       int nr_failed_pages = 0;
        int nr_succeeded = 0;
        int nr_thp_succeeded = 0;
        int nr_thp_failed = 0;
@@ -1457,13 +1370,16 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
        int swapwrite = current->flags & PF_SWAPWRITE;
        int rc, nr_subpages;
        LIST_HEAD(ret_pages);
+       LIST_HEAD(thp_split_pages);
        bool nosplit = (reason == MR_NUMA_MISPLACED);
+       bool no_subpage_counting = false;
 
        trace_mm_migrate_pages_start(mode, reason);
 
        if (!swapwrite)
                current->flags |= PF_SWAPWRITE;
 
+thp_subpage_migration:
        for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
                retry = 0;
                thp_retry = 0;
@@ -1476,7 +1392,7 @@ retry:
                         * during migration.
                         */
                        is_thp = PageTransHuge(page) && !PageHuge(page);
-                       nr_subpages = thp_nr_pages(page);
+                       nr_subpages = compound_nr(page);
                        cond_resched();
 
                        if (PageHuge(page))
@@ -1512,18 +1428,20 @@ retry:
                        case -ENOSYS:
                                /* THP migration is unsupported */
                                if (is_thp) {
-                                       if (!try_split_thp(page, &page2, from)) {
+                                       nr_thp_failed++;
+                                       if (!try_split_thp(page, &page2, &thp_split_pages)) {
                                                nr_thp_split++;
                                                goto retry;
                                        }
 
-                                       nr_thp_failed++;
-                                       nr_failed += nr_subpages;
+                                       nr_failed_pages += nr_subpages;
                                        break;
                                }
 
                                /* Hugetlb migration is unsupported */
-                               nr_failed++;
+                               if (!no_subpage_counting)
+                                       nr_failed++;
+                               nr_failed_pages += nr_subpages;
                                break;
                        case -ENOMEM:
                                /*
@@ -1532,16 +1450,19 @@ retry:
                                 * THP NUMA faulting doesn't split THP to retry.
                                 */
                                if (is_thp && !nosplit) {
-                                       if (!try_split_thp(page, &page2, from)) {
+                                       nr_thp_failed++;
+                                       if (!try_split_thp(page, &page2, &thp_split_pages)) {
                                                nr_thp_split++;
                                                goto retry;
                                        }
 
-                                       nr_thp_failed++;
-                                       nr_failed += nr_subpages;
+                                       nr_failed_pages += nr_subpages;
                                        goto out;
                                }
-                               nr_failed++;
+
+                               if (!no_subpage_counting)
+                                       nr_failed++;
+                               nr_failed_pages += nr_subpages;
                                goto out;
                        case -EAGAIN:
                                if (is_thp) {
@@ -1551,12 +1472,11 @@ retry:
                                retry++;
                                break;
                        case MIGRATEPAGE_SUCCESS:
+                               nr_succeeded += nr_subpages;
                                if (is_thp) {
                                        nr_thp_succeeded++;
-                                       nr_succeeded += nr_subpages;
                                        break;
                                }
-                               nr_succeeded++;
                                break;
                        default:
                                /*
@@ -1567,17 +1487,37 @@ retry:
                                 */
                                if (is_thp) {
                                        nr_thp_failed++;
-                                       nr_failed += nr_subpages;
+                                       nr_failed_pages += nr_subpages;
                                        break;
                                }
-                               nr_failed++;
+
+                               if (!no_subpage_counting)
+                                       nr_failed++;
+                               nr_failed_pages += nr_subpages;
                                break;
                        }
                }
        }
-       nr_failed += retry + thp_retry;
+       nr_failed += retry;
        nr_thp_failed += thp_retry;
-       rc = nr_failed;
+       /*
+        * Try to migrate the subpages of THPs which failed to migrate and
+        * were split. Don't bump nr_failed in this round, since each such
+        * THP was already counted as one failure in the first round.
+        */
+       if (!list_empty(&thp_split_pages)) {
+               /*
+                * Move non-migrated pages (after 10 retries) to ret_pages
+                * to avoid migrating them again.
+                */
+               list_splice_init(from, &ret_pages);
+               list_splice_init(&thp_split_pages, from);
+               no_subpage_counting = true;
+               retry = 1;
+               goto thp_subpage_migration;
+       }
+
+       rc = nr_failed + nr_thp_failed;
 out:
        /*
         * Put the permanent failure page back to migration list, they
@@ -1586,11 +1526,11 @@ out:
        list_splice(&ret_pages, from);
 
        count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
-       count_vm_events(PGMIGRATE_FAIL, nr_failed);
+       count_vm_events(PGMIGRATE_FAIL, nr_failed_pages);
        count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
        count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
        count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
-       trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded,
+       trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded,
                               nr_thp_failed, nr_thp_split, mode, reason);
 
        if (!swapwrite)
@@ -2369,7 +2309,6 @@ again:
                 * can't be dropped from it).
                 */
                get_page(page);
-               migrate->cpages++;
 
                /*
                 * Optimize for the common case where page is only mapped once
@@ -2379,7 +2318,7 @@ again:
                if (trylock_page(page)) {
                        pte_t swp_pte;
 
-                       mpfn |= MIGRATE_PFN_LOCKED;
+                       migrate->cpages++;
                        ptep_get_and_clear(mm, addr, ptep);
 
                        /* Setup special migration page table entry */
@@ -2413,6 +2352,9 @@ again:
 
                        if (pte_present(pte))
                                unmapped++;
+               } else {
+                       put_page(page);
+                       mpfn = 0;
                }
 
 next:
@@ -2468,7 +2410,7 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
  * @page: struct page to check
  *
  * Pinned pages cannot be migrated. This is the same test as in
- * migrate_page_move_mapping(), except that here we allow migration of a
+ * folio_migrate_mapping(), except that here we allow migration of a
  * ZONE_DEVICE page.
  */
 static bool migrate_vma_check_page(struct page *page)
@@ -2517,49 +2459,30 @@ static bool migrate_vma_check_page(struct page *page)
 }
 
 /*
- * migrate_vma_prepare() - lock pages and isolate them from the lru
+ * migrate_vma_unmap() - replace page mapping with special migration pte entry
  * @migrate: migrate struct containing all migration information
  *
- * This locks pages that have been collected by migrate_vma_collect(). Once each
- * page is locked it is isolated from the lru (for non-device pages). Finally,
- * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be
- * migrated by concurrent kernel threads.
+ * Isolate pages from the LRU and replace their mappings (CPU page table ptes)
+ * with special migration pte entries, then check whether they have been pinned.
+ * Pinned pages are restored because we cannot migrate them.
+ *
+ * This is the last step before we call the device driver callback to allocate
+ * destination memory and copy contents of original page over to new page.
  */
-static void migrate_vma_prepare(struct migrate_vma *migrate)
+static void migrate_vma_unmap(struct migrate_vma *migrate)
 {
        const unsigned long npages = migrate->npages;
-       const unsigned long start = migrate->start;
-       unsigned long addr, i, restore = 0;
+       unsigned long i, restore = 0;
        bool allow_drain = true;
 
        lru_add_drain();
 
-       for (i = 0; (i < npages) && migrate->cpages; i++) {
+       for (i = 0; i < npages; i++) {
                struct page *page = migrate_pfn_to_page(migrate->src[i]);
-               bool remap = true;
 
                if (!page)
                        continue;
 
-               if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) {
-                       /*
-                        * Because we are migrating several pages there can be
-                        * a deadlock between 2 concurrent migration where each
-                        * are waiting on each other page lock.
-                        *
-                        * Make migrate_vma() a best effort thing and backoff
-                        * for any page we can not lock right away.
-                        */
-                       if (!trylock_page(page)) {
-                               migrate->src[i] = 0;
-                               migrate->cpages--;
-                               put_page(page);
-                               continue;
-                       }
-                       remap = false;
-                       migrate->src[i] |= MIGRATE_PFN_LOCKED;
-               }
-
                /* ZONE_DEVICE pages are not on LRU */
                if (!is_zone_device_page(page)) {
                        if (!PageLRU(page) && allow_drain) {
@@ -2569,16 +2492,9 @@ static void migrate_vma_prepare(struct migrate_vma *migrate)
                        }
 
                        if (isolate_lru_page(page)) {
-                               if (remap) {
-                                       migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
-                                       migrate->cpages--;
-                                       restore++;
-                               } else {
-                                       migrate->src[i] = 0;
-                                       unlock_page(page);
-                                       migrate->cpages--;
-                                       put_page(page);
-                               }
+                               migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+                               migrate->cpages--;
+                               restore++;
                                continue;
                        }
 
@@ -2586,83 +2502,23 @@ static void migrate_vma_prepare(struct migrate_vma *migrate)
                        put_page(page);
                }
 
-               if (!migrate_vma_check_page(page)) {
-                       if (remap) {
-                               migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
-                               migrate->cpages--;
-                               restore++;
-
-                               if (!is_zone_device_page(page)) {
-                                       get_page(page);
-                                       putback_lru_page(page);
-                               }
-                       } else {
-                               migrate->src[i] = 0;
-                               unlock_page(page);
-                               migrate->cpages--;
+               if (page_mapped(page))
+                       try_to_migrate(page, 0);
 
-                               if (!is_zone_device_page(page))
-                                       putback_lru_page(page);
-                               else
-                                       put_page(page);
+               if (page_mapped(page) || !migrate_vma_check_page(page)) {
+                       if (!is_zone_device_page(page)) {
+                               get_page(page);
+                               putback_lru_page(page);
                        }
-               }
-       }
-
-       for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) {
-               struct page *page = migrate_pfn_to_page(migrate->src[i]);
-
-               if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
-                       continue;
-
-               remove_migration_pte(page, migrate->vma, addr, page);
-
-               migrate->src[i] = 0;
-               unlock_page(page);
-               put_page(page);
-               restore--;
-       }
-}
-
-/*
- * migrate_vma_unmap() - replace page mapping with special migration pte entry
- * @migrate: migrate struct containing all migration information
- *
- * Replace page mapping (CPU page table pte) with a special migration pte entry
- * and check again if it has been pinned. Pinned pages are restored because we
- * cannot migrate them.
- *
- * This is the last step before we call the device driver callback to allocate
- * destination memory and copy contents of original page over to new page.
- */
-static void migrate_vma_unmap(struct migrate_vma *migrate)
-{
-       const unsigned long npages = migrate->npages;
-       const unsigned long start = migrate->start;
-       unsigned long addr, i, restore = 0;
 
-       for (i = 0; i < npages; i++) {
-               struct page *page = migrate_pfn_to_page(migrate->src[i]);
-
-               if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
+                       migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+                       migrate->cpages--;
+                       restore++;
                        continue;
-
-               if (page_mapped(page)) {
-                       try_to_migrate(page, 0);
-                       if (page_mapped(page))
-                               goto restore;
                }
-
-               if (migrate_vma_check_page(page))
-                       continue;
-
-restore:
-               migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
-               migrate->cpages--;
-               restore++;
        }
 
-       for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
+       for (i = 0; i < npages && restore; i++) {
                struct page *page = migrate_pfn_to_page(migrate->src[i]);
 
                if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
@@ -2672,12 +2528,8 @@ restore:
 
                migrate->src[i] = 0;
                unlock_page(page);
+               put_page(page);
                restore--;
-
-               if (is_zone_device_page(page))
-                       put_page(page);
-               else
-                       putback_lru_page(page);
        }
 }
 
@@ -2700,8 +2552,8 @@ restore:
  * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
  * flag set).  Once these are allocated and copied, the caller must update each
  * corresponding entry in the dst array with the pfn value of the destination
- * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_LOCKED flags set
- * (destination pages must have their struct pages locked, via lock_page()).
+ * page and with MIGRATE_PFN_VALID. Destination pages must be locked via
+ * lock_page().
  *
  * Note that the caller does not have to migrate all the pages that are marked
  * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
@@ -2770,8 +2622,6 @@ int migrate_vma_setup(struct migrate_vma *args)
 
        migrate_vma_collect(args);
 
-       if (args->cpages)
-               migrate_vma_prepare(args);
        if (args->cpages)
                migrate_vma_unmap(args);
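With MIGRATE_PFN_LOCKED gone, a device driver's allocate-and-copy step between migrate_vma_setup() and migrate_vma_pages() reduces to something like the hypothetical sketch below, loosely modelled on lib/test_hmm.c; the function name and GFP choice are illustrative assumptions, and a real driver would allocate device memory here instead:

	#include <linux/gfp.h>
	#include <linux/highmem.h>
	#include <linux/migrate.h>
	#include <linux/mm.h>

	static void example_alloc_and_copy(struct migrate_vma *args)
	{
		unsigned long i;

		for (i = 0; i < args->npages; i++) {
			struct page *spage = migrate_pfn_to_page(args->src[i]);
			struct page *dpage;

			if (!spage || !(args->src[i] & MIGRATE_PFN_MIGRATE))
				continue;

			dpage = alloc_page(GFP_HIGHUSER);
			if (!dpage)
				continue;	/* core restores the source page */

			lock_page(dpage);	/* destination must be locked */
			copy_highpage(dpage, spage);
			/* Only MIGRATE_PFN_VALID now; migrate_pfn() sets it. */
			args->dst[i] = migrate_pfn(page_to_pfn(dpage));
		}
	}
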
 
@@ -2846,7 +2696,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
 
        if (unlikely(anon_vma_prepare(vma)))
                goto abort;
-       if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
+       if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
                goto abort;
 
        /*
@@ -3066,14 +2916,152 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
 EXPORT_SYMBOL(migrate_vma_finalize);
 #endif /* CONFIG_DEVICE_PRIVATE */
 
+/*
+ * node_demotion[] example:
+ *
+ * Consider a system with two sockets.  Each socket has
+ * three classes of memory attached: fast, medium and slow.
+ * Each memory class is placed in its own NUMA node.  The
+ * CPUs are placed in the node with the "fast" memory.  The
+ * 6 NUMA nodes (0-5) might be split among the sockets like
+ * this:
+ *
+ *     Socket A: 0, 1, 2
+ *     Socket B: 3, 4, 5
+ *
+ * When Node 0 fills up, its memory should be migrated to
+ * Node 1.  When Node 1 fills up, it should be migrated to
+ * Node 2.  The migration path start on the nodes with the
+ * processors (since allocations default to this node) and
+ * fast memory, progress through medium and end with the
+ * slow memory:
+ *
+ *     0 -> 1 -> 2 -> stop
+ *     3 -> 4 -> 5 -> stop
+ *
+ * This is represented in the node_demotion[] like this:
+ *
+ *     {  nr=1, nodes[0]=1 }, // Node 0 migrates to 1
+ *     {  nr=1, nodes[0]=2 }, // Node 1 migrates to 2
+ *     {  nr=0, nodes[0]=-1 }, // Node 2 does not migrate
+ *     {  nr=1, nodes[0]=4 }, // Node 3 migrates to 4
+ *     {  nr=1, nodes[0]=5 }, // Node 4 migrates to 5
+ *     {  nr=0, nodes[0]=-1 }, // Node 5 does not migrate
+ *
+ * Moreover, some systems may have multiple slow memory nodes.
+ * Suppose a system has one socket with 3 memory nodes: node 0
+ * is fast memory, while nodes 1 and 2 are both slow memory and
+ * sit at the same distance from the fast node. The migration
+ * path should then be:
+ *
+ *     0 -> 1/2 -> stop
+ *
+ * This is represented in the node_demotion[] like this:
+ *     { nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2
+ *     { nr=0, nodes[0]=-1, }, // Node 1 does not migrate
+ *     { nr=0, nodes[0]=-1, }, // Node 2 does not migrate
+ */
+
+/*
+ * Writes to this array occur without locking.  Cycles are
+ * not allowed: Node X demotes to Y which demotes to X...
+ *
+ * If multiple reads are performed, a single rcu_read_lock()
+ * must be held over all reads to ensure that no cycles are
+ * observed.
+ */
+#define DEFAULT_DEMOTION_TARGET_NODES 15
+
+#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES
+#define DEMOTION_TARGET_NODES  (MAX_NUMNODES - 1)
+#else
+#define DEMOTION_TARGET_NODES  DEFAULT_DEMOTION_TARGET_NODES
+#endif
+
+struct demotion_nodes {
+       unsigned short nr;
+       short nodes[DEMOTION_TARGET_NODES];
+};
+
+static struct demotion_nodes *node_demotion __read_mostly;
+
+/**
+ * next_demotion_node() - Get the next node in the demotion path
+ * @node: The starting node to lookup the next node
+ *
+ * Return: node id for next memory node in the demotion path hierarchy
+ * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
+ * @node online or guarantee that it *continues* to be the next demotion
+ * target.
+ */
+int next_demotion_node(int node)
+{
+       struct demotion_nodes *nd;
+       unsigned short target_nr, index;
+       int target;
+
+       if (!node_demotion)
+               return NUMA_NO_NODE;
+
+       nd = &node_demotion[node];
+
+       /*
+        * node_demotion[] is updated without excluding this
+        * function from running.  RCU doesn't provide any
+        * compiler barriers, so the READ_ONCE() is required
+        * to avoid compiler reordering or read merging.
+        *
+        * Make sure to use RCU over entire code blocks if
+        * node_demotion[] reads need to be consistent.
+        */
+       rcu_read_lock();
+       target_nr = READ_ONCE(nd->nr);
+
+       switch (target_nr) {
+       case 0:
+               target = NUMA_NO_NODE;
+               goto out;
+       case 1:
+               index = 0;
+               break;
+       default:
+               /*
+                * If there are multiple target nodes, just select one
+                * target node randomly.
+                *
+                * Round-robin selection would also work, but it would need
+                * an extra field in node_demotion[] to record the last
+                * selected target node, and that constantly-changing field
+                * could cause cache ping-pong. Per-CPU data would avoid the
+                * caching issue but seems more complicated, so random
+                * selection is the better choice for now.
+                */
+               index = get_random_int() % target_nr;
+               break;
+       }
+
+       target = READ_ONCE(nd->nodes[index]);
+
+out:
+       rcu_read_unlock();
+       return target;
+}
+
 #if defined(CONFIG_HOTPLUG_CPU)
 /* Disable reclaim-based migration. */
 static void __disable_all_migrate_targets(void)
 {
-       int node;
+       int node, i;
 
-       for_each_online_node(node)
-               node_demotion[node] = NUMA_NO_NODE;
+       if (!node_demotion)
+               return;
+
+       for_each_online_node(node) {
+               node_demotion[node].nr = 0;
+               for (i = 0; i < DEMOTION_TARGET_NODES; i++)
+                       node_demotion[node].nodes[i] = NUMA_NO_NODE;
+       }
 }
 
 static void disable_all_migrate_targets(void)
@@ -3100,26 +3088,40 @@ static void disable_all_migrate_targets(void)
  * Failing here is OK.  It might just indicate
  * being at the end of a chain.
  */
-static int establish_migrate_target(int node, nodemask_t *used)
+static int establish_migrate_target(int node, nodemask_t *used,
+                                   int best_distance)
 {
-       int migration_target;
+       int migration_target, index, val;
+       struct demotion_nodes *nd;
 
-       /*
-        * Can not set a migration target on a
-        * node with it already set.
-        *
-        * No need for READ_ONCE() here since this
-        * in the write path for node_demotion[].
-        * This should be the only thread writing.
-        */
-       if (node_demotion[node] != NUMA_NO_NODE)
+       if (!node_demotion)
                return NUMA_NO_NODE;
 
+       nd = &node_demotion[node];
+
        migration_target = find_next_best_node(node, used);
        if (migration_target == NUMA_NO_NODE)
                return NUMA_NO_NODE;
 
-       node_demotion[node] = migration_target;
+       /*
+        * If a migration target has already been chosen for this node,
+        * best_distance holds the distance to that target. Accept further
+        * targets only if they sit at the same best distance from this
+        * node.
+        */
+       if (best_distance != -1) {
+               val = node_distance(node, migration_target);
+               if (val > best_distance)
+                       return NUMA_NO_NODE;
+       }
+
+       index = nd->nr;
+       if (WARN_ONCE(index >= DEMOTION_TARGET_NODES,
+                     "Exceeds maximum demotion target nodes\n"))
+               return NUMA_NO_NODE;
+
+       nd->nodes[index] = migration_target;
+       nd->nr++;
 
        return migration_target;
 }
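Because cycles are forbidden in node_demotion[], a walk over next_demotion_node() always terminates. A hypothetical debug helper (purely illustrative, not part of this change) could print a node's demotion chain like this:

	#include <linux/migrate.h>
	#include <linux/numa.h>
	#include <linux/printk.h>

	static void example_print_demotion_path(int nid)
	{
		while (nid != NUMA_NO_NODE) {
			pr_info("demotion path: node %d\n", nid);
			nid = next_demotion_node(nid);
		}
	}
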
@@ -3135,7 +3137,9 @@ static int establish_migrate_target(int node, nodemask_t *used)
  *
  * The difference here is that cycles must be avoided.  If
  * node0 migrates to node1, then neither node1, nor anything
- * node1 migrates to can migrate to node0.
+ * node1 migrates to can migrate to node0. Also one node can
+ * be migrated to multiple nodes if the target nodes all have
+ * a same best-distance against the source node.
  *
  * This function can run simultaneously with readers of
  * node_demotion[].  However, it can not run simultaneously
@@ -3147,7 +3151,7 @@ static void __set_migration_target_nodes(void)
        nodemask_t next_pass    = NODE_MASK_NONE;
        nodemask_t this_pass    = NODE_MASK_NONE;
        nodemask_t used_targets = NODE_MASK_NONE;
-       int node;
+       int node, best_distance;
 
        /*
         * Avoid any oddities like cycles that could occur
@@ -3176,18 +3180,33 @@ again:
         * multiple source nodes to share a destination.
         */
        nodes_or(used_targets, used_targets, this_pass);
-       for_each_node_mask(node, this_pass) {
-               int target_node = establish_migrate_target(node, &used_targets);
 
-               if (target_node == NUMA_NO_NODE)
-                       continue;
+       for_each_node_mask(node, this_pass) {
+               best_distance = -1;
 
                /*
-                * Visit targets from this pass in the next pass.
-                * Eventually, every node will have been part of
-                * a pass, and will become set in 'used_targets'.
+                * Try to set up the migration path for the node. There can
+                * be multiple target nodes, so loop until every target that
+                * shares the best node distance has been found.
                 */
-               node_set(target_node, next_pass);
+               do {
+                       int target_node =
+                               establish_migrate_target(node, &used_targets,
+                                                        best_distance);
+
+                       if (target_node == NUMA_NO_NODE)
+                               break;
+
+                       if (best_distance == -1)
+                               best_distance = node_distance(node, target_node);
+
+                       /*
+                        * Visit targets from this pass in the next pass.
+                        * Eventually, every node will have been part of
+                        * a pass, and will become set in 'used_targets'.
+                        */
+                       node_set(target_node, next_pass);
+               } while (1);
        }
        /*
         * 'next_pass' contains nodes which became migration
@@ -3288,6 +3307,11 @@ static int __init migrate_on_reclaim_init(void)
 {
        int ret;
 
+       node_demotion = kmalloc_array(nr_node_ids,
+                                     sizeof(struct demotion_nodes),
+                                     GFP_KERNEL);
+       WARN_ON(!node_demotion);
+
        ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
                                        NULL, migration_offline_cpu);
        /*
@@ -3306,3 +3330,64 @@ static int __init migrate_on_reclaim_init(void)
 }
 late_initcall(migrate_on_reclaim_init);
 #endif /* CONFIG_HOTPLUG_CPU */
+
+bool numa_demotion_enabled = false;
+
+#ifdef CONFIG_SYSFS
+static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
+                                         struct kobj_attribute *attr, char *buf)
+{
+       return sysfs_emit(buf, "%s\n",
+                         numa_demotion_enabled ? "true" : "false");
+}
+
+static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
+                                          struct kobj_attribute *attr,
+                                          const char *buf, size_t count)
+{
+       if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
+               numa_demotion_enabled = true;
+       else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
+               numa_demotion_enabled = false;
+       else
+               return -EINVAL;
+
+       return count;
+}
+
+static struct kobj_attribute numa_demotion_enabled_attr =
+       __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
+              numa_demotion_enabled_store);
+
+static struct attribute *numa_attrs[] = {
+       &numa_demotion_enabled_attr.attr,
+       NULL,
+};
+
+static const struct attribute_group numa_attr_group = {
+       .attrs = numa_attrs,
+};
+
+static int __init numa_init_sysfs(void)
+{
+       int err;
+       struct kobject *numa_kobj;
+
+       numa_kobj = kobject_create_and_add("numa", mm_kobj);
+       if (!numa_kobj) {
+               pr_err("failed to create numa kobject\n");
+               return -ENOMEM;
+       }
+       err = sysfs_create_group(numa_kobj, &numa_attr_group);
+       if (err) {
+               pr_err("failed to register numa group\n");
+               goto delete_obj;
+       }
+       return 0;
+
+delete_obj:
+       kobject_put(numa_kobj);
+       return err;
+}
+subsys_initcall(numa_init_sysfs);
+#endif
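
The new knob lands at /sys/kernel/mm/numa/demotion_enabled (the "numa" kobject is created under mm_kobj) and accepts "true"/"false" or 1/0. A kernel-side consumer would typically gate demotion on both the knob and the presence of a target node, roughly in the spirit of the reclaim path; the sketch below is simplified and not the actual mm/vmscan.c code:

	#include <linux/migrate.h>
	#include <linux/numa.h>

	extern bool numa_demotion_enabled;	/* defined above in mm/migrate.c */

	/* Simplified sketch of a reclaim-side gate. */
	static bool example_can_demote(int nid)
	{
		if (!numa_demotion_enabled)
			return false;
		return next_demotion_node(nid) != NUMA_NO_NODE;
	}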