mm/migrate: support un-addressable ZONE_DEVICE page in migration
author Jérôme Glisse <jglisse@redhat.com>
Fri, 8 Sep 2017 23:12:17 +0000 (16:12 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 9 Sep 2017 01:26:46 +0000 (18:26 -0700)
Allow unmapping and restoring the special swap entries of un-addressable
ZONE_DEVICE memory.

Link: http://lkml.kernel.org/r/20170817000548.32038-17-jglisse@redhat.com
Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Nellans <dnellans@nvidia.com>
Cc: Evgeny Baskakov <ebaskakov@nvidia.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Mark Hairgrove <mhairgrove@nvidia.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Sherry Cheung <SCheung@nvidia.com>
Cc: Subhash Gutti <sgutti@nvidia.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Bob Liu <liubo95@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
include/linux/migrate.h
mm/migrate.c
mm/page_vma_mapped.c
mm/rmap.c

index 8f73cebfc3f528c5e08605a7be8b5efd12a613a1..8dc8f0a3f1af31c842f09ae61dd125b22f1f9ab5 100644 (file)
@@ -159,12 +159,18 @@ static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 
 #ifdef CONFIG_MIGRATION
 
+/*
+ * Watch out for PAE architecture, which has an unsigned long, and might not
+ * have enough bits to store all physical address and flags. So far we have
+ * enough room for all our flags.
+ */
 #define MIGRATE_PFN_VALID      (1UL << 0)
 #define MIGRATE_PFN_MIGRATE    (1UL << 1)
 #define MIGRATE_PFN_LOCKED     (1UL << 2)
 #define MIGRATE_PFN_WRITE      (1UL << 3)
-#define MIGRATE_PFN_ERROR      (1UL << 4)
-#define MIGRATE_PFN_SHIFT      5
+#define MIGRATE_PFN_DEVICE     (1UL << 4)
+#define MIGRATE_PFN_ERROR      (1UL << 5)
+#define MIGRATE_PFN_SHIFT      6
 
 static inline struct page *migrate_pfn_to_page(unsigned long mpfn)
 {
index 652b2c642eed18d765b10c4a46f698133d979764..77cb2fef08ea667f38f4a688d2a66cf373438308 100644 (file)
@@ -36,6 +36,7 @@
 #include <linux/hugetlb.h>
 #include <linux/hugetlb_cgroup.h>
 #include <linux/gfp.h>
+#include <linux/memremap.h>
 #include <linux/balloon_compaction.h>
 #include <linux/mmu_notifier.h>
 #include <linux/page_idle.h>
@@ -237,7 +238,13 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
                if (is_write_migration_entry(entry))
                        pte = maybe_mkwrite(pte, vma);
 
-               flush_dcache_page(new);
+               if (unlikely(is_zone_device_page(new)) &&
+                   is_device_private_page(new)) {
+                       entry = make_device_private_entry(new, pte_write(pte));
+                       pte = swp_entry_to_pte(entry);
+               } else
+                       flush_dcache_page(new);
+
 #ifdef CONFIG_HUGETLB_PAGE
                if (PageHuge(new)) {
                        pte = pte_mkhuge(pte);
@@ -2205,17 +2212,40 @@ again:
                pte = *ptep;
                pfn = pte_pfn(pte);
 
-               if (!pte_present(pte)) {
+               if (pte_none(pte)) {
                        mpfn = pfn = 0;
                        goto next;
                }
 
+               if (!pte_present(pte)) {
+                       mpfn = pfn = 0;
+
+                       /*
+                        * Only care about unaddressable device page special
+                        * page table entry. Other special swap entries are not
+                        * migratable, and we ignore regular swapped page.
+                        */
+                       entry = pte_to_swp_entry(pte);
+                       if (!is_device_private_entry(entry))
+                               goto next;
+
+                       page = device_private_entry_to_page(entry);
+                       mpfn = migrate_pfn(page_to_pfn(page))|
+                               MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE;
+                       if (is_write_device_private_entry(entry))
+                               mpfn |= MIGRATE_PFN_WRITE;
+               } else {
+                       page = vm_normal_page(migrate->vma, addr, pte);
+                       mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+                       mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
+               }
+
                /* FIXME support THP */
-               page = vm_normal_page(migrate->vma, addr, pte);
                if (!page || !page->mapping || PageTransCompound(page)) {
                        mpfn = pfn = 0;
                        goto next;
                }
+               pfn = page_to_pfn(page);
 
                /*
                 * By getting a reference on the page we pin it and that blocks
@@ -2228,8 +2258,6 @@ again:
                 */
                get_page(page);
                migrate->cpages++;
-               mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
-               mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
 
                /*
                 * Optimize for the common case where page is only mapped once
@@ -2256,10 +2284,13 @@ again:
                         */
                        page_remove_rmap(page, false);
                        put_page(page);
-                       unmapped++;
+
+                       if (pte_present(pte))
+                               unmapped++;
                }
 
 next:
+               migrate->dst[migrate->npages] = 0;
                migrate->src[migrate->npages++] = mpfn;
        }
        arch_leave_lazy_mmu_mode();
@@ -2329,6 +2360,28 @@ static bool migrate_vma_check_page(struct page *page)
        if (PageCompound(page))
                return false;
 
+       /* Page from ZONE_DEVICE have one extra reference */
+       if (is_zone_device_page(page)) {
+               /*
+                * Private page can never be pin as they have no valid pte and
+                * GUP will fail for those. Yet if there is a pending migration
+                * a thread might try to wait on the pte migration entry and
+                * will bump the page reference count. Sadly there is no way to
+                * differentiate a regular pin from migration wait. Hence to
+                * avoid 2 racing thread trying to migrate back to CPU to enter
+                * infinite loop (one stoping migration because the other is
+                * waiting on pte migration entry). We always return true here.
+                *
+                * FIXME proper solution is to rework migration_entry_wait() so
+                * it does not need to take a reference on page.
+                */
+               if (is_device_private_page(page))
+                       return true;
+
+               /* Other ZONE_DEVICE memory type are not supported */
+               return false;
+       }
+
        if ((page_count(page) - extra) > page_mapcount(page))
                return false;
 
@@ -2379,24 +2432,30 @@ static void migrate_vma_prepare(struct migrate_vma *migrate)
                        migrate->src[i] |= MIGRATE_PFN_LOCKED;
                }
 
-               if (!PageLRU(page) && allow_drain) {
-                       /* Drain CPU's pagevec */
-                       lru_add_drain_all();
-                       allow_drain = false;
-               }
+               /* ZONE_DEVICE pages are not on LRU */
+               if (!is_zone_device_page(page)) {
+                       if (!PageLRU(page) && allow_drain) {
+                               /* Drain CPU's pagevec */
+                               lru_add_drain_all();
+                               allow_drain = false;
+                       }
 
-               if (isolate_lru_page(page)) {
-                       if (remap) {
-                               migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
-                               migrate->cpages--;
-                               restore++;
-                       } else {
-                               migrate->src[i] = 0;
-                               unlock_page(page);
-                               migrate->cpages--;
-                               put_page(page);
+                       if (isolate_lru_page(page)) {
+                               if (remap) {
+                                       migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+                                       migrate->cpages--;
+                                       restore++;
+                               } else {
+                                       migrate->src[i] = 0;
+                                       unlock_page(page);
+                                       migrate->cpages--;
+                                       put_page(page);
+                               }
+                               continue;
                        }
-                       continue;
+
+                       /* Drop the reference we took in collect */
+                       put_page(page);
                }
 
                if (!migrate_vma_check_page(page)) {
@@ -2405,14 +2464,19 @@ static void migrate_vma_prepare(struct migrate_vma *migrate)
                                migrate->cpages--;
                                restore++;
 
-                               get_page(page);
-                               putback_lru_page(page);
+                               if (!is_zone_device_page(page)) {
+                                       get_page(page);
+                                       putback_lru_page(page);
+                               }
                        } else {
                                migrate->src[i] = 0;
                                unlock_page(page);
                                migrate->cpages--;
 
-                               putback_lru_page(page);
+                               if (!is_zone_device_page(page))
+                                       putback_lru_page(page);
+                               else
+                                       put_page(page);
                        }
                }
        }
@@ -2483,7 +2547,10 @@ restore:
                unlock_page(page);
                restore--;
 
-               putback_lru_page(page);
+               if (is_zone_device_page(page))
+                       put_page(page);
+               else
+                       putback_lru_page(page);
        }
 }
 
@@ -2514,6 +2581,26 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
 
                mapping = page_mapping(page);
 
+               if (is_zone_device_page(newpage)) {
+                       if (is_device_private_page(newpage)) {
+                               /*
+                                * For now only support private anonymous when
+                                * migrating to un-addressable device memory.
+                                */
+                               if (mapping) {
+                                       migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+                                       continue;
+                               }
+                       } else {
+                               /*
+                                * Other types of ZONE_DEVICE page are not
+                                * supported.
+                                */
+                               migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+                               continue;
+                       }
+               }
+
                r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
                if (r != MIGRATEPAGE_SUCCESS)
                        migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
@@ -2554,11 +2641,17 @@ static void migrate_vma_finalize(struct migrate_vma *migrate)
                unlock_page(page);
                migrate->cpages--;
 
-               putback_lru_page(page);
+               if (is_zone_device_page(page))
+                       put_page(page);
+               else
+                       putback_lru_page(page);
 
                if (newpage != page) {
                        unlock_page(newpage);
-                       putback_lru_page(newpage);
+                       if (is_zone_device_page(newpage))
+                               put_page(newpage);
+                       else
+                               putback_lru_page(newpage);
                }
        }
 }
index 3bd3008db4cb39017feef01061d3f92149bd9c0e..6a03946469a99eb535851194f519893a5e8a2d11 100644 (file)
@@ -48,6 +48,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
                if (!is_swap_pte(*pvmw->pte))
                        return false;
                entry = pte_to_swp_entry(*pvmw->pte);
+
                if (!is_migration_entry(entry))
                        return false;
                if (migration_entry_to_page(entry) - pvmw->page >=
@@ -60,6 +61,15 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
                WARN_ON_ONCE(1);
 #endif
        } else {
+               if (is_swap_pte(*pvmw->pte)) {
+                       swp_entry_t entry;
+
+                       entry = pte_to_swp_entry(*pvmw->pte);
+                       if (is_device_private_entry(entry) &&
+                           device_private_entry_to_page(entry) == pvmw->page)
+                               return true;
+               }
+
                if (!pte_present(*pvmw->pte))
                        return false;
 
index 7dc9c02f710693232bd1739667381e1c062b2c13..0618cd85b8629fae825ca1209802301497a24df3 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -63,6 +63,7 @@
 #include <linux/hugetlb.h>
 #include <linux/backing-dev.h>
 #include <linux/page_idle.h>
+#include <linux/memremap.h>
 
 #include <asm/tlbflush.h>
 
@@ -1346,6 +1347,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
        if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
                return true;
 
+       if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
+           is_zone_device_page(page) && !is_device_private_page(page))
+               return true;
+
        if (flags & TTU_SPLIT_HUGE_PMD) {
                split_huge_pmd_address(vma, address,
                                flags & TTU_SPLIT_FREEZE, page);
@@ -1403,6 +1408,27 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                address = pvmw.address;
 
 
+               if (IS_ENABLED(CONFIG_MIGRATION) &&
+                   (flags & TTU_MIGRATION) &&
+                   is_zone_device_page(page)) {
+                       swp_entry_t entry;
+                       pte_t swp_pte;
+
+                       pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte);
+
+                       /*
+                        * Store the pfn of the page in a special migration
+                        * pte. do_swap_page() will wait until the migration
+                        * pte is removed and then restart fault handling.
+                        */
+                       entry = make_migration_entry(page, 0);
+                       swp_pte = swp_entry_to_pte(entry);
+                       if (pte_soft_dirty(pteval))
+                               swp_pte = pte_swp_mksoft_dirty(swp_pte);
+                       set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
+                       goto discard;
+               }
+
                if (!(flags & TTU_IGNORE_ACCESS)) {
                        if (ptep_clear_flush_young_notify(vma, address,
                                                pvmw.pte)) {