diff --git a/mm/hmm.c b/mm/hmm.c
index c968e49f7a0c527258a85b8c0259467e3b2924de..90c34f3d1243aa5ec18970ec7aac095d8f657d32 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -11,7 +11,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * Authors: JÃ\83©rÃ\83´me Glisse <jglisse@redhat.com>
+ * Authors: Jérôme Glisse <jglisse@redhat.com>
  */
 /*
  * Refer to include/linux/hmm.h for information about heterogeneous memory
@@ -43,7 +43,6 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
  *
  * @mm: mm struct this HMM struct is bound to
  * @lock: lock protecting ranges list
- * @sequence: we track updates to the CPU page table with a sequence number
  * @ranges: list of range being snapshotted
  * @mirrors: list of mirrors for this mm
  * @mmu_notifier: mmu notifier to track updates to CPU page table
@@ -52,7 +51,6 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
 struct hmm {
        struct mm_struct        *mm;
        spinlock_t              lock;
-       atomic_t                sequence;
        struct list_head        ranges;
        struct list_head        mirrors;
        struct mmu_notifier     mmu_notifier;
@@ -85,22 +83,11 @@ static struct hmm *hmm_register(struct mm_struct *mm)
                return NULL;
        INIT_LIST_HEAD(&hmm->mirrors);
        init_rwsem(&hmm->mirrors_sem);
-       atomic_set(&hmm->sequence, 0);
        hmm->mmu_notifier.ops = NULL;
        INIT_LIST_HEAD(&hmm->ranges);
        spin_lock_init(&hmm->lock);
        hmm->mm = mm;
 
-       /*
-        * We should only get here if hold the mmap_sem in write mode ie on
-        * registration of first mirror through hmm_mirror_register()
-        */
-       hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
-       if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
-               kfree(hmm);
-               return NULL;
-       }
-
        spin_lock(&mm->page_table_lock);
        if (!mm->hmm)
                mm->hmm = hmm;
@@ -108,12 +95,27 @@ static struct hmm *hmm_register(struct mm_struct *mm)
                cleanup = true;
        spin_unlock(&mm->page_table_lock);
 
-       if (cleanup) {
-               mmu_notifier_unregister(&hmm->mmu_notifier, mm);
-               kfree(hmm);
-       }
+       if (cleanup)
+               goto error;
+
+       /*
+        * We should only get here if we hold the mmap_sem in write mode,
+        * i.e., on registration of the first mirror through
+        * hmm_mirror_register().
+        */
+       hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
+       if (__mmu_notifier_register(&hmm->mmu_notifier, mm))
+               goto error_mm;
 
        return mm->hmm;
+
+error_mm:
+       spin_lock(&mm->page_table_lock);
+       if (mm->hmm == hmm)
+               mm->hmm = NULL;
+       spin_unlock(&mm->page_table_lock);
+error:
+       kfree(hmm);
+       return NULL;
 }
 
 void hmm_mm_destroy(struct mm_struct *mm)
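
Note on the reordering in hmm_register(): a mmu_notifier callback can fire as soon as __mmu_notifier_register() returns, and the handlers below dereference mm->hmm (see the VM_BUG_ON(!hmm) checks), so mm->hmm has to be published before the notifier is registered. A sketch of the interleaving the old order permitted (my reading of the code above, not a quote from any commit message):

    /*
     * Old order (register notifier, then publish mm->hmm) allowed:
     *
     *   hmm_register()                     concurrent invalidation
     *   __mmu_notifier_register(...)
     *                                      hmm_invalidate_range_start()
     *                                      hmm = mm->hmm;   <-- still NULL
     *                                      VM_BUG_ON(!hmm)  <-- fires
     *   mm->hmm = hmm;
     */

The new order sets mm->hmm first and unwinds it through the error_mm label if registration fails.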
@@ -121,10 +123,8 @@ void hmm_mm_destroy(struct mm_struct *mm)
        kfree(mm->hmm);
 }
 
-static void hmm_invalidate_range(struct hmm *hmm,
-                                enum hmm_update_type action,
-                                unsigned long start,
-                                unsigned long end)
+static int hmm_invalidate_range(struct hmm *hmm, bool device,
+                               const struct hmm_update *update)
 {
        struct hmm_mirror *mirror;
        struct hmm_range *range;
@@ -133,22 +133,33 @@ static void hmm_invalidate_range(struct hmm *hmm,
        list_for_each_entry(range, &hmm->ranges, list) {
                unsigned long addr, idx, npages;
 
-               if (end < range->start || start >= range->end)
+               if (update->end < range->start || update->start >= range->end)
                        continue;
 
                range->valid = false;
-               addr = max(start, range->start);
+               addr = max(update->start, range->start);
                idx = (addr - range->start) >> PAGE_SHIFT;
-               npages = (min(range->end, end) - addr) >> PAGE_SHIFT;
+               npages = (min(range->end, update->end) - addr) >> PAGE_SHIFT;
                memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
        }
        spin_unlock(&hmm->lock);
 
+       if (!device)
+               return 0;
+
        down_read(&hmm->mirrors_sem);
-       list_for_each_entry(mirror, &hmm->mirrors, list)
-               mirror->ops->sync_cpu_device_pagetables(mirror, action,
-                                                       start, end);
+       list_for_each_entry(mirror, &hmm->mirrors, list) {
+               int ret;
+
+               ret = mirror->ops->sync_cpu_device_pagetables(mirror, update);
+               if (!update->blockable && ret == -EAGAIN) {
+                       up_read(&hmm->mirrors_sem);
+                       return -EAGAIN;
+               }
+       }
        up_read(&hmm->mirrors_sem);
+
+       return 0;
 }
 
 static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
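
For reference, the update descriptor consumed above (and filled in by the handlers below) would look roughly like this, inferred from the fields used in this diff; the authoritative definition belongs in include/linux/hmm.h:

    /* Sketch inferred from usage in this diff, not copied from the header. */
    enum hmm_update_event {
            HMM_UPDATE_INVALIDATE,
    };

    struct hmm_update {
            unsigned long           start;
            unsigned long           end;
            enum hmm_update_event   event;
            bool                    blockable;
    };

Packing the parameters into one structure lets new fields such as blockable reach every mirror without another round of callback-signature churn.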
@@ -178,18 +189,21 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 }
 
 static int hmm_invalidate_range_start(struct mmu_notifier *mn,
-                                      struct mm_struct *mm,
-                                      unsigned long start,
-                                      unsigned long end,
-                                      bool blockable)
+                                     struct mm_struct *mm,
+                                     unsigned long start,
+                                     unsigned long end,
+                                     bool blockable)
 {
+       struct hmm_update update;
        struct hmm *hmm = mm->hmm;
 
        VM_BUG_ON(!hmm);
 
-       atomic_inc(&hmm->sequence);
-
-       return 0;
+       update.start = start;
+       update.end = end;
+       update.event = HMM_UPDATE_INVALIDATE;
+       update.blockable = blockable;
+       return hmm_invalidate_range(hmm, true, &update);
 }
 
 static void hmm_invalidate_range_end(struct mmu_notifier *mn,
@@ -197,11 +211,16 @@ static void hmm_invalidate_range_end(struct mmu_notifier *mn,
                                     unsigned long start,
                                     unsigned long end)
 {
+       struct hmm_update update;
        struct hmm *hmm = mm->hmm;
 
        VM_BUG_ON(!hmm);
 
-       hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end);
+       update.start = start;
+       update.end = end;
+       update.event = HMM_UPDATE_INVALIDATE;
+       update.blockable = true;
+       hmm_invalidate_range(hmm, false, &update);
 }
 
 static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
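
With sync_cpu_device_pagetables() now returning int and receiving the update descriptor, a mirror driver is expected to honor update->blockable and report -EAGAIN when it cannot make progress without sleeping; per hmm_invalidate_range() above, that error is only acted on in the non-blockable case. A hypothetical driver-side implementation (all my_* names are illustrative, not part of HMM):

    struct my_dev {                          /* illustrative driver state */
            struct hmm_mirror       mirror;
            struct mutex            pt_lock; /* protects device page tables */
    };

    static int my_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
                                             const struct hmm_update *update)
    {
            struct my_dev *mdev = container_of(mirror, struct my_dev, mirror);

            if (update->blockable)
                    mutex_lock(&mdev->pt_lock);
            else if (!mutex_trylock(&mdev->pt_lock))
                    return -EAGAIN; /* caller cannot sleep; it will back off */

            /* Invalidate the device's mirrored range [start, end). */
            my_dev_invalidate(mdev, update->start, update->end);
            mutex_unlock(&mdev->pt_lock);
            return 0;
    }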
@@ -278,12 +297,13 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror)
        if (!should_unregister || mm == NULL)
                return;
 
+       mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
+
        spin_lock(&mm->page_table_lock);
        if (mm->hmm == hmm)
                mm->hmm = NULL;
        spin_unlock(&mm->page_table_lock);
 
-       mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
        kfree(hmm);
 }
 EXPORT_SYMBOL(hmm_mirror_unregister);
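
This hunk is the teardown half of the ordering fixed in hmm_register(): previously mm->hmm was set to NULL while the notifier was still registered, so an invalidation arriving in that window would load a NULL hmm in hmm_invalidate_range_start() and trip the VM_BUG_ON(!hmm) there. Unregistering the notifier first guarantees no callback can run against the cleared pointer, after which freeing the structure is safe.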
@@ -571,22 +591,42 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 {
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
+       struct vm_area_struct *vma = walk->vma;
        uint64_t *pfns = range->pfns;
        unsigned long addr = start, i;
        pte_t *ptep;
+       pmd_t pmd;
 
-       i = (addr - range->start) >> PAGE_SHIFT;
 
 again:
-       if (pmd_none(*pmdp))
+       pmd = READ_ONCE(*pmdp);
+       if (pmd_none(pmd))
                return hmm_vma_walk_hole(start, end, walk);
 
-       if (pmd_huge(*pmdp) && (range->vma->vm_flags & VM_HUGETLB))
+       if (pmd_huge(pmd) && (range->vma->vm_flags & VM_HUGETLB))
                return hmm_pfns_bad(start, end, walk);
 
-       if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) {
-               pmd_t pmd;
+       if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
+               bool fault, write_fault;
+               unsigned long npages;
+               uint64_t *pfns;
+
+               i = (addr - range->start) >> PAGE_SHIFT;
+               npages = (end - addr) >> PAGE_SHIFT;
+               pfns = &range->pfns[i];
 
+               hmm_range_need_fault(hmm_vma_walk, pfns, npages,
+                                    0, &fault, &write_fault);
+               if (fault || write_fault) {
+                       hmm_vma_walk->last = addr;
+                       pmd_migration_entry_wait(vma->vm_mm, pmdp);
+                       return -EAGAIN;
+               }
+               return 0;
+       } else if (!pmd_present(pmd))
+               return hmm_pfns_bad(start, end, walk);
+
+       if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
                /*
                 * No need to take pmd_lock here, even if some other threads
                 * is splitting the huge pmd we will get that event through
@@ -601,13 +641,21 @@ again:
                if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
                        goto again;
 
+               i = (addr - range->start) >> PAGE_SHIFT;
                return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
        }
 
-       if (pmd_bad(*pmdp))
+       /*
+        * We have handled all the valid cases above, i.e., none, migration,
+        * huge or transparent huge. At this point the pmd is either a valid
+        * entry pointing to a pte directory, or a bad pmd that will not
+        * recover.
+        */
+       if (pmd_bad(pmd))
                return hmm_pfns_bad(start, end, walk);
 
        ptep = pte_offset_map(pmdp, addr);
+       i = (addr - range->start) >> PAGE_SHIFT;
        for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
                int r;
 
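
The walk now takes a single READ_ONCE() snapshot of the pmd and makes every decision against that one value instead of re-reading *pmdp at each test. A minimal illustration of the difference (handle_huge() is a stand-in, not a real function):

    pmd_t pmd;

    /*
     * Racy: the two reads can observe different values if another
     * thread splits or clears the pmd between them.
     */
    if (pmd_trans_huge(*pmdp))
            handle_huge(*pmdp);

    /* Safe: test and use the same snapshot. */
    pmd = READ_ONCE(*pmdp);
    if (pmd_trans_huge(pmd))
            handle_huge(pmd);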
@@ -1024,7 +1072,6 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
        resource_size_t key, align_start, align_size, align_end;
        struct device *device = devmem->device;
        int ret, nid, is_ram;
-       unsigned long pfn;
 
        align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
        align_size = ALIGN(devmem->resource->start +
@@ -1109,11 +1156,14 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
                                align_size >> PAGE_SHIFT, NULL);
        mem_hotplug_done();
 
-       for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) {
-               struct page *page = pfn_to_page(pfn);
+       /*
+        * Initialization of the pages has been deferred until now in order
+        * to allow us to do the work while not holding the hotplug lock.
+        */
+       memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
+                               align_start >> PAGE_SHIFT,
+                               align_size >> PAGE_SHIFT, &devmem->pagemap);
 
-               page->pgmap = &devmem->pagemap;
-       }
        return 0;
 
 error_add_memory:
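
For context, the memmap_init_zone_device() call that replaces the open-coded loop does the equivalent of the following for every pfn in the hot-plugged range, just without holding the hotplug lock (a simplified sketch of the mm/ behavior, not the actual implementation):

    unsigned long pfn;

    for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
            struct page *page = pfn_to_page(pfn);

            __init_single_page(page, pfn, zone_idx(zone), nid);
            page->pgmap = pgmap;    /* what the removed loop set by hand */
    }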