diff --git a/mm/hmm.c b/mm/hmm.c
index 16b6731a34db79b46516ac74da8ec5c59a03e99c..902f5fa6bf93ade84f63db956dff993ea090b717 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -8,7 +8,7 @@
  * Refer to include/linux/hmm.h for information about heterogeneous memory
  * management or HMM for short.
  */
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
 #include <linux/hmm.h>
 #include <linux/init.h>
 #include <linux/rmap.h>
 #include <linux/mmu_notifier.h>
 #include <linux/memory_hotplug.h>
 
-static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
-
-/**
- * hmm_get_or_create - register HMM against an mm (HMM internal)
- *
- * @mm: mm struct to attach to
- * Returns: returns an HMM object, either by referencing the existing
- *          (per-process) object, or by creating a new one.
- *
- * This is not intended to be used directly by device drivers. If mm already
- * has an HMM struct then it get a reference on it and returns it. Otherwise
- * it allocates an HMM struct, initializes it, associate it with the mm and
- * returns it.
- */
-static struct hmm *hmm_get_or_create(struct mm_struct *mm)
+static struct mmu_notifier *hmm_alloc_notifier(struct mm_struct *mm)
 {
        struct hmm *hmm;
 
-       lockdep_assert_held_write(&mm->mmap_sem);
-
-       /* Abuse the page_table_lock to also protect mm->hmm. */
-       spin_lock(&mm->page_table_lock);
-       hmm = mm->hmm;
-       if (mm->hmm && kref_get_unless_zero(&mm->hmm->kref))
-               goto out_unlock;
-       spin_unlock(&mm->page_table_lock);
-
-       hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
+       hmm = kzalloc(sizeof(*hmm), GFP_KERNEL);
        if (!hmm)
-               return NULL;
+               return ERR_PTR(-ENOMEM);
+
        init_waitqueue_head(&hmm->wq);
        INIT_LIST_HEAD(&hmm->mirrors);
        init_rwsem(&hmm->mirrors_sem);
-       hmm->mmu_notifier.ops = NULL;
        INIT_LIST_HEAD(&hmm->ranges);
        spin_lock_init(&hmm->ranges_lock);
-       kref_init(&hmm->kref);
        hmm->notifiers = 0;
-       hmm->mm = mm;
-
-       hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
-       if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
-               kfree(hmm);
-               return NULL;
-       }
-
-       mmgrab(hmm->mm);
-
-       /*
-        * We hold the exclusive mmap_sem here so we know that mm->hmm is
-        * still NULL or 0 kref, and is safe to update.
-        */
-       spin_lock(&mm->page_table_lock);
-       mm->hmm = hmm;
-
-out_unlock:
-       spin_unlock(&mm->page_table_lock);
-       return hmm;
+       return &hmm->mmu_notifier;
 }
 
-static void hmm_free_rcu(struct rcu_head *rcu)
+static void hmm_free_notifier(struct mmu_notifier *mn)
 {
-       struct hmm *hmm = container_of(rcu, struct hmm, rcu);
+       struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
 
-       mmdrop(hmm->mm);
+       WARN_ON(!list_empty(&hmm->ranges));
+       WARN_ON(!list_empty(&hmm->mirrors));
        kfree(hmm);
 }
 
-static void hmm_free(struct kref *kref)
-{
-       struct hmm *hmm = container_of(kref, struct hmm, kref);
-
-       spin_lock(&hmm->mm->page_table_lock);
-       if (hmm->mm->hmm == hmm)
-               hmm->mm->hmm = NULL;
-       spin_unlock(&hmm->mm->page_table_lock);
-
-       mmu_notifier_unregister_no_release(&hmm->mmu_notifier, hmm->mm);
-       mmu_notifier_call_srcu(&hmm->rcu, hmm_free_rcu);
-}
-
-static inline void hmm_put(struct hmm *hmm)
-{
-       kref_put(&hmm->kref, hmm_free);
-}
-
 static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 {
        struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
        struct hmm_mirror *mirror;
 
-       /* Bail out if hmm is in the process of being freed */
-       if (!kref_get_unless_zero(&hmm->kref))
-               return;
-
        /*
         * Since hmm_range_register() holds the mmget() lock hmm_release() is
         * prevented as long as a range exists.
@@ -137,8 +73,6 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
                        mirror->ops->release(mirror);
        }
        up_read(&hmm->mirrors_sem);
-
-       hmm_put(hmm);
 }
 
 static void notifiers_decrement(struct hmm *hmm)
@@ -165,23 +99,14 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn,
 {
        struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
        struct hmm_mirror *mirror;
-       struct hmm_update update;
        struct hmm_range *range;
        unsigned long flags;
        int ret = 0;
 
-       if (!kref_get_unless_zero(&hmm->kref))
-               return 0;
-
-       update.start = nrange->start;
-       update.end = nrange->end;
-       update.event = HMM_UPDATE_INVALIDATE;
-       update.blockable = mmu_notifier_range_blockable(nrange);
-
        spin_lock_irqsave(&hmm->ranges_lock, flags);
        hmm->notifiers++;
        list_for_each_entry(range, &hmm->ranges, list) {
-               if (update.end < range->start || update.start >= range->end)
+               if (nrange->end < range->start || nrange->start >= range->end)
                        continue;
 
                range->valid = false;
@@ -198,9 +123,10 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn,
        list_for_each_entry(mirror, &hmm->mirrors, list) {
                int rc;
 
-               rc = mirror->ops->sync_cpu_device_pagetables(mirror, &update);
+               rc = mirror->ops->sync_cpu_device_pagetables(mirror, nrange);
                if (rc) {
-                       if (WARN_ON(update.blockable || rc != -EAGAIN))
+                       if (WARN_ON(mmu_notifier_range_blockable(nrange) ||
+                           rc != -EAGAIN))
                                continue;
                        ret = -EAGAIN;
                        break;
@@ -211,7 +137,6 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn,
 out:
        if (ret)
                notifiers_decrement(hmm);
-       hmm_put(hmm);
        return ret;
 }
 
@@ -220,17 +145,15 @@ static void hmm_invalidate_range_end(struct mmu_notifier *mn,
 {
        struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
 
-       if (!kref_get_unless_zero(&hmm->kref))
-               return;
-
        notifiers_decrement(hmm);
-       hmm_put(hmm);
 }
 
 static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
        .release                = hmm_release,
        .invalidate_range_start = hmm_invalidate_range_start,
        .invalidate_range_end   = hmm_invalidate_range_end,
+       .alloc_notifier         = hmm_alloc_notifier,
+       .free_notifier          = hmm_free_notifier,
 };
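
The new .alloc_notifier/.free_notifier hooks are what let mmu_notifier_get_locked() (used by hmm_mirror_register() below) create the per-mm struct hmm on demand, and mmu_notifier_put() release it again, with free_notifier deferred past an SRCU grace period. A rough sketch of the same pattern for a hypothetical driver-private per-mm structure follows; every my_* name is made up for illustration and is not part of this patch.

#include <linux/err.h>
#include <linux/mmu_notifier.h>
#include <linux/slab.h>

/* Hypothetical per-mm driver context, managed the same way as struct hmm. */
struct my_mm_ctx {
        struct mmu_notifier notifier;   /* embedded so container_of() works */
        /* ... driver state keyed by this mm ... */
};

static struct mmu_notifier *my_alloc_notifier(struct mm_struct *mm)
{
        struct my_mm_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

        if (!ctx)
                return ERR_PTR(-ENOMEM);
        return &ctx->notifier;
}

static void my_free_notifier(struct mmu_notifier *mn)
{
        /* Runs via SRCU once the last mmu_notifier_put() reference is gone. */
        kfree(container_of(mn, struct my_mm_ctx, notifier));
}

static const struct mmu_notifier_ops my_notifier_ops = {
        .alloc_notifier = my_alloc_notifier,
        .free_notifier  = my_free_notifier,
};

/* Caller holds mm->mmap_sem for write, as mmu_notifier_get_locked() requires. */
static struct my_mm_ctx *my_ctx_get(struct mm_struct *mm)
{
        struct mmu_notifier *mn = mmu_notifier_get_locked(&my_notifier_ops, mm);

        if (IS_ERR(mn))
                return ERR_CAST(mn);
        return container_of(mn, struct my_mm_ctx, notifier);
}

A second my_ctx_get() on the same mm simply takes another reference on the already registered notifier, which is what makes the hand-rolled hmm_get_or_create()/hmm_put() machinery above redundant.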
 
 /*
@@ -242,18 +165,27 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
  *
  * To start mirroring a process address space, the device driver must register
  * an HMM mirror struct.
+ *
+ * The caller cannot unregister the hmm_mirror while any ranges are
+ * registered.
+ *
+ * Callers using this function must put a call to mmu_notifier_synchronize()
+ * in their module exit functions.
  */
 int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
 {
+       struct mmu_notifier *mn;
+
        lockdep_assert_held_write(&mm->mmap_sem);
 
        /* Sanity check */
        if (!mm || !mirror || !mirror->ops)
                return -EINVAL;
 
-       mirror->hmm = hmm_get_or_create(mm);
-       if (!mirror->hmm)
-               return -ENOMEM;
+       mn = mmu_notifier_get_locked(&hmm_mmu_notifier_ops, mm);
+       if (IS_ERR(mn))
+               return PTR_ERR(mn);
+       mirror->hmm = container_of(mn, struct hmm, mmu_notifier);
 
        down_write(&mirror->hmm->mirrors_sem);
        list_add(&mirror->list, &mirror->hmm->mirrors);
@@ -277,7 +209,7 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror)
        down_write(&hmm->mirrors_sem);
        list_del(&mirror->list);
        up_write(&hmm->mirrors_sem);
-       hmm_put(hmm);
+       mmu_notifier_put(&hmm->mmu_notifier);
 }
 EXPORT_SYMBOL(hmm_mirror_unregister);
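
Because a mirror is now backed by a get/put style mmu_notifier whose callbacks live in the driver module, the hmm_mirror_register() comment above adds a hard rule: the module exit path must call mmu_notifier_synchronize() before the callbacks can disappear. A sketch of what a caller might look like, assuming only the APIs visible in this patch; all my_* names are hypothetical.

#include <linux/hmm.h>
#include <linux/mmu_notifier.h>
#include <linux/module.h>

/* Hypothetical mirror callbacks; a real driver would shoot down its device
 * page tables here.
 */
static int my_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
                const struct mmu_notifier_range *range)
{
        /* invalidate device mappings covering [range->start, range->end) */
        return 0;
}

static void my_release(struct hmm_mirror *mirror)
{
        /* the mm is going away; drop everything mirrored from it */
}

static const struct hmm_mirror_ops my_mirror_ops = {
        .sync_cpu_device_pagetables = my_sync_cpu_device_pagetables,
        .release                    = my_release,
};

static int my_mirror_attach(struct hmm_mirror *mirror, struct mm_struct *mm)
{
        int ret;

        mirror->ops = &my_mirror_ops;
        down_write(&mm->mmap_sem);      /* hmm_mirror_register() asserts this */
        ret = hmm_mirror_register(mirror, mm);
        up_write(&mm->mmap_sem);
        return ret;
}

static void __exit my_driver_exit(void)
{
        /*
         * hmm_mirror_unregister() only puts the notifier; wait for the
         * deferred free so no callback can run after the module is gone.
         */
        mmu_notifier_synchronize();
}
module_exit(my_driver_exit);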
 
@@ -285,8 +217,7 @@ struct hmm_vma_walk {
        struct hmm_range        *range;
        struct dev_pagemap      *pgmap;
        unsigned long           last;
-       bool                    fault;
-       bool                    block;
+       unsigned int            flags;
 };
 
 static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
@@ -298,17 +229,27 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
        struct vm_area_struct *vma = walk->vma;
        vm_fault_t ret;
 
-       flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
-       flags |= write_fault ? FAULT_FLAG_WRITE : 0;
+       if (!vma)
+               goto err;
+
+       if (hmm_vma_walk->flags & HMM_FAULT_ALLOW_RETRY)
+               flags |= FAULT_FLAG_ALLOW_RETRY;
+       if (write_fault)
+               flags |= FAULT_FLAG_WRITE;
+
        ret = handle_mm_fault(vma, addr, flags);
-       if (ret & VM_FAULT_RETRY)
+       if (ret & VM_FAULT_RETRY) {
+               /* Note, handle_mm_fault did up_read(&mm->mmap_sem) */
                return -EAGAIN;
-       if (ret & VM_FAULT_ERROR) {
-               *pfn = range->values[HMM_PFN_ERROR];
-               return -EFAULT;
        }
+       if (ret & VM_FAULT_ERROR)
+               goto err;
 
        return -EBUSY;
+
+err:
+       *pfn = range->values[HMM_PFN_ERROR];
+       return -EFAULT;
 }
 
 static int hmm_pfns_bad(unsigned long addr,
@@ -328,8 +269,8 @@ static int hmm_pfns_bad(unsigned long addr,
 }
 
 /*
- * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s)
- * @start: range virtual start address (inclusive)
+ * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s)
+ * @addr: range virtual start address (inclusive)
  * @end: range virtual end address (exclusive)
  * @fault: should we fault or not ?
  * @write_fault: write fault ?
@@ -346,13 +287,15 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        uint64_t *pfns = range->pfns;
-       unsigned long i, page_size;
+       unsigned long i;
 
        hmm_vma_walk->last = addr;
-       page_size = hmm_range_page_size(range);
-       i = (addr - range->start) >> range->page_shift;
+       i = (addr - range->start) >> PAGE_SHIFT;
+
+       if (write_fault && walk->vma && !(walk->vma->vm_flags & VM_WRITE))
+               return -EPERM;
 
-       for (; addr < end; addr += page_size, i++) {
+       for (; addr < end; addr += PAGE_SIZE, i++) {
                pfns[i] = range->values[HMM_PFN_NONE];
                if (fault || write_fault) {
                        int ret;
@@ -373,15 +316,15 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
 {
        struct hmm_range *range = hmm_vma_walk->range;
 
-       if (!hmm_vma_walk->fault)
+       if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT)
                return;
 
        /*
         * So we not only consider the individual per page request we also
         * consider the default flags requested for the range. The API can
-        * be use in 2 fashions. The first one where the HMM user coalesce
-        * multiple page fault into one request and set flags per pfns for
-        * of those faults. The second one where the HMM user want to pre-
+        * be used 2 ways. The first one where the HMM user coalesces
+        * multiple page faults into one request and sets flags per pfn for
+        * those faults. The second one where the HMM user wants to pre-
         * fault a range with specific flags. For the latter one it is a
         * waste to have the user pre-fill the pfn arrays with a default
         * flags value.
@@ -391,7 +334,7 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
        /* We aren't asked to do anything ... */
        if (!(pfns & range->flags[HMM_PFN_VALID]))
                return;
-       /* If this is device memory than only fault if explicitly requested */
+       /* If this is device memory then only fault if explicitly requested */
        if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
                /* Do we fault on device memory ? */
                if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
@@ -418,7 +361,7 @@ static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
 {
        unsigned long i;
 
-       if (!hmm_vma_walk->fault) {
+       if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) {
                *fault = *write_fault = false;
                return;
        }
@@ -458,22 +401,10 @@ static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
                                range->flags[HMM_PFN_VALID];
 }
 
-static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
-{
-       if (!pud_present(pud))
-               return 0;
-       return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
-                               range->flags[HMM_PFN_WRITE] :
-                               range->flags[HMM_PFN_VALID];
-}
-
-static int hmm_vma_handle_pmd(struct mm_walk *walk,
-                             unsigned long addr,
-                             unsigned long end,
-                             uint64_t *pfns,
-                             pmd_t pmd)
-{
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
+               unsigned long end, uint64_t *pfns, pmd_t pmd)
+{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        unsigned long pfn, npages, i;
@@ -488,7 +419,7 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
        if (pmd_protnone(pmd) || fault || write_fault)
                return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
 
-       pfn = pmd_pfn(pmd) + pte_index(addr);
+       pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
        for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
                if (pmd_devmap(pmd)) {
                        hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
@@ -504,11 +435,12 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
        }
        hmm_vma_walk->last = end;
        return 0;
-#else
-       /* If THP is not enabled then we should never reach that code ! */
-       return -EINVAL;
-#endif
 }
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+/* stub to allow the code below to compile */
+int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
+               unsigned long end, uint64_t *pfns, pmd_t pmd);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
 {
@@ -525,7 +457,6 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 {
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
-       struct vm_area_struct *vma = walk->vma;
        bool fault, write_fault;
        uint64_t cpu_flags;
        pte_t pte = *ptep;
@@ -546,6 +477,9 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
                swp_entry_t entry = pte_to_swp_entry(pte);
 
                if (!non_swap_entry(entry)) {
+                       cpu_flags = pte_to_hmm_pfn_flags(range, pte);
+                       hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
+                                          &fault, &write_fault);
                        if (fault || write_fault)
                                goto fault;
                        return 0;
@@ -574,8 +508,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
                        if (fault || write_fault) {
                                pte_unmap(ptep);
                                hmm_vma_walk->last = addr;
-                               migration_entry_wait(vma->vm_mm,
-                                                    pmdp, addr);
+                               migration_entry_wait(walk->mm, pmdp, addr);
                                return -EBUSY;
                        }
                        return 0;
@@ -623,21 +556,16 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 {
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
-       struct vm_area_struct *vma = walk->vma;
        uint64_t *pfns = range->pfns;
        unsigned long addr = start, i;
        pte_t *ptep;
        pmd_t pmd;
 
-
 again:
        pmd = READ_ONCE(*pmdp);
        if (pmd_none(pmd))
                return hmm_vma_walk_hole(start, end, walk);
 
-       if (pmd_huge(pmd) && (range->vma->vm_flags & VM_HUGETLB))
-               return hmm_pfns_bad(start, end, walk);
-
        if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
                bool fault, write_fault;
                unsigned long npages;
@@ -651,7 +579,7 @@ again:
                                     0, &fault, &write_fault);
                if (fault || write_fault) {
                        hmm_vma_walk->last = addr;
-                       pmd_migration_entry_wait(vma->vm_mm, pmdp);
+                       pmd_migration_entry_wait(walk->mm, pmdp);
                        return -EBUSY;
                }
                return 0;
@@ -660,11 +588,11 @@ again:
 
        if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
                /*
-                * No need to take pmd_lock here, even if some other threads
+                * No need to take pmd_lock here, even if some other thread
                 * is splitting the huge pmd we will get that event through
                 * mmu_notifier callback.
                 *
-                * So just read pmd value and check again its a transparent
+                * So just read pmd value and check again it's a transparent
                 * huge or device mapping one and compute corresponding pfn
                 * values.
                 */
@@ -678,7 +606,7 @@ again:
        }
 
        /*
-        * We have handled all the valid case above ie either none, migration,
+        * We have handled all the valid cases above ie either none, migration,
         * huge or transparent huge. At this point either it is a valid pmd
         * entry pointing to pte directory or it is a bad pmd that will not
         * recover.
@@ -714,10 +642,19 @@ again:
        return 0;
 }
 
-static int hmm_vma_walk_pud(pud_t *pudp,
-                           unsigned long start,
-                           unsigned long end,
-                           struct mm_walk *walk)
+#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
+    defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
+{
+       if (!pud_present(pud))
+               return 0;
+       return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
+                               range->flags[HMM_PFN_WRITE] :
+                               range->flags[HMM_PFN_VALID];
+}
+
+static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
+               struct mm_walk *walk)
 {
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
@@ -781,42 +718,29 @@ again:
 
        return 0;
 }
+#else
+#define hmm_vma_walk_pud       NULL
+#endif
 
+#ifdef CONFIG_HUGETLB_PAGE
 static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
                                      unsigned long start, unsigned long end,
                                      struct mm_walk *walk)
 {
-#ifdef CONFIG_HUGETLB_PAGE
-       unsigned long addr = start, i, pfn, mask, size, pfn_inc;
+       unsigned long addr = start, i, pfn;
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        struct vm_area_struct *vma = walk->vma;
-       struct hstate *h = hstate_vma(vma);
        uint64_t orig_pfn, cpu_flags;
        bool fault, write_fault;
        spinlock_t *ptl;
        pte_t entry;
        int ret = 0;
 
-       size = 1UL << huge_page_shift(h);
-       mask = size - 1;
-       if (range->page_shift != PAGE_SHIFT) {
-               /* Make sure we are looking at full page. */
-               if (start & mask)
-                       return -EINVAL;
-               if (end < (start + size))
-                       return -EINVAL;
-               pfn_inc = size >> PAGE_SHIFT;
-       } else {
-               pfn_inc = 1;
-               size = PAGE_SIZE;
-       }
-
-
-       ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
+       ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
        entry = huge_ptep_get(pte);
 
-       i = (start - range->start) >> range->page_shift;
+       i = (start - range->start) >> PAGE_SHIFT;
        orig_pfn = range->pfns[i];
        range->pfns[i] = range->values[HMM_PFN_NONE];
        cpu_flags = pte_to_hmm_pfn_flags(range, entry);
@@ -828,8 +752,8 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
                goto unlock;
        }
 
-       pfn = pte_pfn(entry) + ((start & mask) >> range->page_shift);
-       for (; addr < end; addr += size, i++, pfn += pfn_inc)
+       pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
+       for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
                range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
                                 cpu_flags;
        hmm_vma_walk->last = end;
@@ -841,10 +765,10 @@ unlock:
                return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
 
        return ret;
-#else /* CONFIG_HUGETLB_PAGE */
-       return -EINVAL;
-#endif
 }
+#else
+#define hmm_vma_walk_hugetlb_entry NULL
+#endif /* CONFIG_HUGETLB_PAGE */
 
 static void hmm_pfns_clear(struct hmm_range *range,
                           uint64_t *pfns,
@@ -859,44 +783,32 @@ static void hmm_pfns_clear(struct hmm_range *range,
  * hmm_range_register() - start tracking change to CPU page table over a range
  * @range: range
  * @mm: the mm struct for the range of virtual address
- * @start: start virtual address (inclusive)
- * @end: end virtual address (exclusive)
- * @page_shift: expect page shift for the range
- * Returns 0 on success, -EFAULT if the address space is no longer valid
+ *
+ * Return: 0 on success, -EFAULT if the address space is no longer valid
  *
  * Track updates to the CPU page table see include/linux/hmm.h
  */
-int hmm_range_register(struct hmm_range *range,
-                      struct hmm_mirror *mirror,
-                      unsigned long start,
-                      unsigned long end,
-                      unsigned page_shift)
+int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror)
 {
-       unsigned long mask = ((1UL << page_shift) - 1UL);
        struct hmm *hmm = mirror->hmm;
        unsigned long flags;
 
        range->valid = false;
        range->hmm = NULL;
 
-       if ((start & mask) || (end & mask))
+       if ((range->start & (PAGE_SIZE - 1)) || (range->end & (PAGE_SIZE - 1)))
                return -EINVAL;
-       if (start >= end)
+       if (range->start >= range->end)
                return -EINVAL;
 
-       range->page_shift = page_shift;
-       range->start = start;
-       range->end = end;
-
        /* Prevent hmm_release() from running while the range is valid */
-       if (!mmget_not_zero(hmm->mm))
+       if (!mmget_not_zero(hmm->mmu_notifier.mm))
                return -EFAULT;
 
        /* Initialize range to track CPU page table updates. */
        spin_lock_irqsave(&hmm->ranges_lock, flags);
 
        range->hmm = hmm;
-       kref_get(&hmm->kref);
        list_add(&range->list, &hmm->ranges);
 
        /*
@@ -928,8 +840,7 @@ void hmm_range_unregister(struct hmm_range *range)
        spin_unlock_irqrestore(&hmm->ranges_lock, flags);
 
        /* Drop reference taken by hmm_range_register() */
-       mmput(hmm->mm);
-       hmm_put(hmm);
+       mmput(hmm->mmu_notifier.mm);
 
        /*
         * The range is now invalid and the ref on the hmm is dropped, so
@@ -941,105 +852,33 @@ void hmm_range_unregister(struct hmm_range *range)
 }
 EXPORT_SYMBOL(hmm_range_unregister);
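
Note that hmm_range_register() has lost its start/end/page_shift arguments: the caller now fills in range->start and range->end (both PAGE_SIZE aligned) and allocates range->pfns before registering, and PAGE_SHIFT is assumed throughout; the driver-specific encoding tables (range->flags, range->values) are set up separately. A minimal, hypothetical sketch of the new calling convention:

#include <linux/hmm.h>
#include <linux/slab.h>

/* Hypothetical helper: start tracking a 16-page chunk with an existing mirror. */
static int my_range_init(struct hmm_range *range, struct hmm_mirror *mirror,
                unsigned long start)
{
        unsigned long npages = 16;
        int ret;

        range->start = start;                           /* PAGE_SIZE aligned */
        range->end = start + (npages << PAGE_SHIFT);    /* exclusive */
        range->pfns = kcalloc(npages, sizeof(*range->pfns), GFP_KERNEL);
        if (!range->pfns)
                return -ENOMEM;

        /* Takes an mmget() reference that hmm_range_unregister() drops. */
        ret = hmm_range_register(range, mirror);
        if (ret)
                kfree(range->pfns);
        return ret;
}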
 
-/*
- * hmm_range_snapshot() - snapshot CPU page table for a range
- * @range: range
- * Return: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
- *          permission (for instance asking for write and range is read only),
- *          -EBUSY if you need to retry, -EFAULT invalid (ie either no valid
- *          vma or it is illegal to access that range), number of valid pages
- *          in range->pfns[] (from range start address).
- *
- * This snapshots the CPU page table for a range of virtual addresses. Snapshot
- * validity is tracked by range struct. See in include/linux/hmm.h for example
- * on how to use.
- */
-long hmm_range_snapshot(struct hmm_range *range)
-{
-       const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
-       unsigned long start = range->start, end;
-       struct hmm_vma_walk hmm_vma_walk;
-       struct hmm *hmm = range->hmm;
-       struct vm_area_struct *vma;
-       struct mm_walk mm_walk;
-
-       lockdep_assert_held(&hmm->mm->mmap_sem);
-       do {
-               /* If range is no longer valid force retry. */
-               if (!range->valid)
-                       return -EBUSY;
-
-               vma = find_vma(hmm->mm, start);
-               if (vma == NULL || (vma->vm_flags & device_vma))
-                       return -EFAULT;
-
-               if (is_vm_hugetlb_page(vma)) {
-                       if (huge_page_shift(hstate_vma(vma)) !=
-                                   range->page_shift &&
-                           range->page_shift != PAGE_SHIFT)
-                               return -EINVAL;
-               } else {
-                       if (range->page_shift != PAGE_SHIFT)
-                               return -EINVAL;
-               }
-
-               if (!(vma->vm_flags & VM_READ)) {
-                       /*
-                        * If vma do not allow read access, then assume that it
-                        * does not allow write access, either. HMM does not
-                        * support architecture that allow write without read.
-                        */
-                       hmm_pfns_clear(range, range->pfns,
-                               range->start, range->end);
-                       return -EPERM;
-               }
-
-               range->vma = vma;
-               hmm_vma_walk.pgmap = NULL;
-               hmm_vma_walk.last = start;
-               hmm_vma_walk.fault = false;
-               hmm_vma_walk.range = range;
-               mm_walk.private = &hmm_vma_walk;
-               end = min(range->end, vma->vm_end);
-
-               mm_walk.vma = vma;
-               mm_walk.mm = vma->vm_mm;
-               mm_walk.pte_entry = NULL;
-               mm_walk.test_walk = NULL;
-               mm_walk.hugetlb_entry = NULL;
-               mm_walk.pud_entry = hmm_vma_walk_pud;
-               mm_walk.pmd_entry = hmm_vma_walk_pmd;
-               mm_walk.pte_hole = hmm_vma_walk_hole;
-               mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
-
-               walk_page_range(start, end, &mm_walk);
-               start = end;
-       } while (start < range->end);
-
-       return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
-}
-EXPORT_SYMBOL(hmm_range_snapshot);
+static const struct mm_walk_ops hmm_walk_ops = {
+       .pud_entry      = hmm_vma_walk_pud,
+       .pmd_entry      = hmm_vma_walk_pmd,
+       .pte_hole       = hmm_vma_walk_hole,
+       .hugetlb_entry  = hmm_vma_walk_hugetlb_entry,
+};
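
hmm_walk_ops replaces the per-call mm_walk initialization that hmm_range_snapshot() and hmm_range_fault() used to do: with the new pagewalk API the callbacks live in a shared const table while per-walk state (here struct hmm_vma_walk) travels through the private pointer of walk_page_range(). A small stand-alone sketch of that split, assuming only the walk_page_range(mm, start, end, ops, private) form used in this patch; the counting walker itself is hypothetical.

#include <linux/mm.h>
#include <linux/pagewalk.h>

/* Hypothetical walker: count present PTEs in [start, end). */
struct count_state {
        unsigned long present;
};

static int count_pte_entry(pte_t *pte, unsigned long addr, unsigned long next,
                struct mm_walk *walk)
{
        struct count_state *state = walk->private;

        if (pte_present(*pte))
                state->present++;
        return 0;
}

static const struct mm_walk_ops count_walk_ops = {
        .pte_entry = count_pte_entry,
};

/* Caller must hold mm->mmap_sem, as walk_page_range() requires. */
static unsigned long count_present_ptes(struct mm_struct *mm,
                unsigned long start, unsigned long end)
{
        struct count_state state = {};

        walk_page_range(mm, start, end, &count_walk_ops, &state);
        return state.present;
}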
 
-/*
- * hmm_range_fault() - try to fault some address in a virtual address range
- * @range: range being faulted
- * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
- * Return: number of valid pages in range->pfns[] (from range start
- *          address). This may be zero. If the return value is negative,
- *          then one of the following values may be returned:
+/**
+ * hmm_range_fault - try to fault some address in a virtual address range
+ * @range:     range being faulted
+ * @flags:     HMM_FAULT_* flags
  *
- *           -EINVAL  invalid arguments or mm or virtual address are in an
- *                    invalid vma (for instance device file vma).
- *           -ENOMEM: Out of memory.
- *           -EPERM:  Invalid permission (for instance asking for write and
- *                    range is read only).
- *           -EAGAIN: If you need to retry and mmap_sem was drop. This can only
- *                    happens if block argument is false.
- *           -EBUSY:  If the the range is being invalidated and you should wait
- *                    for invalidation to finish.
- *           -EFAULT: Invalid (ie either no valid vma or it is illegal to access
- *                    that range), number of valid pages in range->pfns[] (from
- *                    range start address).
+ * Return: the number of valid pages in range->pfns[] (from range start
+ * address), which may be zero.  On error one of the following status codes
+ * can be returned:
+ *
+ * -EINVAL:    Invalid arguments or mm or virtual address is in an invalid vma
+ *             (e.g., device file vma).
+ * -ENOMEM:    Out of memory.
+ * -EPERM:     Invalid permission (e.g., asking for write and range is read
+ *             only).
+ * -EAGAIN:    A page fault needs to be retried and mmap_sem was dropped.
+ * -EBUSY:     The range has been invalidated and the caller needs to wait for
+ *             the invalidation to finish.
+ * -EFAULT:    Invalid (i.e., either no valid vma or it is illegal to access
+ *             that range); unlike the success case, no count of valid
+ *             pages is returned.
  *
  * This is similar to a regular CPU page fault except that it will not trigger
  * any memory migration if the memory being faulted is not accessible by CPUs
@@ -1048,37 +887,26 @@ EXPORT_SYMBOL(hmm_range_snapshot);
  * On error, for one virtual address in the range, the function will mark the
  * corresponding HMM pfn entry with an error flag.
  */
-long hmm_range_fault(struct hmm_range *range, bool block)
+long hmm_range_fault(struct hmm_range *range, unsigned int flags)
 {
        const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
        unsigned long start = range->start, end;
        struct hmm_vma_walk hmm_vma_walk;
        struct hmm *hmm = range->hmm;
        struct vm_area_struct *vma;
-       struct mm_walk mm_walk;
        int ret;
 
-       lockdep_assert_held(&hmm->mm->mmap_sem);
+       lockdep_assert_held(&hmm->mmu_notifier.mm->mmap_sem);
 
        do {
                /* If range is no longer valid force retry. */
                if (!range->valid)
                        return -EBUSY;
 
-               vma = find_vma(hmm->mm, start);
+               vma = find_vma(hmm->mmu_notifier.mm, start);
                if (vma == NULL || (vma->vm_flags & device_vma))
                        return -EFAULT;
 
-               if (is_vm_hugetlb_page(vma)) {
-                       if (huge_page_shift(hstate_vma(vma)) !=
-                           range->page_shift &&
-                           range->page_shift != PAGE_SHIFT)
-                               return -EINVAL;
-               } else {
-                       if (range->page_shift != PAGE_SHIFT)
-                               return -EINVAL;
-               }
-
                if (!(vma->vm_flags & VM_READ)) {
                        /*
                         * If vma do not allow read access, then assume that it
@@ -1090,27 +918,18 @@ long hmm_range_fault(struct hmm_range *range, bool block)
                        return -EPERM;
                }
 
-               range->vma = vma;
                hmm_vma_walk.pgmap = NULL;
                hmm_vma_walk.last = start;
-               hmm_vma_walk.fault = true;
-               hmm_vma_walk.block = block;
+               hmm_vma_walk.flags = flags;
                hmm_vma_walk.range = range;
-               mm_walk.private = &hmm_vma_walk;
                end = min(range->end, vma->vm_end);
 
-               mm_walk.vma = vma;
-               mm_walk.mm = vma->vm_mm;
-               mm_walk.pte_entry = NULL;
-               mm_walk.test_walk = NULL;
-               mm_walk.hugetlb_entry = NULL;
-               mm_walk.pud_entry = hmm_vma_walk_pud;
-               mm_walk.pmd_entry = hmm_vma_walk_pmd;
-               mm_walk.pte_hole = hmm_vma_walk_hole;
-               mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
+               walk_page_range(vma->vm_mm, start, end, &hmm_walk_ops,
+                               &hmm_vma_walk);
 
                do {
-                       ret = walk_page_range(start, end, &mm_walk);
+                       ret = walk_page_range(vma->vm_mm, start, end,
+                                       &hmm_walk_ops, &hmm_vma_walk);
                        start = hmm_vma_walk.last;
 
                        /* Keep trying while the range is valid. */
@@ -1133,25 +952,22 @@ long hmm_range_fault(struct hmm_range *range, bool block)
 EXPORT_SYMBOL(hmm_range_fault);
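
With hmm_range_snapshot() gone, both uses funnel through hmm_range_fault(): a flags value of 0 faults missing pages in, HMM_FAULT_SNAPSHOT only reads the current CPU page table state, and HMM_FAULT_ALLOW_RETRY permits dropping mmap_sem. A hypothetical caller loop following the retry convention described above; my_fault_range() is made up, mm is assumed to be the mm the mirror was registered against, and the wait step assumes the hmm_range_wait_until_valid()/HMM_RANGE_DEFAULT_TIMEOUT helpers from include/linux/hmm.h.

#include <linux/hmm.h>
#include <linux/mm.h>

/* Hypothetical wrapper: fault an already registered range, retrying on -EBUSY. */
static long my_fault_range(struct mm_struct *mm, struct hmm_range *range)
{
        long ret;

        do {
                /* Wait out any invalidation that raced with us. */
                if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT))
                        return -EBUSY;

                down_read(&mm->mmap_sem);
                /* flags == 0: fault pages in without ever dropping mmap_sem. */
                ret = hmm_range_fault(range, 0);
                up_read(&mm->mmap_sem);
        } while (ret == -EBUSY);

        return ret;     /* nr of valid pages in range->pfns[], or a negative status */
}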
 
 /**
- * hmm_range_dma_map() - hmm_range_fault() and dma map page all in one.
- * @range: range being faulted
- * @device: device against to dma map page to
- * @daddrs: dma address of mapped pages
- * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
- * Return: number of pages mapped on success, -EAGAIN if mmap_sem have been
- *          drop and you need to try again, some other error value otherwise
+ * hmm_range_dma_map - hmm_range_fault() and dma map page all in one.
+ * @range:     range being faulted
+ * @device:    device to map page to
+ * @daddrs:    array of dma addresses for the mapped pages
+ * @flags:     HMM_FAULT_*
  *
- * Note same usage pattern as hmm_range_fault().
+ * Return: the number of pages mapped on success (including zero), or any
+ * status return from hmm_range_fault() otherwise.
  */
-long hmm_range_dma_map(struct hmm_range *range,
-                      struct device *device,
-                      dma_addr_t *daddrs,
-                      bool block)
+long hmm_range_dma_map(struct hmm_range *range, struct device *device,
+               dma_addr_t *daddrs, unsigned int flags)
 {
        unsigned long i, npages, mapped;
        long ret;
 
-       ret = hmm_range_fault(range, block);
+       ret = hmm_range_fault(range, flags);
        if (ret <= 0)
                return ret ? ret : -EBUSY;
 
@@ -1222,7 +1038,6 @@ EXPORT_SYMBOL(hmm_range_dma_map);
 /**
  * hmm_range_dma_unmap() - unmap a range that was mapped with hmm_range_dma_map()
  * @range: range being unmapped
- * @vma: the vma against which the range (optional)
  * @device: device against which dma map was done
  * @daddrs: dma address of mapped pages
  * @dirty: dirty page if it had the write flag set
@@ -1234,7 +1049,6 @@ EXPORT_SYMBOL(hmm_range_dma_map);
  * concurrent mmu notifier or sync_cpu_device_pagetables() to make progress.
  */
 long hmm_range_dma_unmap(struct hmm_range *range,
-                        struct vm_area_struct *vma,
                         struct device *device,
                         dma_addr_t *daddrs,
                         bool dirty)