// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2013 Red Hat Inc.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/pagewalk.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/sched/mm.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>

struct hmm_vma_walk {
	struct hmm_range	*range;
	struct dev_pagemap	*pgmap;
	unsigned long		last;
	unsigned int		flags;
};

static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
			    bool write_fault, uint64_t *pfn)
{
	unsigned int flags = FAULT_FLAG_REMOTE;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	vm_fault_t ret;

	if (!vma)
		goto err;

	if (hmm_vma_walk->flags & HMM_FAULT_ALLOW_RETRY)
		flags |= FAULT_FLAG_ALLOW_RETRY;
	if (write_fault)
		flags |= FAULT_FLAG_WRITE;

	ret = handle_mm_fault(vma, addr, flags);
	if (ret & VM_FAULT_RETRY) {
		/* Note, handle_mm_fault did up_read(&mm->mmap_sem) */
		return -EAGAIN;
	}
	if (ret & VM_FAULT_ERROR)
		goto err;

	return -EBUSY;

err:
	*pfn = range->values[HMM_PFN_ERROR];
	return -EFAULT;
}

static int hmm_pfns_bad(unsigned long addr,
			unsigned long end,
			struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long i;

	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++)
		pfns[i] = range->values[HMM_PFN_ERROR];

	return 0;
}

/*
 * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s)
 * @addr: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @fault: should we fault or not?
 * @write_fault: write fault?
 * @walk: mm_walk structure
 * Return: 0 on success, -EBUSY after page fault, or page fault error
 *
 * This function will be called whenever pmd_none() or pte_none() returns true,
 * or whenever there is no page directory covering the virtual address range.
 */
static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
			      bool fault, bool write_fault,
			      struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long i;

	hmm_vma_walk->last = addr;
	i = (addr - range->start) >> PAGE_SHIFT;

	if (write_fault && walk->vma && !(walk->vma->vm_flags & VM_WRITE))
		return -EPERM;

	for (; addr < end; addr += PAGE_SIZE, i++) {
		pfns[i] = range->values[HMM_PFN_NONE];
		if (fault || write_fault) {
			int ret;

			ret = hmm_vma_do_fault(walk, addr, write_fault,
					       &pfns[i]);
			if (ret != -EBUSY)
				return ret;
		}
	}

	return (fault || write_fault) ? -EBUSY : 0;
}

static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				      uint64_t pfns, uint64_t cpu_flags,
				      bool *fault, bool *write_fault)
{
	struct hmm_range *range = hmm_vma_walk->range;

	if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT)
		return;

	/*
	 * We consider not only the individual per-page request but also the
	 * default flags requested for the range. The API can be used in two
	 * ways: either the HMM user coalesces multiple page faults into one
	 * request and sets flags per pfn for those faults, or the HMM user
	 * wants to pre-fault a range with specific flags. For the latter it
	 * is a waste to have the user pre-fill the pfn array with a default
	 * flags value.
	 */
	pfns = (pfns & range->pfn_flags_mask) | range->default_flags;

	/* We aren't asked to do anything ... */
	if (!(pfns & range->flags[HMM_PFN_VALID]))
		return;
	/* If this is device memory then only fault if explicitly requested */
	if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
		/* Do we fault on device memory? */
		if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
			*write_fault = pfns & range->flags[HMM_PFN_WRITE];
			*fault = true;
		}
		return;
	}

	/* If CPU page table is not valid then we need to fault */
	*fault = !(cpu_flags & range->flags[HMM_PFN_VALID]);
	/* Need to write fault? */
	if ((pfns & range->flags[HMM_PFN_WRITE]) &&
	    !(cpu_flags & range->flags[HMM_PFN_WRITE])) {
		*write_fault = true;
		*fault = true;
	}
}
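
/*
 * Example (a sketch, not part of the upstream file): the comment in
 * hmm_pte_need_fault() describes two ways a caller can drive faulting.
 * A caller that wants to pre-fault a whole range for write, without
 * filling every pfns[] slot, would set something like:
 *
 *	range->default_flags = range->flags[HMM_PFN_VALID] |
 *			       range->flags[HMM_PFN_WRITE];
 *	range->pfn_flags_mask = 0;	// per-pfn requests are ignored
 *
 * A caller coalescing individual faults would instead leave default_flags
 * at 0, set pfn_flags_mask to ~0ULL, and encode the wanted flags in each
 * pfns[i] entry before calling hmm_range_fault().
 */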

static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				 const uint64_t *pfns, unsigned long npages,
				 uint64_t cpu_flags, bool *fault,
				 bool *write_fault)
{
	unsigned long i;

	if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) {
		*fault = *write_fault = false;
		return;
	}

	*fault = *write_fault = false;
	for (i = 0; i < npages; ++i) {
		hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
				   fault, write_fault);
		if ((*write_fault))
			return;
	}
}

static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
			     struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	bool fault, write_fault;
	unsigned long i, npages;
	uint64_t *pfns;

	i = (addr - range->start) >> PAGE_SHIFT;
	npages = (end - addr) >> PAGE_SHIFT;
	pfns = &range->pfns[i];
	hmm_range_need_fault(hmm_vma_walk, pfns, npages,
			     0, &fault, &write_fault);
	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
{
	if (pmd_protnone(pmd))
		return 0;
	return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
		unsigned long end, uint64_t *pfns, pmd_t pmd)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long pfn, npages, i;
	bool fault, write_fault;
	uint64_t cpu_flags;

	npages = (end - addr) >> PAGE_SHIFT;
	cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
	hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
			     &fault, &write_fault);

	if (pmd_protnone(pmd) || fault || write_fault)
		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);

	pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
		if (pmd_devmap(pmd)) {
			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
					      hmm_vma_walk->pgmap);
			if (unlikely(!hmm_vma_walk->pgmap))
				return -EBUSY;
		}
		pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
	}
	if (hmm_vma_walk->pgmap) {
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}
	hmm_vma_walk->last = end;
	return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
/* stub to allow the code below to compile */
int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
		unsigned long end, uint64_t *pfns, pmd_t pmd);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
{
	if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
		return 0;
	return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, pmd_t *pmdp, pte_t *ptep,
			      uint64_t *pfn)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	bool fault, write_fault;
	uint64_t cpu_flags;
	pte_t pte = *ptep;
	uint64_t orig_pfn = *pfn;

	*pfn = range->values[HMM_PFN_NONE];
	fault = write_fault = false;

	if (pte_none(pte)) {
		hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0,
				   &fault, &write_fault);
		if (fault || write_fault)
			goto fault;
		return 0;
	}

	if (!pte_present(pte)) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		if (!non_swap_entry(entry)) {
			cpu_flags = pte_to_hmm_pfn_flags(range, pte);
			hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
					   &fault, &write_fault);
			if (fault || write_fault)
				goto fault;
			return 0;
		}

		/*
		 * This is a special swap entry: ignore migration, use the
		 * device, and report anything else as an error.
		 */
		if (is_device_private_entry(entry)) {
			cpu_flags = range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_DEVICE_PRIVATE];
			cpu_flags |= is_write_device_private_entry(entry) ?
				range->flags[HMM_PFN_WRITE] : 0;
			hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
					   &fault, &write_fault);
			if (fault || write_fault)
				goto fault;
			*pfn = hmm_device_entry_from_pfn(range,
							 swp_offset(entry));
			*pfn |= cpu_flags;
			return 0;
		}

		if (is_migration_entry(entry)) {
			if (fault || write_fault) {
				pte_unmap(ptep);
				hmm_vma_walk->last = addr;
				migration_entry_wait(walk->mm, pmdp, addr);
				return -EBUSY;
			}
			return 0;
		}

		/* Report error for everything else */
		*pfn = range->values[HMM_PFN_ERROR];
		return -EFAULT;
	} else {
		cpu_flags = pte_to_hmm_pfn_flags(range, pte);
		hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
				   &fault, &write_fault);
	}

	if (fault || write_fault)
		goto fault;

	if (pte_devmap(pte)) {
		hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
					      hmm_vma_walk->pgmap);
		if (unlikely(!hmm_vma_walk->pgmap))
			return -EBUSY;
	} else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
		if (!is_zero_pfn(pte_pfn(pte))) {
			*pfn = range->values[HMM_PFN_SPECIAL];
			return -EFAULT;
		}
		/*
		 * Since each architecture defines a struct page for the zero
		 * page, just fall through and treat it like a normal page.
		 */
	}

	*pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
	return 0;

fault:
	if (hmm_vma_walk->pgmap) {
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}
	pte_unmap(ptep);
	/* Fault any virtual address we were asked to fault */
	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static int hmm_vma_walk_pmd(pmd_t *pmdp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long addr = start, i;
	pte_t *ptep;
	pmd_t pmd;

again:
	pmd = READ_ONCE(*pmdp);
	if (pmd_none(pmd))
		return hmm_vma_walk_hole(start, end, walk);

	if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
		bool fault, write_fault;
		unsigned long npages;
		uint64_t *pfns;

		i = (addr - range->start) >> PAGE_SHIFT;
		npages = (end - addr) >> PAGE_SHIFT;
		pfns = &range->pfns[i];

		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
				     0, &fault, &write_fault);
		if (fault || write_fault) {
			hmm_vma_walk->last = addr;
			pmd_migration_entry_wait(walk->mm, pmdp);
			return -EBUSY;
		}
		return 0;
	} else if (!pmd_present(pmd))
		return hmm_pfns_bad(start, end, walk);

	if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
		/*
		 * No need to take the pmd_lock here: even if some other
		 * thread is splitting the huge pmd, we will get that event
		 * through the mmu_notifier callback.
		 *
		 * So just read the pmd value and check again that it is a
		 * transparent huge or device mapping one, then compute the
		 * corresponding pfn values.
		 */
		pmd = pmd_read_atomic(pmdp);
		barrier();
		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
			goto again;

		i = (addr - range->start) >> PAGE_SHIFT;
		return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
	}

	/*
	 * We have handled all the valid cases above, i.e. either none,
	 * migration, huge or transparent huge. At this point it is either a
	 * valid pmd entry pointing to a pte directory or a bad pmd that will
	 * not recover.
	 */
	if (pmd_bad(pmd))
		return hmm_pfns_bad(start, end, walk);

	ptep = pte_offset_map(pmdp, addr);
	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
		int r;

		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
		if (r) {
			/* hmm_vma_handle_pte() did unmap pte directory */
			hmm_vma_walk->last = addr;
			return r;
		}
	}
	if (hmm_vma_walk->pgmap) {
		/*
		 * We do put_dev_pagemap() here and not in hmm_vma_handle_pte()
		 * so that we can leverage the get_dev_pagemap() optimization
		 * which will not re-take a reference on a pgmap if we already
		 * have one.
		 */
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}
	pte_unmap(ptep - 1);

	hmm_vma_walk->last = addr;
	return 0;
}

#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
    defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
{
	if (!pud_present(pud))
		return 0;
	return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
		struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long addr = start, next;
	pmd_t *pmdp;
	pud_t pud;
	int ret;

again:
	pud = READ_ONCE(*pudp);
	if (pud_none(pud))
		return hmm_vma_walk_hole(start, end, walk);

	if (pud_huge(pud) && pud_devmap(pud)) {
		unsigned long i, npages, pfn;
		uint64_t *pfns, cpu_flags;
		bool fault, write_fault;

		if (!pud_present(pud))
			return hmm_vma_walk_hole(start, end, walk);

		i = (addr - range->start) >> PAGE_SHIFT;
		npages = (end - addr) >> PAGE_SHIFT;
		pfns = &range->pfns[i];

		cpu_flags = pud_to_hmm_pfn_flags(range, pud);
		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
				     cpu_flags, &fault, &write_fault);
		if (fault || write_fault)
			return hmm_vma_walk_hole_(addr, end, fault,
						  write_fault, walk);

		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
		for (i = 0; i < npages; ++i, ++pfn) {
			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
					      hmm_vma_walk->pgmap);
			if (unlikely(!hmm_vma_walk->pgmap))
				return -EBUSY;
			pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
				  cpu_flags;
		}
		if (hmm_vma_walk->pgmap) {
			put_dev_pagemap(hmm_vma_walk->pgmap);
			hmm_vma_walk->pgmap = NULL;
		}
		hmm_vma_walk->last = end;
		return 0;
	}

	split_huge_pud(walk->vma, pudp, addr);
	if (pud_none(*pudp))
		goto again;

	pmdp = pmd_offset(pudp, addr);
	do {
		next = pmd_addr_end(addr, end);
		ret = hmm_vma_walk_pmd(pmdp, addr, next, walk);
		if (ret)
			return ret;
	} while (pmdp++, addr = next, addr != end);

	return 0;
}
#else
#define hmm_vma_walk_pud	NULL
#endif

#ifdef CONFIG_HUGETLB_PAGE
static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
				      unsigned long start, unsigned long end,
				      struct mm_walk *walk)
{
	unsigned long addr = start, i, pfn;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	uint64_t orig_pfn, cpu_flags;
	bool fault, write_fault;
	spinlock_t *ptl;
	pte_t entry;
	int ret = 0;

	ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
	entry = huge_ptep_get(pte);

	i = (start - range->start) >> PAGE_SHIFT;
	orig_pfn = range->pfns[i];
	range->pfns[i] = range->values[HMM_PFN_NONE];
	cpu_flags = pte_to_hmm_pfn_flags(range, entry);
	fault = write_fault = false;
	hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
			   &fault, &write_fault);
	if (fault || write_fault) {
		ret = -ENOENT;
		goto unlock;
	}

	pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
	for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
		range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
				 cpu_flags;
	hmm_vma_walk->last = end;

unlock:
	spin_unlock(ptl);

	if (ret == -ENOENT)
		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);

	return ret;
}
#else
#define hmm_vma_walk_hugetlb_entry NULL
#endif /* CONFIG_HUGETLB_PAGE */

static void hmm_pfns_clear(struct hmm_range *range,
			   uint64_t *pfns,
			   unsigned long addr,
			   unsigned long end)
{
	for (; addr < end; addr += PAGE_SIZE, pfns++)
		*pfns = range->values[HMM_PFN_NONE];
}

static const struct mm_walk_ops hmm_walk_ops = {
	.pud_entry	= hmm_vma_walk_pud,
	.pmd_entry	= hmm_vma_walk_pmd,
	.pte_hole	= hmm_vma_walk_hole,
	.hugetlb_entry	= hmm_vma_walk_hugetlb_entry,
};

/**
 * hmm_range_fault - try to fault some address in a virtual address range
 * @range: range being faulted
 * @flags: HMM_FAULT_* flags
 *
 * Return: the number of valid pages in range->pfns[] (from range start
 * address), which may be zero. On error one of the following status codes
 * can be returned:
 *
 * -EINVAL:	Invalid arguments or mm or virtual address is in an invalid vma
 *		(e.g., device file vma).
 * -ENOMEM:	Out of memory.
 * -EPERM:	Invalid permission (e.g., asking for write and range is read
 *		only).
 * -EAGAIN:	A page fault needs to be retried and mmap_sem was dropped.
 * -EBUSY:	The range has been invalidated and the caller needs to wait for
 *		the invalidation to finish.
 * -EFAULT:	Invalid (i.e., either no valid vma or it is illegal to access
 *		that range).
 *
 * This is similar to a regular CPU page fault except that it will not trigger
 * any memory migration if the memory being faulted is not accessible by CPUs
 * and the caller does not ask for migration.
 *
 * On error, for one virtual address in the range, the function will mark the
 * corresponding HMM pfn entry with an error flag.
 */
long hmm_range_fault(struct hmm_range *range, unsigned int flags)
{
	const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
	unsigned long start = range->start, end;
	struct hmm_vma_walk hmm_vma_walk;
	struct mm_struct *mm = range->notifier->mm;
	struct vm_area_struct *vma;
	int ret;

	lockdep_assert_held(&mm->mmap_sem);

	do {
		/* If range is no longer valid force retry. */
		if (mmu_interval_check_retry(range->notifier,
					     range->notifier_seq))
			return -EBUSY;

		vma = find_vma(mm, start);
		if (vma == NULL || (vma->vm_flags & device_vma))
			return -EFAULT;

		if (!(vma->vm_flags & VM_READ)) {
			/*
			 * If the vma does not allow read access, then assume
			 * that it does not allow write access either. HMM
			 * does not support architectures that allow write
			 * without read.
			 */
			hmm_pfns_clear(range, range->pfns,
				       range->start, range->end);
			return -EPERM;
		}

		hmm_vma_walk.pgmap = NULL;
		hmm_vma_walk.last = start;
		hmm_vma_walk.flags = flags;
		hmm_vma_walk.range = range;
		end = min(range->end, vma->vm_end);

		do {
			ret = walk_page_range(vma->vm_mm, start, end,
					      &hmm_walk_ops, &hmm_vma_walk);
			start = hmm_vma_walk.last;

			/* Keep trying while the range is valid. */
		} while (ret == -EBUSY &&
			 !mmu_interval_check_retry(range->notifier,
						   range->notifier_seq));

		if (ret) {
			unsigned long i;

			i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
			hmm_pfns_clear(range, &range->pfns[i],
				       hmm_vma_walk.last, range->end);
			return ret;
		}
		start = end;

	} while (start < range->end);

	return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
}
EXPORT_SYMBOL(hmm_range_fault);
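
/*
 * Usage sketch (not part of this file; driver_lock and interval_sub are
 * placeholders): a typical caller owns an mmu_interval_notifier covering
 * the range (registered with mmu_interval_notifier_insert()), samples its
 * sequence number, faults under mmap_sem, and then revalidates the
 * sequence under its own lock before consuming pfns[]:
 *
 *	range.notifier = &interval_sub;
 * again:
 *	range.notifier_seq = mmu_interval_read_begin(&interval_sub);
 *	down_read(&mm->mmap_sem);
 *	ret = hmm_range_fault(&range, 0);
 *	up_read(&mm->mmap_sem);
 *	if (ret < 0) {
 *		if (ret == -EBUSY)
 *			goto again;
 *		return ret;
 *	}
 *
 *	mutex_lock(&driver_lock);
 *	if (mmu_interval_read_retry(&interval_sub, range.notifier_seq)) {
 *		mutex_unlock(&driver_lock);
 *		goto again;
 *	}
 *	// use range.pfns[] to update the device page tables
 *	mutex_unlock(&driver_lock);
 *
 * driver_lock must be the same lock the driver takes in its mmu interval
 * notifier invalidate callback.
 */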

/**
 * hmm_range_dma_map - hmm_range_fault() and dma map pages all in one.
 * @range: range being faulted
 * @device: device to map page to
 * @daddrs: array of dma addresses for the mapped pages
 * @flags: HMM_FAULT_*
 *
 * Return: the number of pages mapped on success (including zero), or any
 * status return from hmm_range_fault() otherwise.
 */
long hmm_range_dma_map(struct hmm_range *range, struct device *device,
		dma_addr_t *daddrs, unsigned int flags)
{
	unsigned long i, npages, mapped;
	long ret;

	ret = hmm_range_fault(range, flags);
	if (ret <= 0)
		return ret ? ret : -EBUSY;

	npages = (range->end - range->start) >> PAGE_SHIFT;
	for (i = 0, mapped = 0; i < npages; ++i) {
		enum dma_data_direction dir = DMA_TO_DEVICE;
		struct page *page;

		/*
		 * FIXME: the DMA API should provide an invalid DMA address
		 * value instead of a function to test the dma address. That
		 * would remove a lot of dumb code duplicated across many
		 * architectures.
		 *
		 * For now setting it to 0 here is good enough as the pfns[]
		 * value is what is used to check what is valid and what isn't.
		 */
		daddrs[i] = 0;

		page = hmm_device_entry_to_page(range, range->pfns[i]);
		if (page == NULL)
			continue;

		/* Check if range is being invalidated */
		if (mmu_interval_check_retry(range->notifier,
					     range->notifier_seq)) {
			ret = -EBUSY;
			goto unmap;
		}

		/* If it is read and write then map bi-directional. */
		if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
			dir = DMA_BIDIRECTIONAL;

		daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir);
		if (dma_mapping_error(device, daddrs[i])) {
			ret = -EFAULT;
			goto unmap;
		}

		mapped++;
	}

	return mapped;

unmap:
	for (npages = i, i = 0; (i < npages) && mapped; ++i) {
		enum dma_data_direction dir = DMA_TO_DEVICE;
		struct page *page;

		page = hmm_device_entry_to_page(range, range->pfns[i]);
		if (page == NULL)
			continue;

		if (dma_mapping_error(device, daddrs[i]))
			continue;

		/* If it was read and write then it was mapped bi-directional. */
		if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
			dir = DMA_BIDIRECTIONAL;

		dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
		mapped--;
	}

	return ret;
}
EXPORT_SYMBOL(hmm_range_dma_map);
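
/*
 * Usage sketch (not part of this file; dev is a placeholder for the struct
 * device doing the DMA): the caller provides a daddrs[] array with one slot
 * per page in the range, for example:
 *
 *	npages = (range.end - range.start) >> PAGE_SHIFT;
 *	daddrs = kcalloc(npages, sizeof(*daddrs), GFP_KERNEL);
 *	ret = hmm_range_dma_map(&range, dev, daddrs, 0);
 *
 * On success ret is the number of pages actually mapped; pages that did
 * not resolve to a struct page are skipped and keep daddrs[i] == 0.
 */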

/**
 * hmm_range_dma_unmap() - unmap a range that was mapped with hmm_range_dma_map()
 * @range: range being unmapped
 * @device: device against which dma map was done
 * @daddrs: dma address of mapped pages
 * @dirty: dirty the page if it had the write flag set
 * Return: number of pages unmapped on success, -EINVAL otherwise
 *
 * Note that the caller MUST abide by the mmu notifier (or use the HMM mirror
 * sync_cpu_device_pagetables() callback) so that it is safe here to call
 * set_page_dirty(). The caller must also take appropriate locks to prevent a
 * concurrent mmu notifier or sync_cpu_device_pagetables() from making
 * progress.
 */
long hmm_range_dma_unmap(struct hmm_range *range,
			 struct device *device,
			 dma_addr_t *daddrs,
			 bool dirty)
{
	unsigned long i, npages;
	long cpages = 0;

	/* Sanity check. */
	if (range->end <= range->start)
		return -EINVAL;
	if (!daddrs)
		return -EINVAL;
	if (!range->pfns)
		return -EINVAL;

	npages = (range->end - range->start) >> PAGE_SHIFT;
	for (i = 0; i < npages; ++i) {
		enum dma_data_direction dir = DMA_TO_DEVICE;
		struct page *page;

		page = hmm_device_entry_to_page(range, range->pfns[i]);
		if (page == NULL)
			continue;

		/* If it was read and write then it was mapped bi-directional. */
		if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) {
			dir = DMA_BIDIRECTIONAL;

			/*
			 * See comments in function description on why it is
			 * safe here to call set_page_dirty()
			 */
			if (dirty)
				set_page_dirty(page);
		}

		/* Unmap and clear pfns/dma address */
		dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
		range->pfns[i] = range->values[HMM_PFN_NONE];
		/* FIXME see comments in hmm_range_dma_map() */
		daddrs[i] = 0;
		cpages++;
	}

	return cpages;
}
EXPORT_SYMBOL(hmm_range_dma_unmap);
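
/*
 * Usage sketch (not part of this file; driver_lock is a placeholder for
 * whatever lock the driver uses to serialize against its mmu interval
 * notifier, as required by the kernel-doc above): tearing down a mapping
 * created by hmm_range_dma_map(), dirtying pages that were mapped for
 * write:
 *
 *	mutex_lock(&driver_lock);
 *	hmm_range_dma_unmap(&range, dev, daddrs, true);
 *	mutex_unlock(&driver_lock);
 *	kfree(daddrs);
 */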