/*
 * Copyright 2013 Red Hat Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/mm.h>
#include <linux/hmm.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/jump_label.h>
#include <linux/mmu_notifier.h>

/*
 * Device private memory, see HMM (Documentation/vm/hmm.txt) or hmm.h.
 */
DEFINE_STATIC_KEY_FALSE(device_private_key);
EXPORT_SYMBOL(device_private_key);

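/*
 * A minimal sketch of the usual static-key pattern this enables; the caller
 * and helper below are hypothetical, only device_private_key itself is
 * defined here:
 *
 *	if (static_branch_unlikely(&device_private_key))
 *		handle_device_private_entry(...);
 *
 * The key presumably stays false (so the branch is patched out) until device
 * private memory is actually in use.
 */
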
#ifdef CONFIG_HMM
static const struct mmu_notifier_ops hmm_mmu_notifier_ops;

/*
 * struct hmm - HMM per mm struct
 *
 * @mm: mm struct this HMM struct is bound to
 * @lock: lock protecting ranges list
 * @sequence: we track updates to the CPU page table with a sequence number
 * @ranges: list of ranges being snapshotted
 * @mirrors: list of mirrors for this mm
 * @mmu_notifier: mmu notifier to track updates to CPU page table
 * @mirrors_sem: read/write semaphore protecting the mirrors list
 */
struct hmm {
	struct mm_struct *mm;
	spinlock_t lock;
	atomic_t sequence;
	struct list_head ranges;
	struct list_head mirrors;
	struct mmu_notifier mmu_notifier;
	struct rw_semaphore mirrors_sem;
};

/*
 * hmm_register - register HMM against an mm (HMM internal)
 *
 * @mm: mm struct to attach to
 *
 * This is not intended to be used directly by device drivers. It allocates an
 * HMM struct if mm does not have one, and initializes it.
 */
static struct hmm *hmm_register(struct mm_struct *mm)
{
	struct hmm *hmm = READ_ONCE(mm->hmm);
	bool cleanup = false;

	/*
	 * The hmm struct can only be freed once the mm_struct goes away,
	 * hence if it already exists we can simply return it here.
	 */
	if (hmm)
		return hmm;

	hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
	if (!hmm)
		return NULL;
	INIT_LIST_HEAD(&hmm->mirrors);
	init_rwsem(&hmm->mirrors_sem);
	atomic_set(&hmm->sequence, 0);
	hmm->mmu_notifier.ops = NULL;
	INIT_LIST_HEAD(&hmm->ranges);
	spin_lock_init(&hmm->lock);
	hmm->mm = mm;

	/*
	 * We should only get here if we hold the mmap_sem in write mode, ie on
	 * registration of the first mirror through hmm_mirror_register().
	 */
	hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
	if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
		kfree(hmm);
		return NULL;
	}

	spin_lock(&mm->page_table_lock);
	if (!mm->hmm)
		mm->hmm = hmm;
	else
		cleanup = true;
	spin_unlock(&mm->page_table_lock);

	if (cleanup) {
		mmu_notifier_unregister(&hmm->mmu_notifier, mm);
		kfree(hmm);
	}

	return mm->hmm;
}

void hmm_mm_destroy(struct mm_struct *mm)
{
	kfree(mm->hmm);
}
#endif /* CONFIG_HMM */

#if IS_ENABLED(CONFIG_HMM_MIRROR)
static void hmm_invalidate_range(struct hmm *hmm,
				 enum hmm_update_type action,
				 unsigned long start,
				 unsigned long end)
{
	struct hmm_mirror *mirror;
	struct hmm_range *range;

	spin_lock(&hmm->lock);
	list_for_each_entry(range, &hmm->ranges, list) {
		unsigned long addr, idx, npages;

		if (end < range->start || start >= range->end)
			continue;

		range->valid = false;
		addr = max(start, range->start);
		idx = (addr - range->start) >> PAGE_SHIFT;
		npages = (min(range->end, end) - addr) >> PAGE_SHIFT;
		memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
	}
	spin_unlock(&hmm->lock);

	down_read(&hmm->mirrors_sem);
	list_for_each_entry(mirror, &hmm->mirrors, list)
		mirror->ops->sync_cpu_device_pagetables(mirror, action,
							start, end);
	up_read(&hmm->mirrors_sem);
}

static void hmm_invalidate_range_start(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long start,
				       unsigned long end)
{
	struct hmm *hmm = mm->hmm;

	VM_BUG_ON(!hmm);

	atomic_inc(&hmm->sequence);
}

static void hmm_invalidate_range_end(struct mmu_notifier *mn,
				     struct mm_struct *mm,
				     unsigned long start,
				     unsigned long end)
{
	struct hmm *hmm = mm->hmm;

	VM_BUG_ON(!hmm);

	hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end);
}

static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
	.invalidate_range_start = hmm_invalidate_range_start,
	.invalidate_range_end = hmm_invalidate_range_end,
};

/*
 * hmm_mirror_register() - register a mirror against an mm
 *
 * @mirror: new mirror struct to register
 * @mm: mm to register against
 *
 * To start mirroring a process address space, the device driver must register
 * an HMM mirror struct.
 *
 * THE mm->mmap_sem MUST BE HELD IN WRITE MODE !
 */
int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
{
	/* Sanity check */
	if (!mm || !mirror || !mirror->ops)
		return -EINVAL;

	mirror->hmm = hmm_register(mm);
	if (!mirror->hmm)
		return -ENOMEM;

	down_write(&mirror->hmm->mirrors_sem);
	list_add(&mirror->list, &mirror->hmm->mirrors);
	up_write(&mirror->hmm->mirrors_sem);

	return 0;
}
EXPORT_SYMBOL(hmm_mirror_register);
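
/*
 * A minimal registration sketch, assuming a hypothetical driver with its own
 * "dmirror" structure and device page table. All dmirror_* names below are
 * illustrative; only the hmm_mirror_ops callback and the hmm_mirror_*()
 * calls come from this file:
 *
 *	static void dmirror_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
 *						       enum hmm_update_type update,
 *						       unsigned long start,
 *						       unsigned long end)
 *	{
 *		struct dmirror *dmirror;
 *
 *		dmirror = container_of(mirror, struct dmirror, mirror);
 *		(invalidate the device page table for [start, end) so the
 *		 device re-faults through hmm_vma_fault() on next access)
 *	}
 *
 *	static const struct hmm_mirror_ops dmirror_ops = {
 *		.sync_cpu_device_pagetables = dmirror_sync_cpu_device_pagetables,
 *	};
 *
 *	down_write(&mm->mmap_sem);
 *	dmirror->mirror.ops = &dmirror_ops;
 *	ret = hmm_mirror_register(&dmirror->mirror, mm);
 *	up_write(&mm->mmap_sem);
 *	...
 *	hmm_mirror_unregister(&dmirror->mirror);
 */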

/*
 * hmm_mirror_unregister() - unregister a mirror
 *
 * @mirror: mirror struct to unregister
 *
 * Stop mirroring a process address space, and cleanup.
 */
void hmm_mirror_unregister(struct hmm_mirror *mirror)
{
	struct hmm *hmm = mirror->hmm;

	down_write(&hmm->mirrors_sem);
	list_del(&mirror->list);
	up_write(&hmm->mirrors_sem);
}
EXPORT_SYMBOL(hmm_mirror_unregister);

struct hmm_vma_walk {
	struct hmm_range *range;
	unsigned long last;
	bool fault;
	bool block;
	bool write;
};

static int hmm_vma_do_fault(struct mm_walk *walk,
			    unsigned long addr,
			    hmm_pfn_t *pfn)
{
	unsigned int flags = FAULT_FLAG_REMOTE;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct vm_area_struct *vma = walk->vma;
	int r;

	/*
	 * Only set FAULT_FLAG_ALLOW_RETRY (which lets the fault handler drop
	 * mmap_sem) when the caller did not ask for blocking behavior.
	 */
	flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
	flags |= hmm_vma_walk->write ? FAULT_FLAG_WRITE : 0;
	r = handle_mm_fault(vma, addr, flags);
	if (r & VM_FAULT_RETRY)
		return -EBUSY;
	if (r & VM_FAULT_ERROR) {
		*pfn = HMM_PFN_ERROR;
		return -EFAULT;
	}

	return -EAGAIN;
}

static void hmm_pfns_special(hmm_pfn_t *pfns,
			     unsigned long addr,
			     unsigned long end)
{
	for (; addr < end; addr += PAGE_SIZE, pfns++)
		*pfns = HMM_PFN_SPECIAL;
}

static int hmm_pfns_bad(unsigned long addr,
			unsigned long end,
			struct mm_walk *walk)
{
	/* walk->private points to the hmm_vma_walk, not the range itself. */
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	hmm_pfn_t *pfns = range->pfns;
	unsigned long i;

	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++)
		pfns[i] = HMM_PFN_ERROR;

	return 0;
}

static void hmm_pfns_clear(hmm_pfn_t *pfns,
			   unsigned long addr,
			   unsigned long end)
{
	for (; addr < end; addr += PAGE_SIZE, pfns++)
		*pfns = 0;
}

static int hmm_vma_walk_hole(unsigned long addr,
			     unsigned long end,
			     struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	hmm_pfn_t *pfns = range->pfns;
	unsigned long i;

	hmm_vma_walk->last = addr;
	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++) {
		pfns[i] = HMM_PFN_EMPTY;
		if (hmm_vma_walk->fault) {
			int ret;

			ret = hmm_vma_do_fault(walk, addr, &pfns[i]);
			if (ret != -EAGAIN)
				return ret;
		}
	}

	return hmm_vma_walk->fault ? -EAGAIN : 0;
}

static int hmm_vma_walk_clear(unsigned long addr,
			      unsigned long end,
			      struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	hmm_pfn_t *pfns = range->pfns;
	unsigned long i;

	hmm_vma_walk->last = addr;
	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++) {
		pfns[i] = 0;
		if (hmm_vma_walk->fault) {
			int ret;

			ret = hmm_vma_do_fault(walk, addr, &pfns[i]);
			if (ret != -EAGAIN)
				return ret;
		}
	}

	return hmm_vma_walk->fault ? -EAGAIN : 0;
}

static int hmm_vma_walk_pmd(pmd_t *pmdp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	hmm_pfn_t *pfns = range->pfns;
	unsigned long addr = start, i;
	bool write_fault;
	hmm_pfn_t flag;
	pte_t *ptep;

	i = (addr - range->start) >> PAGE_SHIFT;
	flag = vma->vm_flags & VM_READ ? HMM_PFN_READ : 0;
	write_fault = hmm_vma_walk->fault && hmm_vma_walk->write;

again:
	if (pmd_none(*pmdp))
		return hmm_vma_walk_hole(start, end, walk);

	if (pmd_huge(*pmdp) && (vma->vm_flags & VM_HUGETLB))
		return hmm_pfns_bad(start, end, walk);

	if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) {
		unsigned long pfn;
		pmd_t pmd;

		/*
		 * No need to take the pmd_lock here, even if some other
		 * thread is splitting the huge pmd we will get that event
		 * through the mmu_notifier callback.
		 *
		 * So just read the pmd value and check again that it is a
		 * transparent huge or device mapping, then compute the
		 * corresponding pfn values.
		 */
		pmd = pmd_read_atomic(pmdp);
		barrier();
		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
			goto again;
		if (pmd_protnone(pmd))
			return hmm_vma_walk_clear(start, end, walk);

		if (write_fault && !pmd_write(pmd))
			return hmm_vma_walk_clear(start, end, walk);

		pfn = pmd_pfn(pmd) + pte_index(addr);
		flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0;
		for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
			pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag;
		return 0;
	}

	if (pmd_bad(*pmdp))
		return hmm_pfns_bad(start, end, walk);

	ptep = pte_offset_map(pmdp, addr);
	for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
		pte_t pte = *ptep;

		pfns[i] = 0;

		if (pte_none(pte)) {
			pfns[i] = HMM_PFN_EMPTY;
			if (hmm_vma_walk->fault)
				goto fault;
			continue;
		}

		if (!pte_present(pte)) {
			swp_entry_t entry = pte_to_swp_entry(pte);

			if (!non_swap_entry(entry)) {
				if (hmm_vma_walk->fault)
					goto fault;
				continue;
			}

			/*
			 * This is a special swap entry, ignore migration, use
			 * device and report anything else as error.
			 */
			if (is_migration_entry(entry)) {
				if (hmm_vma_walk->fault) {
					pte_unmap(ptep);
					hmm_vma_walk->last = addr;
					migration_entry_wait(vma->vm_mm,
							     pmdp, addr);
					return -EAGAIN;
				}
				continue;
			} else {
				/* Report error for everything else */
				pfns[i] = HMM_PFN_ERROR;
			}
			continue;
		}

		if (write_fault && !pte_write(pte))
			goto fault;

		pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag;
		pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0;
		continue;

fault:
		pte_unmap(ptep);
		/* Fault all pages in range */
		return hmm_vma_walk_clear(start, end, walk);
	}
	pte_unmap(ptep - 1);

	return 0;
}

/*
 * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses
 * @vma: virtual memory area containing the virtual address range
 * @range: used to track snapshot validity
 * @start: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @pfns: array of hmm_pfn_t: provided by the caller, filled in by function
 * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, 0 success
 *
 * This snapshots the CPU page table for a range of virtual addresses. Snapshot
 * validity is tracked by range struct. See hmm_vma_range_done() for further
 * information.
 *
 * The range struct is initialized here. It tracks the CPU page table, but only
 * if the function returns success (0), in which case the caller must then call
 * hmm_vma_range_done() to stop CPU page table update tracking on this range.
 *
 * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS
 * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED !
 */
int hmm_vma_get_pfns(struct vm_area_struct *vma,
		     struct hmm_range *range,
		     unsigned long start,
		     unsigned long end,
		     hmm_pfn_t *pfns)
{
	struct hmm_vma_walk hmm_vma_walk;
	struct mm_walk mm_walk;
	struct hmm *hmm;

	/* FIXME support hugetlb fs */
	if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
		hmm_pfns_special(pfns, start, end);
		return -EINVAL;
	}

	/* Sanity check, this really should not happen ! */
	if (start < vma->vm_start || start >= vma->vm_end)
		return -EINVAL;
	if (end < vma->vm_start || end > vma->vm_end)
		return -EINVAL;

	hmm = hmm_register(vma->vm_mm);
	if (!hmm)
		return -ENOMEM;
	/* Caller must have registered a mirror, via hmm_mirror_register() ! */
	if (!hmm->mmu_notifier.ops)
		return -EINVAL;

	/* Initialize range to track CPU page table update */
	range->start = start;
	range->pfns = pfns;
	range->end = end;
	spin_lock(&hmm->lock);
	range->valid = true;
	list_add_rcu(&range->list, &hmm->ranges);
	spin_unlock(&hmm->lock);

	hmm_vma_walk.fault = false;
	hmm_vma_walk.range = range;
	mm_walk.private = &hmm_vma_walk;

	mm_walk.vma = vma;
	mm_walk.mm = vma->vm_mm;
	mm_walk.pte_entry = NULL;
	mm_walk.test_walk = NULL;
	mm_walk.hugetlb_entry = NULL;
	mm_walk.pmd_entry = hmm_vma_walk_pmd;
	mm_walk.pte_hole = hmm_vma_walk_hole;

	walk_page_range(start, end, &mm_walk);
	return 0;
}
EXPORT_SYMBOL(hmm_vma_get_pfns);

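/*
 * A minimal calling sketch for taking a read-only snapshot of [start, end),
 * assuming a hypothetical driver; the driver_*() helpers, find_vma() lookup
 * and array sizing are illustrative, only hmm_vma_get_pfns() and
 * hmm_vma_range_done() come from this file:
 *
 *	npages = (end - start) >> PAGE_SHIFT;
 *	pfns = kcalloc(npages, sizeof(*pfns), GFP_KERNEL);
 *
 *	down_read(&mm->mmap_sem);
 *	vma = find_vma(mm, start);
 *	ret = hmm_vma_get_pfns(vma, &range, start, end, pfns);
 *	if (ret) {
 *		up_read(&mm->mmap_sem);
 *		goto out;
 *	}
 *	driver_lock_device_page_table_update();
 *	if (hmm_vma_range_done(vma, &range))
 *		driver_update_device_page_table(pfns, npages);
 *	driver_unlock_device_page_table_update();
 *	up_read(&mm->mmap_sem);
 */
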
/*
 * hmm_vma_range_done() - stop tracking change to CPU page table over a range
 * @vma: virtual memory area containing the virtual address range
 * @range: range being tracked
 * Returns: false if range data has been invalidated, true otherwise
 *
 * Range struct is used to track updates to the CPU page table after a call to
 * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done
 * using the data, or wants to lock updates to the data it got from those
 * functions, it must call the hmm_vma_range_done() function, which will then
 * stop tracking CPU page table updates.
 *
 * Note that the device driver must still implement general CPU page table
 * update tracking either by using hmm_mirror (see hmm_mirror_register()) or by
 * using the mmu_notifier API directly.
 *
 * CPU page table update tracking done through hmm_range is only temporary and
 * to be used while trying to duplicate CPU page table contents for a range of
 * virtual addresses.
 *
 * There are two ways to use this:
 * again:
 *   hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...);
 *   trans = device_build_page_table_update_transaction(pfns);
 *   device_page_table_lock();
 *   if (!hmm_vma_range_done(vma, range)) {
 *     device_page_table_unlock();
 *     goto again;
 *   }
 *   device_commit_transaction(trans);
 *   device_page_table_unlock();
 *
 * Or:
 *   hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...);
 *   device_page_table_lock();
 *   hmm_vma_range_done(vma, range);
 *   device_update_page_table(pfns);
 *   device_page_table_unlock();
 */
bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range)
{
	unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
	struct hmm *hmm;

	if (range->end <= range->start) {
		BUG();
		return false;
	}

	hmm = hmm_register(vma->vm_mm);
	if (!hmm) {
		memset(range->pfns, 0, sizeof(*range->pfns) * npages);
		return false;
	}

	spin_lock(&hmm->lock);
	list_del_rcu(&range->list);
	spin_unlock(&hmm->lock);

	return range->valid;
}
EXPORT_SYMBOL(hmm_vma_range_done);

/*
 * hmm_vma_fault() - try to fault some address in a virtual address range
 * @vma: virtual memory area containing the virtual address range
 * @range: used to track pfns array content validity
 * @start: fault range virtual start address (inclusive)
 * @end: fault range virtual end address (exclusive)
 * @pfns: array of hmm_pfn_t, only entries with the fault flag set will be faulted
 * @write: is it a write fault
 * @block: allow blocking on fault (if true it sleeps and does not drop mmap_sem)
 * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem has been dropped)
 *
 * This is similar to a regular CPU page fault except that it will not trigger
 * any memory migration if the memory being faulted is not accessible by CPUs.
 *
 * On error, for one virtual address in the range, the function will set the
 * hmm_pfn_t error flag for the corresponding pfn entry.
 *
 * Expected use pattern:
 * retry:
 *   down_read(&mm->mmap_sem);
 *   // Find vma and address device wants to fault, initialize hmm_pfn_t
 *   // array accordingly
 *   ret = hmm_vma_fault(vma, range, start, end, pfns, write, block);
 *   switch (ret) {
 *   case -EAGAIN:
 *     hmm_vma_range_done(vma, range);
 *     // You might want to rate limit or yield to play nicely, you may
 *     // also commit any valid pfn in the array assuming that you are
 *     // getting true from hmm_vma_range_done()
 *     goto retry;
 *   case 0:
 *     break;
 *   default:
 *     // Handle error !
 *     up_read(&mm->mmap_sem);
 *     return;
 *   }
 *   // Take device driver lock that serializes device page table update
 *   driver_lock_device_page_table_update();
 *   hmm_vma_range_done(vma, range);
 *   // Commit pfns we got from hmm_vma_fault()
 *   driver_unlock_device_page_table_update();
 *   up_read(&mm->mmap_sem);
 *
 * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURNS SUCCESS (0)
 * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION !
 *
 * YOU HAVE BEEN WARNED !
 */
int hmm_vma_fault(struct vm_area_struct *vma,
		  struct hmm_range *range,
		  unsigned long start,
		  unsigned long end,
		  hmm_pfn_t *pfns,
		  bool write,
		  bool block)
{
	struct hmm_vma_walk hmm_vma_walk;
	struct mm_walk mm_walk;
	struct hmm *hmm;
	int ret;

	/* Sanity check, this really should not happen ! */
	if (start < vma->vm_start || start >= vma->vm_end)
		return -EINVAL;
	if (end < vma->vm_start || end > vma->vm_end)
		return -EINVAL;

	hmm = hmm_register(vma->vm_mm);
	if (!hmm) {
		hmm_pfns_clear(pfns, start, end);
		return -ENOMEM;
	}
	/* Caller must have registered a mirror using hmm_mirror_register() */
	if (!hmm->mmu_notifier.ops)
		return -EINVAL;

	/* Initialize range to track CPU page table update */
	range->start = start;
	range->pfns = pfns;
	range->end = end;
	spin_lock(&hmm->lock);
	range->valid = true;
	list_add_rcu(&range->list, &hmm->ranges);
	spin_unlock(&hmm->lock);

	/* FIXME support hugetlb fs */
	if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
		hmm_pfns_special(pfns, start, end);
		return 0;
	}

	hmm_vma_walk.fault = true;
	hmm_vma_walk.write = write;
	hmm_vma_walk.block = block;
	hmm_vma_walk.range = range;
	mm_walk.private = &hmm_vma_walk;
	hmm_vma_walk.last = range->start;

	mm_walk.vma = vma;
	mm_walk.mm = vma->vm_mm;
	mm_walk.pte_entry = NULL;
	mm_walk.test_walk = NULL;
	mm_walk.hugetlb_entry = NULL;
	mm_walk.pmd_entry = hmm_vma_walk_pmd;
	mm_walk.pte_hole = hmm_vma_walk_hole;

	do {
		ret = walk_page_range(start, end, &mm_walk);
		start = hmm_vma_walk.last;
	} while (ret == -EAGAIN);

	if (ret) {
		unsigned long i;

		i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
		hmm_pfns_clear(&pfns[i], hmm_vma_walk.last, end);
		hmm_vma_range_done(vma, range);
	}
	return ret;
}
EXPORT_SYMBOL(hmm_vma_fault);
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */