/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RMAP_H
#define _LINUX_RMAP_H
/*
 * Declarations for Reverse Mapping functions in mm/rmap.c
 */

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/rwsem.h>
#include <linux/memcontrol.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>

/*
 * The anon_vma heads a list of private "related" vmas, to scan if
 * an anonymous page pointing to this anon_vma needs to be unmapped:
 * the vmas on the list will be related by forking, or by splitting.
 *
 * Since vmas come and go as they are split and merged (particularly
 * in mprotect), the mapping field of an anonymous page cannot point
 * directly to a vma: instead it points to an anon_vma, on whose list
 * the related vmas can be easily linked or unlinked.
 *
 * After unlinking the last vma on the list, we must garbage collect
 * the anon_vma object itself: we're guaranteed no page can be
 * pointing to this anon_vma once its vma list is empty.
 */
struct anon_vma {
	struct anon_vma *root;		/* Root of this anon_vma tree */
	struct rw_semaphore rwsem;	/* W: modification, R: walking the list */
	/*
	 * The refcount is taken on an anon_vma when there is no
	 * guarantee that the vma of page tables will exist for
	 * the duration of the operation. A caller that takes
	 * the reference is responsible for clearing up the
	 * anon_vma if they are the last user on release
	 */
	atomic_t refcount;

	/*
	 * Count of child anon_vmas. Equals to the count of all anon_vmas that
	 * have ->parent pointing to this one, including itself.
	 *
	 * This counter is used for making decision about reusing anon_vma
	 * instead of forking new one. See comments in function anon_vma_clone.
	 */
	unsigned long num_children;
	/* Count of VMAs whose ->anon_vma pointer points to this object. */
	unsigned long num_active_vmas;

	struct anon_vma *parent;	/* Parent of this anon_vma */

	/*
	 * NOTE: the LSB of the rb_root.rb_node is set by
	 * mm_take_all_locks() _after_ taking the above lock. So the
	 * rb_root must only be read/written after taking the above lock
	 * to be sure to see a valid next pointer. The LSB bit itself
	 * is serialized by a system wide lock only visible to
	 * mm_take_all_locks() (mm_all_locks_mutex).
	 */

	/* Interval tree of private "related" vmas */
	struct rb_root_cached rb_root;
};

/*
 * The copy-on-write semantics of fork mean that an anon_vma
 * can become associated with multiple processes. Furthermore,
 * each child process will have its own anon_vma, where new
 * pages for that process are instantiated.
 *
 * This structure allows us to find the anon_vmas associated
 * with a VMA, or the VMAs associated with an anon_vma.
 * The "same_vma" list contains the anon_vma_chains linking
 * all the anon_vmas associated with this VMA.
 * The "rb" field indexes on an interval tree the anon_vma_chains
 * which link all the VMAs associated with this anon_vma.
 */
struct anon_vma_chain {
	struct vm_area_struct *vma;
	struct anon_vma *anon_vma;
	struct list_head same_vma;	/* locked by mmap_lock & page_table_lock */
	struct rb_node rb;		/* locked by anon_vma->rwsem */
	unsigned long rb_subtree_last;
#ifdef CONFIG_DEBUG_VM_RB
	unsigned long cached_vma_start, cached_vma_last;
#endif
};

enum ttu_flags {
	TTU_SPLIT_HUGE_PMD	= 0x4,	/* split huge PMD if any */
	TTU_IGNORE_MLOCK	= 0x8,	/* ignore mlock */
	TTU_SYNC		= 0x10,	/* avoid racy checks with PVMW_SYNC */
	TTU_HWPOISON		= 0x20,	/* do convert pte to hwpoison entry */
	TTU_BATCH_FLUSH		= 0x40,	/* Batch TLB flushes where possible
					 * and caller guarantees they will
					 * do a final flush if necessary */
	TTU_RMAP_LOCKED		= 0x80,	/* do not grab rmap lock:
					 * caller holds it */
};

#ifdef CONFIG_MMU
static inline void get_anon_vma(struct anon_vma *anon_vma)
{
	atomic_inc(&anon_vma->refcount);
}

void __put_anon_vma(struct anon_vma *anon_vma);

static inline void put_anon_vma(struct anon_vma *anon_vma)
{
	if (atomic_dec_and_test(&anon_vma->refcount))
		__put_anon_vma(anon_vma);
}

static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
{
	down_write(&anon_vma->root->rwsem);
}

static inline int anon_vma_trylock_write(struct anon_vma *anon_vma)
{
	return down_write_trylock(&anon_vma->root->rwsem);
}

static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
{
	up_write(&anon_vma->root->rwsem);
}

static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
{
	down_read(&anon_vma->root->rwsem);
}

static inline int anon_vma_trylock_read(struct anon_vma *anon_vma)
{
	return down_read_trylock(&anon_vma->root->rwsem);
}

static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
{
	up_read(&anon_vma->root->rwsem);
}

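/*
 * Illustrative sketch (not part of this header): a reader that wants to
 * walk the VMAs of an anonymous folio typically pins the anon_vma first
 * and then takes the root rwsem for reading:
 *
 *	struct anon_vma *anon_vma = folio_get_anon_vma(folio);
 *
 *	if (anon_vma) {
 *		anon_vma_lock_read(anon_vma);
 *		... walk anon_vma->rb_root ...
 *		anon_vma_unlock_read(anon_vma);
 *		put_anon_vma(anon_vma);
 *	}
 *
 * Writers that modify the interval tree pair anon_vma_lock_write() with
 * anon_vma_unlock_write(); all variants operate on the root anon_vma's
 * rwsem.
 */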
/*
 * anon_vma helper functions.
 */
void anon_vma_init(void);	/* create anon_vma_cachep */
int  __anon_vma_prepare(struct vm_area_struct *);
void unlink_anon_vmas(struct vm_area_struct *);
int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);

static inline int anon_vma_prepare(struct vm_area_struct *vma)
{
	if (likely(vma->anon_vma))
		return 0;

	return __anon_vma_prepare(vma);
}

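/*
 * Illustrative sketch (not part of this header): a fault path that is
 * about to install the first anonymous page into a VMA makes sure an
 * anon_vma is attached first; the common case is just a pointer check:
 *
 *	if (unlikely(anon_vma_prepare(vma)))
 *		return VM_FAULT_OOM;
 *	... allocate the folio and map it ...
 */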
static inline void anon_vma_merge(struct vm_area_struct *vma,
		struct vm_area_struct *next)
{
	VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
	unlink_anon_vmas(next);
}

struct anon_vma *folio_get_anon_vma(struct folio *folio);

/* RMAP flags, currently only relevant for some anon rmap operations. */
typedef int __bitwise rmap_t;

/*
 * No special request: A mapped anonymous (sub)page is possibly shared between
 * processes.
 */
#define RMAP_NONE		((__force rmap_t)0)

/* The anonymous (sub)page is exclusive to a single process. */
#define RMAP_EXCLUSIVE		((__force rmap_t)BIT(0))

/*
 * Internally, we're using an enum to specify the granularity. We make the
 * compiler emit specialized code for each granularity.
 */
enum rmap_level {
	RMAP_LEVEL_PTE = 0,
	RMAP_LEVEL_PMD,
};

static inline void __folio_rmap_sanity_checks(struct folio *folio,
		struct page *page, int nr_pages, enum rmap_level level)
{
	/* hugetlb folios are handled separately. */
	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);

	/*
	 * TODO: we get driver-allocated folios that have nothing to do with
	 * the rmap using vm_insert_page(); therefore, we cannot assume that
	 * folio_test_large_rmappable() holds for large folios. We should
	 * handle any desired mapcount+stats accounting for these folios in
	 * VM_MIXEDMAP VMAs separately, and then sanity-check here that
	 * we really only get rmappable folios.
	 */

	VM_WARN_ON_ONCE(nr_pages <= 0);
	VM_WARN_ON_FOLIO(page_folio(page) != folio, folio);
	VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio);

	switch (level) {
	case RMAP_LEVEL_PTE:
		break;
	case RMAP_LEVEL_PMD:
		/*
		 * We don't support folios larger than a single PMD yet. So
		 * when RMAP_LEVEL_PMD is set, we assume that we are creating
		 * a single "entire" mapping of the folio.
		 */
		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
		break;
	default:
		VM_WARN_ON_ONCE(true);
	}
}

/*
 * rmap interfaces called when adding or removing pte of page
 */
void folio_move_anon_rmap(struct folio *, struct vm_area_struct *);
void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages,
		struct vm_area_struct *, unsigned long address, rmap_t flags);
#define folio_add_anon_rmap_pte(folio, page, vma, address, flags) \
	folio_add_anon_rmap_ptes(folio, page, 1, vma, address, flags)
void folio_add_anon_rmap_pmd(struct folio *, struct page *,
		struct vm_area_struct *, unsigned long address, rmap_t flags);
void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
		unsigned long address);
void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
		struct vm_area_struct *);
#define folio_add_file_rmap_pte(folio, page, vma) \
	folio_add_file_rmap_ptes(folio, page, 1, vma)
void folio_add_file_rmap_pmd(struct folio *, struct page *,
		struct vm_area_struct *);
void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages,
		struct vm_area_struct *);
#define folio_remove_rmap_pte(folio, page, vma) \
	folio_remove_rmap_ptes(folio, page, 1, vma)
void folio_remove_rmap_pmd(struct folio *, struct page *,
		struct vm_area_struct *);

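/*
 * Illustrative sketch (not part of this header), loosely following what
 * the anonymous fault path does: after allocating a new folio, the rmap
 * is set up before the PTE is installed, and torn down again when the
 * mapping is zapped:
 *
 *	folio_add_new_anon_rmap(folio, vma, addr);
 *	folio_add_lru_vma(folio, vma);
 *	set_pte_at(vma->vm_mm, addr, pte, entry);
 *	...
 *	folio_remove_rmap_pte(folio, page, vma);	when zapping the PTE
 *
 * File-backed mappings use folio_add_file_rmap_pte()/..._pmd() instead,
 * and pages that already have an rmap and are re-mapped into a VMA
 * (e.g. COW reuse, swap-in) use folio_add_anon_rmap_pte(), passing
 * RMAP_EXCLUSIVE when the page is exclusive to this process.
 */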
void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
		unsigned long address, rmap_t flags);
void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
		unsigned long address);

/* See folio_try_dup_anon_rmap_*() */
static inline int hugetlb_try_dup_anon_rmap(struct folio *folio,
		struct vm_area_struct *vma)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);

	if (PageAnonExclusive(&folio->page)) {
		if (unlikely(folio_needs_cow_for_dma(vma, folio)))
			return -EBUSY;
		ClearPageAnonExclusive(&folio->page);
	}
	atomic_inc(&folio->_entire_mapcount);
	return 0;
}

/* See folio_try_share_anon_rmap_*() */
static inline int hugetlb_try_share_anon_rmap(struct folio *folio)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
	VM_WARN_ON_FOLIO(!PageAnonExclusive(&folio->page), folio);

	/* Paired with the memory barrier in try_grab_folio(). */
	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
		smp_mb();

	if (unlikely(folio_maybe_dma_pinned(folio)))
		return -EBUSY;
	ClearPageAnonExclusive(&folio->page);

	/*
	 * This is conceptually a smp_wmb() paired with the smp_rmb() in
	 * gup_must_unshare().
	 */
	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
		smp_mb__after_atomic();
	return 0;
}

static inline void hugetlb_add_file_rmap(struct folio *folio)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);

	atomic_inc(&folio->_entire_mapcount);
}

static inline void hugetlb_remove_rmap(struct folio *folio)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);

	atomic_dec(&folio->_entire_mapcount);
}

static __always_inline void __folio_dup_file_rmap(struct folio *folio,
		struct page *page, int nr_pages, enum rmap_level level)
{
	__folio_rmap_sanity_checks(folio, page, nr_pages, level);

	switch (level) {
	case RMAP_LEVEL_PTE:
		do {
			atomic_inc(&page->_mapcount);
		} while (page++, --nr_pages > 0);
		break;
	case RMAP_LEVEL_PMD:
		atomic_inc(&folio->_entire_mapcount);
		break;
	}
}

/**
 * folio_dup_file_rmap_ptes - duplicate PTE mappings of a page range of a folio
 * @folio:	The folio to duplicate the mappings of
 * @page:	The first page to duplicate the mappings of
 * @nr_pages:	The number of pages of which the mapping will be duplicated
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock.
 */
static inline void folio_dup_file_rmap_ptes(struct folio *folio,
		struct page *page, int nr_pages)
{
	__folio_dup_file_rmap(folio, page, nr_pages, RMAP_LEVEL_PTE);
}
#define folio_dup_file_rmap_pte(folio, page) \
	folio_dup_file_rmap_ptes(folio, page, 1)

/**
 * folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio
 * @folio:	The folio to duplicate the mapping of
 * @page:	The first page to duplicate the mapping of
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock.
 */
static inline void folio_dup_file_rmap_pmd(struct folio *folio,
		struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	/* A PMD mapping is accounted via the entire mapcount, not per page. */
	__folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, RMAP_LEVEL_PMD);
#else
	WARN_ON_ONCE(true);
#endif
}

static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *src_vma,
		enum rmap_level level)
{
	bool maybe_pinned;
	int i;

	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
	__folio_rmap_sanity_checks(folio, page, nr_pages, level);

	/*
	 * If this folio may have been pinned by the parent process,
	 * don't allow to duplicate the mappings but instead require to e.g.,
	 * copy the subpage immediately for the child so that we'll always
	 * guarantee the pinned folio won't be randomly replaced in the
	 * future on write faults.
	 */
	maybe_pinned = likely(!folio_is_device_private(folio)) &&
		       unlikely(folio_needs_cow_for_dma(src_vma, folio));

	/*
	 * No need to check+clear for already shared PTEs/PMDs of the
	 * folio. But if any page is PageAnonExclusive, we must fallback to
	 * copying if the folio maybe pinned.
	 */
	switch (level) {
	case RMAP_LEVEL_PTE:
		if (unlikely(maybe_pinned)) {
			for (i = 0; i < nr_pages; i++)
				if (PageAnonExclusive(page + i))
					return -EBUSY;
		}
		do {
			if (PageAnonExclusive(page))
				ClearPageAnonExclusive(page);
			atomic_inc(&page->_mapcount);
		} while (page++, --nr_pages > 0);
		break;
	case RMAP_LEVEL_PMD:
		if (PageAnonExclusive(page)) {
			if (unlikely(maybe_pinned))
				return -EBUSY;
			ClearPageAnonExclusive(page);
		}
		atomic_inc(&folio->_entire_mapcount);
		break;
	}
	return 0;
}

/**
 * folio_try_dup_anon_rmap_ptes - try duplicating PTE mappings of a page range
 *				  of a folio
 * @folio:	The folio to duplicate the mappings of
 * @page:	The first page to duplicate the mappings of
 * @nr_pages:	The number of pages of which the mapping will be duplicated
 * @src_vma:	The vm area from which the mappings are duplicated
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock and the
 * vma->vma_mm->write_protect_seq.
 *
 * Duplicating the mappings can only fail if the folio may be pinned; device
 * private folios cannot get pinned and consequently this function cannot fail
 * for them.
 *
 * If duplicating the mappings succeeded, the duplicated PTEs have to be R/O in
 * the parent and the child. They must *not* be writable after this call
 * succeeded.
 *
 * Returns 0 if duplicating the mappings succeeded. Returns -EBUSY otherwise.
 */
static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *src_vma)
{
	return __folio_try_dup_anon_rmap(folio, page, nr_pages, src_vma,
					 RMAP_LEVEL_PTE);
}
#define folio_try_dup_anon_rmap_pte(folio, page, vma) \
	folio_try_dup_anon_rmap_ptes(folio, page, 1, vma)

/**
 * folio_try_dup_anon_rmap_pmd - try duplicating a PMD mapping of a page range
 *				 of a folio
 * @folio:	The folio to duplicate the mapping of
 * @page:	The first page to duplicate the mapping of
 * @src_vma:	The vm area from which the mapping is duplicated
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock and the
 * vma->vma_mm->write_protect_seq.
 *
 * Duplicating the mapping can only fail if the folio may be pinned; device
 * private folios cannot get pinned and consequently this function cannot fail
 * for them.
 *
 * If duplicating the mapping succeeds, the duplicated PMD has to be R/O in
 * the parent and the child. They must *not* be writable after this call
 * succeeded.
 *
 * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise.
 */
static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
		struct page *page, struct vm_area_struct *src_vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, src_vma,
					 RMAP_LEVEL_PMD);
#else
	WARN_ON_ONCE(true);
	return -EBUSY;
#endif
}

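/*
 * Illustrative sketch (not part of this header), roughly the pattern the
 * fork() page-table copy uses for a PTE-mapped anonymous page: try to
 * duplicate the mapping; if that fails because the folio may be
 * DMA-pinned, fall back to copying the page for the child (the fallback
 * helper below is hypothetical):
 *
 *	if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma)))
 *		return copy_page_for_child(...);
 *	... wrprotect the parent PTE and install the R/O PTE in the child ...
 */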
static __always_inline int __folio_try_share_anon_rmap(struct folio *folio,
		struct page *page, int nr_pages, enum rmap_level level)
{
	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
	VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio);
	__folio_rmap_sanity_checks(folio, page, nr_pages, level);

	/* device private folios cannot get pinned via GUP. */
	if (unlikely(folio_is_device_private(folio))) {
		ClearPageAnonExclusive(page);
		return 0;
	}

	/*
	 * We have to make sure that when we clear PageAnonExclusive, that
	 * the page is not pinned and that concurrent GUP-fast won't succeed in
	 * concurrently pinning the page.
	 *
	 * Conceptually, PageAnonExclusive clearing consists of:
	 * (A1) Clear PTE
	 * (A2) Check if the page is pinned; back off if so.
	 * (A3) Clear PageAnonExclusive
	 * (A4) Restore PTE (optional, but certainly not writable)
	 *
	 * When clearing PageAnonExclusive, we cannot possibly map the page
	 * writable again, because anon pages that may be shared must never
	 * be writable. So in any case, if the PTE was writable it cannot
	 * be writable anymore afterwards and there would be a PTE change. Only
	 * if the PTE wasn't writable, there might not be a PTE change.
	 *
	 * Conceptually, GUP-fast pinning of an anon page consists of:
	 * (B1) Read the PTE
	 * (B2) FOLL_WRITE: check if the PTE is not writable; back off if so.
	 * (B3) Pin the mapped page
	 * (B4) Check if the PTE changed by re-reading it; back off if so.
	 * (B5) If the original PTE is not writable, check if
	 *	PageAnonExclusive is not set; back off if so.
	 *
	 * If the PTE was writable, we only have to make sure that GUP-fast
	 * observes a PTE change and properly backs off.
	 *
	 * If the PTE was not writable, we have to make sure that GUP-fast either
	 * detects a (temporary) PTE change or that PageAnonExclusive is cleared
	 * and properly backs off.
	 *
	 * Consequently, when clearing PageAnonExclusive(), we have to make
	 * sure that (A1), (A2)/(A3) and (A4) happen in the right memory
	 * order. In GUP-fast pinning code, we have to make sure that (B3),(B4)
	 * and (B5) happen in the right memory order.
	 *
	 * We assume that there might not be a memory barrier after
	 * clearing/invalidating the PTE (A1) and before restoring the PTE (A4),
	 * so we use explicit ones here.
	 */

	/* Paired with the memory barrier in try_grab_folio(). */
	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
		smp_mb();

	if (unlikely(folio_maybe_dma_pinned(folio)))
		return -EBUSY;
	ClearPageAnonExclusive(page);

	/*
	 * This is conceptually a smp_wmb() paired with the smp_rmb() in
	 * gup_must_unshare().
	 */
	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
		smp_mb__after_atomic();
	return 0;
}

/**
 * folio_try_share_anon_rmap_pte - try marking an exclusive anonymous page
 *				   mapped by a PTE possibly shared to prepare
 *				   for KSM or temporary unmapping
 * @folio:	The folio to share a mapping of
 * @page:	The mapped exclusive page
 *
 * The caller needs to hold the page table lock and has to have the page table
 * entries cleared/invalidated.
 *
 * This is similar to folio_try_dup_anon_rmap_pte(), however, not used during
 * fork() to duplicate mappings, but instead to prepare for KSM or temporarily
 * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pte().
 *
 * Marking the mapped page shared can only fail if the folio maybe pinned;
 * device private folios cannot get pinned and consequently this function cannot
 * fail.
 *
 * Returns 0 if marking the mapped page possibly shared succeeded. Returns
 * -EBUSY otherwise.
 */
static inline int folio_try_share_anon_rmap_pte(struct folio *folio,
		struct page *page)
{
	return __folio_try_share_anon_rmap(folio, page, 1, RMAP_LEVEL_PTE);
}

/**
 * folio_try_share_anon_rmap_pmd - try marking an exclusive anonymous page
 *				   range mapped by a PMD possibly shared to
 *				   prepare for temporary unmapping
 * @folio:	The folio to share the mapping of
 * @page:	The first page to share the mapping of
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock and has to have the page table
 * entries cleared/invalidated.
 *
 * This is similar to folio_try_dup_anon_rmap_pmd(), however, not used during
 * fork() to duplicate a mapping, but instead to prepare for temporarily
 * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pmd().
 *
 * Marking the mapped pages shared can only fail if the folio maybe pinned;
 * device private folios cannot get pinned and consequently this function cannot
 * fail.
 *
 * Returns 0 if marking the mapped pages possibly shared succeeded. Returns
 * -EBUSY otherwise.
 */
static inline int folio_try_share_anon_rmap_pmd(struct folio *folio,
		struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR,
					   RMAP_LEVEL_PMD);
#else
	WARN_ON_ONCE(true);
	return -EBUSY;
#endif
}

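/*
 * Illustrative sketch (not part of this header), roughly the pattern used
 * when unmapping an exclusive anonymous page for swap or migration: the
 * PTE is cleared first, then the exclusive marker is dropped; if the folio
 * may be pinned, the PTE is restored and the page stays mapped:
 *
 *	pteval = ptep_clear_flush(vma, address, pte);
 *	if (folio_try_share_anon_rmap_pte(folio, page)) {
 *		set_pte_at(mm, address, pte, pteval);	back off
 *		return false;
 *	}
 *	... install the swap/migration entry, then folio_remove_rmap_pte() ...
 */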
/*
 * Called from mm/vmscan.c to handle paging out
 */
int folio_referenced(struct folio *, int is_locked,
			struct mem_cgroup *memcg, unsigned long *vm_flags);

void try_to_migrate(struct folio *folio, enum ttu_flags flags);
void try_to_unmap(struct folio *, enum ttu_flags flags);

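/*
 * Illustrative sketch (not part of this header): reclaim unmaps a locked
 * folio with flags from enum ttu_flags above, e.g. batching TLB flushes:
 *
 *	try_to_unmap(folio, TTU_BATCH_FLUSH);
 *	if (folio_mapped(folio))
 *		... still mapped somewhere; keep the folio ...
 *
 * Both try_to_unmap() and try_to_migrate() return void; callers check
 * folio_mapped() afterwards to see whether all mappings were removed.
 */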
int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
				unsigned long end, struct page **pages,
				void *arg);

/* Avoid racy checks */
#define PVMW_SYNC		(1 << 0)
/* Look for migration entries rather than present PTEs */
#define PVMW_MIGRATION		(1 << 1)

struct page_vma_mapped_walk {
	unsigned long pfn;
	unsigned long nr_pages;
	pgoff_t pgoff;
	struct vm_area_struct *vma;
	unsigned long address;
	pmd_t *pmd;
	pte_t *pte;
	spinlock_t *ptl;
	unsigned int flags;
};

#define DEFINE_PAGE_VMA_WALK(name, _page, _vma, _address, _flags)	\
	struct page_vma_mapped_walk name = {				\
		.pfn = page_to_pfn(_page),				\
		.nr_pages = compound_nr(_page),				\
		.pgoff = page_to_pgoff(_page),				\
		.vma = _vma,						\
		.address = _address,					\
		.flags = _flags,					\
	}

#define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags)	\
	struct page_vma_mapped_walk name = {				\
		.pfn = folio_pfn(_folio),				\
		.nr_pages = folio_nr_pages(_folio),			\
		.pgoff = folio_pgoff(_folio),				\
		.vma = _vma,						\
		.address = _address,					\
		.flags = _flags,					\
	}

static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
{
	/* HugeTLB pte is set to the relevant page table entry without pte_mapped. */
	if (pvmw->pte && !is_vm_hugetlb_page(pvmw->vma))
		pte_unmap(pvmw->pte);
	if (pvmw->ptl)
		spin_unlock(pvmw->ptl);
}

bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);

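/*
 * Illustrative sketch (not part of this header): the usual way to visit
 * every PTE/PMD that maps a folio within one VMA:
 *
 *	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
 *
 *	while (page_vma_mapped_walk(&pvmw)) {
 *		if (pvmw.pte) {
 *			... a PTE mapping; pvmw.ptl is held ...
 *		} else {
 *			... a PMD mapping (pvmw.pmd) ...
 *		}
 *	}
 *
 * page_vma_mapped_walk() drops the PTL and unmaps the PTE before
 * returning false; page_vma_mapped_walk_done() is only needed when
 * breaking out of the loop early.
 */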
/*
 * Used by swapoff to help locate where page is expected in vma.
 */
unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);

/*
 * Cleans the PTEs of shared mappings.
 * (and since clean PTEs should also be readonly, write protects them too)
 *
 * returns the number of cleaned PTEs.
 */
int folio_mkclean(struct folio *);

int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
		      struct vm_area_struct *vma);

void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);

int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);

/*
 * rmap_walk_control: To control rmap traversing for specific needs
 *
 * arg: passed to rmap_one() and invalid_vma()
 * try_lock: bail out if the rmap lock is contended
 * contended: indicate the rmap traversal bailed out due to lock contention
 * rmap_one: executed on each vma where page is mapped
 * done: for checking traversing termination condition
 * anon_lock: for getting anon_lock by optimized way rather than default
 * invalid_vma: for skipping uninterested vma
 */
struct rmap_walk_control {
	void *arg;
	bool try_lock;
	bool contended;
	/*
	 * Return false if page table scanning in rmap_walk should be stopped.
	 * Otherwise, return true.
	 */
	bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma,
					unsigned long addr, void *arg);
	int (*done)(struct folio *folio);
	struct anon_vma *(*anon_lock)(struct folio *folio,
				      struct rmap_walk_control *rwc);
	bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};

void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc);
void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc);
struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
					  struct rmap_walk_control *rwc);

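/*
 * Illustrative sketch (not part of this header): callers fill in a control
 * structure and let rmap_walk() invoke rmap_one() for each VMA that maps
 * the folio, typically with the folio locked. The callback and state names
 * below are hypothetical:
 *
 *	struct rmap_walk_control rwc = {
 *		.rmap_one = my_check_one_vma,
 *		.arg = &my_state,
 *	};
 *
 *	rmap_walk(folio, &rwc);
 *
 * rmap_walk_locked() is for callers that already hold the relevant
 * anon_vma / i_mmap rmap lock (see TTU_RMAP_LOCKED).
 */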
#else	/* !CONFIG_MMU */

#define anon_vma_init()		do {} while (0)
#define anon_vma_prepare(vma)	(0)

static inline int folio_referenced(struct folio *folio, int is_locked,
				  struct mem_cgroup *memcg,
				  unsigned long *vm_flags)
{
	*vm_flags = 0;
	return 0;
}

static inline void try_to_unmap(struct folio *folio, enum ttu_flags flags)
{
}

static inline int folio_mkclean(struct folio *folio)
{
	return 0;
}
#endif	/* CONFIG_MMU */

static inline int page_mkclean(struct page *page)
{
	return folio_mkclean(page_folio(page));
}
#endif	/* _LINUX_RMAP_H */