// SPDX-License-Identifier: GPL-2.0
/*
 * mm/pgtable-generic.c
 *
 * Generic pgtable methods declared in linux/pgtable.h
 *
 * Copyright (C) 2010 Linus Torvalds
 */

#include <linux/pagemap.h>
#include <linux/hugetlb.h>
#include <linux/pgtable.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mm_inline.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>

/*
 * If a p?d_bad entry is found while walking page tables, report the
 * error before resetting the entry to p?d_none. Called (rarely) from
 * the p?d_none_or_clear_bad macros.
 */

void pgd_clear_bad(pgd_t *pgd)
{
        pgd_ERROR(*pgd);
        pgd_clear(pgd);
}

#ifndef __PAGETABLE_P4D_FOLDED
void p4d_clear_bad(p4d_t *p4d)
{
        p4d_ERROR(*p4d);
        p4d_clear(p4d);
}
#endif

#ifndef __PAGETABLE_PUD_FOLDED
void pud_clear_bad(pud_t *pud)
{
        pud_ERROR(*pud);
        pud_clear(pud);
}
#endif

/*
 * Note that the pmd variant below can't be stubbed out like the p4d/pud
 * variants above: pmd folding is special, and pmd_* macros typically
 * refer to the upper level even when folded.
 */
void pmd_clear_bad(pmd_t *pmd)
{
        pmd_ERROR(*pmd);
        pmd_clear(pmd);
}

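/*
 * Illustrative sketch, not part of this file: how a typical page-table
 * walker reaches the p?d_clear_bad() helpers above, via the
 * p?d_none_or_clear_bad() macros. example_walk_pte_range() is a
 * hypothetical name.
 */
static inline void example_walk_pte_range(pmd_t *pmd, unsigned long addr)
{
        /* On a bad entry: report via pmd_ERROR(), reset to pmd_none, skip */
        if (pmd_none_or_clear_bad(pmd))
                return;
        /* ... safe to map and walk the pte level below *pmd here ... */
}
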
#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
/*
 * Only sets the access flags (dirty, accessed) and write permission.
 * Furthermore, we know it always gets set to a "more permissive"
 * setting, which allows most architectures to optimize this. We return
 * whether the PTE actually changed, which in turn instructs the caller
 * to do things like update_mmu_cache. This used to be done in the
 * caller, but sparc needs minor faults to force that call on sun4c, so
 * we changed this macro slightly.
 */
int ptep_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pte_t *ptep,
                          pte_t entry, int dirty)
{
        int changed = !pte_same(ptep_get(ptep), entry);
        if (changed) {
                set_pte_at(vma->vm_mm, address, ptep, entry);
                flush_tlb_fix_spurious_fault(vma, address, ptep);
        }
        return changed;
}
#endif

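/*
 * Illustrative sketch, not part of this file: the usual fault-path
 * pattern around ptep_set_access_flags(), per the comment above.
 * example_mark_accessed_dirty() is a hypothetical helper; the pte
 * lock is assumed held by the caller.
 */
static inline void example_mark_accessed_dirty(struct vm_area_struct *vma,
                                               unsigned long address,
                                               pte_t *ptep, int write)
{
        pte_t entry = pte_mkyoung(ptep_get(ptep));

        if (write)
                entry = pte_mkdirty(entry);
        /* Only if the PTE actually changed does the MMU cache need updating */
        if (ptep_set_access_flags(vma, address, ptep, entry, write))
                update_mmu_cache(vma, address, ptep);
}
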
#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
int ptep_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pte_t *ptep)
{
        int young;
        young = ptep_test_and_clear_young(vma, address, ptep);
        if (young)
                flush_tlb_page(vma, address);
        return young;
}
#endif

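/*
 * Illustrative sketch, not part of this file: reclaim-style page aging
 * built on the helper above. example_page_was_referenced() is a
 * hypothetical name.
 */
static inline int example_page_was_referenced(struct vm_area_struct *vma,
                                              unsigned long address,
                                              pte_t *ptep)
{
        /*
         * Unlike bare ptep_test_and_clear_young(), the _flush variant
         * also shoots down the TLB entry, so a later access re-sets the
         * young bit instead of being served from a stale TLB entry.
         */
        return ptep_clear_flush_young(vma, address, ptep);
}
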
#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
                       pte_t *ptep)
{
        struct mm_struct *mm = vma->vm_mm;
        pte_t pte;
        pte = ptep_get_and_clear(mm, address, ptep);
        if (pte_accessible(mm, pte))
                flush_tlb_page(vma, address);
        return pte;
}
#endif

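/*
 * Illustrative sketch, not part of this file: an unmap path uses
 * ptep_clear_flush() to atomically clear the PTE and shoot down any
 * TLB entry before touching the underlying page. example_unmap_one()
 * is a hypothetical name; the pte lock is assumed held.
 */
static inline void example_unmap_one(struct vm_area_struct *vma,
                                     unsigned long address, pte_t *ptep)
{
        pte_t pteval = ptep_clear_flush(vma, address, ptep);

        /*
         * After the flush no CPU can set dirty/accessed on this pte, so
         * pte_dirty(pteval) is stable and can be transferred to the page
         * before it is freed or swapped.
         */
        if (pte_dirty(pteval))
                set_page_dirty(pte_page(pteval));
}
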
#ifdef CONFIG_TRANSPARENT_HUGEPAGE

#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
int pmdp_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pmd_t *pmdp,
                          pmd_t entry, int dirty)
{
        int changed = !pmd_same(*pmdp, entry);
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        if (changed) {
                set_pmd_at(vma->vm_mm, address, pmdp, entry);
                flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        }
        return changed;
}
#endif

#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
int pmdp_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pmd_t *pmdp)
{
        int young;
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        young = pmdp_test_and_clear_young(vma, address, pmdp);
        if (young)
                flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        return young;
}
#endif

#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
                            pmd_t *pmdp)
{
        pmd_t pmd;
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
                  !pmd_devmap(*pmdp));
        pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
        flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        return pmd;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
                            pud_t *pudp)
{
        pud_t pud;

        VM_BUG_ON(address & ~HPAGE_PUD_MASK);
        VM_BUG_ON(!pud_trans_huge(*pudp) && !pud_devmap(*pudp));
        pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp);
        flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
        return pud;
}
#endif
#endif

#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                pgtable_t pgtable)
{
        assert_spin_locked(pmd_lockptr(mm, pmdp));

        /* FIFO */
        if (!pmd_huge_pte(mm, pmdp))
                INIT_LIST_HEAD(&pgtable->lru);
        else
                list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru);
        pmd_huge_pte(mm, pmdp) = pgtable;
}
#endif

#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
/* no "address" argument, so this destroys page coloring on some arches */
pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
        pgtable_t pgtable;

        assert_spin_locked(pmd_lockptr(mm, pmdp));

        /* FIFO */
        pgtable = pmd_huge_pte(mm, pmdp);
        pmd_huge_pte(mm, pmdp) = list_first_entry_or_null(&pgtable->lru,
                                                          struct page, lru);
        if (pmd_huge_pte(mm, pmdp))
                list_del(&pgtable->lru);
        return pgtable;
}
#endif

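/*
 * Illustrative sketch, not part of this file: how deposit/withdraw pair
 * up in THP code. When a huge pmd is installed, a preallocated pte page
 * table is deposited; if the huge pmd is later split (or zapped), the
 * table is withdrawn and reused, so splitting can never fail for want
 * of memory. example_install_huge_pmd() is a hypothetical name; the pmd
 * lock is assumed held.
 */
static inline void example_install_huge_pmd(struct mm_struct *mm,
                                            unsigned long haddr, pmd_t *pmd,
                                            pmd_t entry, pgtable_t pgtable)
{
        pgtable_trans_huge_deposit(mm, pmd, pgtable);   /* stash pte table */
        set_pmd_at(mm, haddr, pmd, entry);              /* map the huge page */
        mm_inc_nr_ptes(mm);
}
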
#ifndef __HAVE_ARCH_PMDP_INVALIDATE
pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                      pmd_t *pmdp)
{
        pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
        flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        return old;
}
#endif

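/*
 * Illustrative sketch, not part of this file: the split-time pattern
 * around pmdp_invalidate(). Marking the pmd invalid (but not none)
 * stops the hardware walker while the pte table underneath is being
 * populated, and the returned old pmd preserves the dirty/young bits
 * so they can be propagated to the ptes. example_start_split() is a
 * hypothetical name; the pmd lock is assumed held.
 */
static inline pmd_t example_start_split(struct vm_area_struct *vma,
                                        unsigned long haddr, pmd_t *pmd)
{
        pmd_t old_pmd = pmdp_invalidate(vma, haddr, pmd);

        /* ... fill the deposited pte table from old_pmd, then ... */
        /* ... withdraw it and pmd_populate() to finish the split ... */
        return old_pmd;
}
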
#ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD
pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
                         pmd_t *pmdp)
{
        return pmdp_invalidate(vma, address, pmdp);
}
#endif

#ifndef pmdp_collapse_flush
pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
                          pmd_t *pmdp)
{
        /*
         * The pmd and hugepage pte formats are the same, so we can use
         * the same function.
         */
        pmd_t pmd;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        VM_BUG_ON(pmd_trans_huge(*pmdp));
        pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);

        /* collapse entails shooting down ptes, not the pmd */
        flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        return pmd;
}
#endif

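/*
 * Illustrative sketch, not part of this file: khugepaged-style collapse.
 * Note the pte-level flush inside pmdp_collapse_flush(): the pmd being
 * cleared pointed to a pte table, so it is the pte-level TLB entries,
 * not a huge-pmd entry, that must be shot down. example_collapse_huge()
 * is a hypothetical name; mmap_lock for write and the pmd lock are
 * assumed held.
 */
static inline pmd_t example_collapse_huge(struct vm_area_struct *vma,
                                          unsigned long haddr, pmd_t *pmd,
                                          pmd_t huge_entry)
{
        pmd_t old = pmdp_collapse_flush(vma, haddr, pmd);

        /*
         * The returned old pmd still points at the detached pte table,
         * which the caller must deposit or free (see pte_free_defer()
         * below); install the huge mapping in its place.
         */
        set_pmd_at(vma->vm_mm, haddr, pmd, huge_entry);
        return old;
}
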
/* an arch may define pte_free_defer in asm/pgalloc.h for its own implementation */
#ifndef pte_free_defer
static void pte_free_now(struct rcu_head *head)
{
        struct page *page;

        page = container_of(head, struct page, rcu_head);
        pte_free(NULL /* mm not passed and not used */, (pgtable_t)page);
}

void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
{
        struct page *page;

        page = pgtable;
        call_rcu(&page->rcu_head, pte_free_now);
}
#endif /* pte_free_defer */
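
/*
 * Illustrative sketch, not part of this file: a caller that has just
 * disconnected a pte table from its pmd (as when khugepaged retracts
 * one) frees the table via pte_free_defer(), so lockless walkers still
 * inside rcu_read_lock() never see the page reused under them.
 * example_retract_page_table() is a hypothetical name; the necessary
 * locks are assumed held.
 */
static inline void example_retract_page_table(struct vm_area_struct *vma,
                                              pmd_t *pmd, unsigned long addr)
{
        pmd_t pmdval = pmdp_collapse_flush(vma, addr, pmd);

        /* The actual pte_free() happens only after an RCU grace period */
        pte_free_defer(vma->vm_mm, pmd_pgtable(pmdval));
}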
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#if defined(CONFIG_GUP_GET_PXX_LOW_HIGH) && \
        (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RCU))
/*
 * See the comment above ptep_get_lockless() in include/linux/pgtable.h:
 * the barriers in pmdp_get_lockless() cannot guarantee that the value in
 * pmd_high actually belongs with the value in pmd_low; but holding interrupts
 * off blocks the TLB flush between present updates, which guarantees that a
 * successful __pte_offset_map() points to a page from matched halves.
 */
static unsigned long pmdp_get_lockless_start(void)
{
        unsigned long irqflags;

        local_irq_save(irqflags);
        return irqflags;
}
static void pmdp_get_lockless_end(unsigned long irqflags)
{
        local_irq_restore(irqflags);
}
#else
static unsigned long pmdp_get_lockless_start(void) { return 0; }
static void pmdp_get_lockless_end(unsigned long irqflags) { }
#endif

pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
{
        unsigned long irqflags;
        pmd_t pmdval;

        rcu_read_lock();
        irqflags = pmdp_get_lockless_start();
        pmdval = pmdp_get_lockless(pmd);
        pmdp_get_lockless_end(irqflags);

        if (pmdvalp)
                *pmdvalp = pmdval;
        if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
                goto nomap;
        if (unlikely(pmd_trans_huge(pmdval) || pmd_devmap(pmdval)))
                goto nomap;
        if (unlikely(pmd_bad(pmdval))) {
                pmd_clear_bad(pmd);
                goto nomap;
        }
        return __pte_map(&pmdval, addr);
nomap:
        rcu_read_unlock();
        return NULL;
}

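/*
 * Illustrative sketch, not part of this file: the pte_offset_map()/
 * pte_unmap() pattern built on __pte_offset_map() above. The caller
 * must tolerate a NULL return (no page table at *pmd) and must not
 * sleep between map and unmap, since an RCU read lock is held.
 * example_read_pte() is a hypothetical name.
 */
static inline pte_t example_read_pte(pmd_t *pmd, unsigned long addr)
{
        pte_t pteval = __pte(0);
        pte_t *pte = pte_offset_map(pmd, addr);

        if (pte) {
                pteval = ptep_get(pte);
                pte_unmap(pte); /* kunmap if HIGHPTE, and rcu_read_unlock() */
        }
        return pteval;
}
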
pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
                             unsigned long addr, spinlock_t **ptlp)
{
        pmd_t pmdval;
        pte_t *pte;

        pte = __pte_offset_map(pmd, addr, &pmdval);
        if (likely(pte))
                *ptlp = pte_lockptr(mm, &pmdval);
        return pte;
}

/*
 * pte_offset_map_lock(mm, pmd, addr, ptlp), and its internal implementation
 * __pte_offset_map_lock() below, are usually called with the pmd pointer for
 * addr, reached by walking down the mm's pgd, p4d, pud for addr: either while
 * holding mmap_lock or vma lock for read or for write; or in truncate or rmap
 * context, while holding file's i_mmap_lock or anon_vma lock for read (or for
 * write). In a few cases, it may be used with pmd pointing to a pmd_t already
 * copied to or constructed on the stack.
 *
 * When successful, it returns the pte pointer for addr, with its page table
 * kmapped if necessary (when CONFIG_HIGHPTE), and locked against concurrent
 * modification by software, with a pointer to that spinlock in ptlp (in some
 * configs mm->page_table_lock, in SPLIT_PTLOCK configs a spinlock in the
 * table's struct page). Use pte_unmap_unlock(pte, ptl) to unlock and unmap
 * afterwards.
 *
 * But it is unsuccessful, returning NULL with *ptlp unchanged, if there is no
 * page table at *pmd: if, for example, the page table has just been removed,
 * or replaced by the huge pmd of a THP. (When successful, *pmd is rechecked
 * after acquiring the ptlock, and retried internally if it changed: so that a
 * page table can be safely removed or replaced by THP while holding its lock.)
 *
 * pte_offset_map(pmd, addr), and its internal helper __pte_offset_map() above,
 * just returns the pte pointer for addr, with its page table kmapped if
 * necessary; or NULL if there is no page table at *pmd. It does not attempt
 * to lock the page table, so cannot normally be used when the page table is
 * to be updated, or when entries read must be stable. But it does take
 * rcu_read_lock(): so that even when the page table is racily removed, it
 * remains a valid though empty and disconnected table, until pte_unmap(pte)
 * unmaps it and rcu_read_unlock()s afterwards.
 *
 * pte_offset_map_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map();
 * but when successful, it also outputs a pointer to the spinlock in ptlp - as
 * pte_offset_map_lock() does, but in this case without locking it. This helps
 * the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time
 * act on a changed *pmd: pte_offset_map_nolock() provides the correct spinlock
 * pointer for the page table that it returns. In principle, the caller should
 * recheck *pmd once the lock is taken; in practice, no callsite needs that -
 * either the mmap_lock for write, or a pte_same() check on contents, is enough.
 *
 * Note that free_pgtables(), used after unmapping detached vmas, or when
 * exiting the whole mm, does not take the page table lock before freeing a
 * page table, and may not use RCU at all: "outsiders" like khugepaged should
 * avoid pte_offset_map() and co once the vma is detached from mm or mm_users
 * is zero.
 */
pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
                             unsigned long addr, spinlock_t **ptlp)
{
        spinlock_t *ptl;
        pmd_t pmdval;
        pte_t *pte;
again:
        pte = __pte_offset_map(pmd, addr, &pmdval);
        if (unlikely(!pte))
                return pte;
        ptl = pte_lockptr(mm, &pmdval);
        spin_lock(ptl);
        if (likely(pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
                *ptlp = ptl;
                return pte;
        }
        pte_unmap_unlock(pte, ptl);
        goto again;
}
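
/*
 * Illustrative sketch, not part of this file: the canonical locked walk
 * described in the comment above __pte_offset_map_lock().
 * example_touch_range() is a hypothetical walker: addr and end are
 * assumed PAGE_SIZE-aligned within one pmd; the caller holds the
 * mmap or vma lock, and must use pte_unmap_unlock() on every exit path.
 */
static inline void example_touch_range(struct mm_struct *mm, pmd_t *pmd,
                                       unsigned long addr, unsigned long end)
{
        spinlock_t *ptl;
        pte_t *pte = pte_offset_map_lock(mm, pmd, addr, &ptl);

        if (!pte)               /* no page table here (or a huge pmd) */
                return;
        do {
                pte_t pteval = ptep_get(pte);

                /* ... act on pteval: entries are stable under ptl ... */
                (void)pteval;
        } while (pte++, addr += PAGE_SIZE, addr != end);
        pte_unmap_unlock(pte - 1, ptl);
}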