mm: multi-gen LRU: minimal implementation
[linux-2.6-block.git] / include / linux / mm_inline.h
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef LINUX_MM_INLINE_H
#define LINUX_MM_INLINE_H

#include <linux/atomic.h>
#include <linux/huge_mm.h>
#include <linux/swap.h>
#include <linux/string.h>
#include <linux/userfaultfd_k.h>
#include <linux/swapops.h>

/**
 * folio_is_file_lru - Should the folio be on a file LRU or anon LRU?
 * @folio: The folio to test.
 *
 * We would like to get this info without a page flag, but the state
 * needs to survive until the folio is last deleted from the LRU, which
 * could be as far down as __page_cache_release.
 *
 * Return: An integer (not a boolean!) used to sort a folio onto the
 * right LRU list and to account folios correctly.
 * 1 if @folio is a regular filesystem backed page cache folio
 * or a lazily freed anonymous folio (e.g. via MADV_FREE).
 * 0 if @folio is a normal anonymous folio, a tmpfs folio or otherwise
 * ram or swap backed folio.
 */
static inline int folio_is_file_lru(struct folio *folio)
{
        return !folio_test_swapbacked(folio);
}

static inline int page_is_file_lru(struct page *page)
{
        return folio_is_file_lru(page_folio(page));
}

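/*
 * Illustrative sketch (not a new API; derived from the callers later in this
 * file): because folio_is_file_lru() returns 0 or 1 rather than a bool, the
 * result can be used directly as an index or multiplier when selecting an
 * LRU list, e.g.:
 *
 *      int type = folio_is_file_lru(folio);
 *      enum lru_list lru = type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
 *
 * which is exactly how folio_lru_list() and lru_gen_update_size() below
 * consume the value.
 */
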
static __always_inline void __update_lru_size(struct lruvec *lruvec,
                                enum lru_list lru, enum zone_type zid,
                                long nr_pages)
{
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);

        lockdep_assert_held(&lruvec->lru_lock);
        WARN_ON_ONCE(nr_pages != (int)nr_pages);

        __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
        __mod_zone_page_state(&pgdat->node_zones[zid],
                                NR_ZONE_LRU_BASE + lru, nr_pages);
}

static __always_inline void update_lru_size(struct lruvec *lruvec,
                                enum lru_list lru, enum zone_type zid,
                                long nr_pages)
{
        __update_lru_size(lruvec, lru, zid, nr_pages);
#ifdef CONFIG_MEMCG
        mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
#endif
}

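/*
 * Illustrative sketch (drawn from the callers below; not a new API):
 * nr_pages is a signed delta, so the same helper accounts both directions:
 *
 *      update_lru_size(lruvec, lru, zid, folio_nr_pages(folio));
 *      update_lru_size(lruvec, lru, zid, -folio_nr_pages(folio));
 *
 * The WARN_ON_ONCE() above catches deltas that would be truncated by the
 * int-sized vmstat accounting.
 */
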
/**
 * __folio_clear_lru_flags - Clear page lru flags before releasing a page.
 * @folio: The folio that was on lru and now has a zero reference.
 */
static __always_inline void __folio_clear_lru_flags(struct folio *folio)
{
        VM_BUG_ON_FOLIO(!folio_test_lru(folio), folio);

        __folio_clear_lru(folio);

        /* this shouldn't happen, so leave the flags to bad_page() */
        if (folio_test_active(folio) && folio_test_unevictable(folio))
                return;

        __folio_clear_active(folio);
        __folio_clear_unevictable(folio);
}

static __always_inline void __clear_page_lru_flags(struct page *page)
{
        __folio_clear_lru_flags(page_folio(page));
}

/**
 * folio_lru_list - Which LRU list should a folio be on?
 * @folio: The folio to test.
 *
 * Return: The LRU list a folio should be on, as an index
 * into the array of LRU lists.
 */
static __always_inline enum lru_list folio_lru_list(struct folio *folio)
{
        enum lru_list lru;

        VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio);

        if (folio_test_unevictable(folio))
                return LRU_UNEVICTABLE;

        lru = folio_is_file_lru(folio) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
        if (folio_test_active(folio))
                lru += LRU_ACTIVE;

        return lru;
}

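/*
 * Illustrative sketch of the resulting mapping (a restatement of the
 * function above, not new behaviour):
 *
 *      PG_unevictable          -> LRU_UNEVICTABLE
 *      anon, !PG_active        -> LRU_INACTIVE_ANON
 *      anon,  PG_active        -> LRU_ACTIVE_ANON
 *      file, !PG_active        -> LRU_INACTIVE_FILE
 *      file,  PG_active        -> LRU_ACTIVE_FILE
 *
 * PG_active together with PG_unevictable is a bug, hence the
 * VM_BUG_ON_FOLIO() up front.
 */
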
#ifdef CONFIG_LRU_GEN

static inline bool lru_gen_enabled(void)
{
        return true;
}

static inline bool lru_gen_in_fault(void)
{
        return current->in_lru_fault;
}

static inline int lru_gen_from_seq(unsigned long seq)
{
        return seq % MAX_NR_GENS;
}

static inline int lru_hist_from_seq(unsigned long seq)
{
        return seq % NR_HIST_GENS;
}

static inline int lru_tier_from_refs(int refs)
{
        VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH));

        /* see the comment in folio_lru_refs() */
        return order_base_2(refs + 1);
}

static inline int folio_lru_refs(struct folio *folio)
{
        unsigned long flags = READ_ONCE(folio->flags);
        bool workingset = flags & BIT(PG_workingset);

        /*
         * Return the number of accesses beyond PG_referenced, i.e., N-1 if
         * the total number of accesses is N>1, since N=0,1 both map to the
         * first tier. lru_tier_from_refs() will account for this off-by-one.
         * Also see the comment on MAX_NR_TIERS.
         */
        return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset;
}

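/*
 * Worked example (derived from the two helpers above): with
 * refs = folio_lru_refs(folio), lru_tier_from_refs(refs) computes
 * order_base_2(refs + 1), so
 *
 *      refs 0    -> tier 0
 *      refs 1    -> tier 1
 *      refs 2..3 -> tier 2
 *      refs 4..7 -> tier 3
 *
 * i.e. roughly twice as many accesses are needed to reach each successive
 * tier, which makes a tier a log2 measure of access frequency.
 */
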
static inline int folio_lru_gen(struct folio *folio)
{
        unsigned long flags = READ_ONCE(folio->flags);

        return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
}

static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
{
        unsigned long max_seq = lruvec->lrugen.max_seq;

        VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);

        /* see the comment on MIN_NR_GENS */
        return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
}

static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *folio,
                                       int old_gen, int new_gen)
{
        int type = folio_is_file_lru(folio);
        int zone = folio_zonenum(folio);
        int delta = folio_nr_pages(folio);
        enum lru_list lru = type * LRU_INACTIVE_FILE;
        struct lru_gen_struct *lrugen = &lruvec->lrugen;

        VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
        VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
        VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1);

        if (old_gen >= 0)
                WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone],
                           lrugen->nr_pages[old_gen][type][zone] - delta);
        if (new_gen >= 0)
                WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone],
                           lrugen->nr_pages[new_gen][type][zone] + delta);

        /* addition */
        if (old_gen < 0) {
                if (lru_gen_is_active(lruvec, new_gen))
                        lru += LRU_ACTIVE;
                __update_lru_size(lruvec, lru, zone, delta);
                return;
        }

        /* deletion */
        if (new_gen < 0) {
                if (lru_gen_is_active(lruvec, old_gen))
                        lru += LRU_ACTIVE;
                __update_lru_size(lruvec, lru, zone, -delta);
                return;
        }

        /* promotion */
        if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
                __update_lru_size(lruvec, lru, zone, -delta);
                __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
        }

        /* demotion requires isolation, e.g., lru_deactivate_fn() */
        VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
}

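/*
 * Illustrative summary (a restatement of lru_gen_update_size() above, not
 * new behaviour): the (old_gen, new_gen) pair maps onto the classic
 * active/inactive counters that the rest of the kernel still sees:
 *
 *      old_gen  new_gen  effect via __update_lru_size()
 *      -1       >= 0     addition: +delta on inactive, or on active if
 *                        the new generation is one of the two youngest
 *      >= 0     -1       deletion: -delta, symmetric to the above
 *      old      young    promotion: -delta on inactive, +delta on active
 *      young    old      demotion: disallowed here, requires isolation
 *
 * "young" means lru_gen_is_active() is true, i.e. the generation
 * corresponds to max_seq or max_seq - 1.
 */
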
static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
        unsigned long seq;
        unsigned long flags;
        int gen = folio_lru_gen(folio);
        int type = folio_is_file_lru(folio);
        int zone = folio_zonenum(folio);
        struct lru_gen_struct *lrugen = &lruvec->lrugen;

        VM_WARN_ON_ONCE_FOLIO(gen != -1, folio);

        if (folio_test_unevictable(folio))
                return false;
        /*
         * There are three common cases for this page:
         * 1. If it's hot, e.g., freshly faulted in or previously hot and
         *    migrated, add it to the youngest generation.
         * 2. If it's cold but can't be evicted immediately, i.e., an anon page
         *    not in swapcache or a dirty page pending writeback, add it to the
         *    second oldest generation.
         * 3. Everything else (clean, cold) is added to the oldest generation.
         */
        if (folio_test_active(folio))
                seq = lrugen->max_seq;
        else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) ||
                 (folio_test_reclaim(folio) &&
                  (folio_test_dirty(folio) || folio_test_writeback(folio))))
                seq = lrugen->min_seq[type] + 1;
        else
                seq = lrugen->min_seq[type];

        gen = lru_gen_from_seq(seq);
        flags = (gen + 1UL) << LRU_GEN_PGOFF;
        /* see the comment on MIN_NR_GENS about PG_active */
        set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags);

        lru_gen_update_size(lruvec, folio, -1, gen);
        /* for folio_rotate_reclaimable() */
        if (reclaiming)
                list_add_tail(&folio->lru, &lrugen->lists[gen][type][zone]);
        else
                list_add(&folio->lru, &lrugen->lists[gen][type][zone]);

        return true;
}

static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
        unsigned long flags;
        int gen = folio_lru_gen(folio);

        if (gen < 0)
                return false;

        VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
        VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);

        /* for folio_migrate_flags() */
        flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0;
        flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags);
        gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;

        lru_gen_update_size(lruvec, folio, gen, -1);
        list_del(&folio->lru);

        return true;
}

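/*
 * Illustrative sketch (hypothetical snippet; the real callers are
 * lruvec_add_folio() and lruvec_del_folio() below): both helpers return
 * false when the multi-gen LRU did not take the folio, so callers fall
 * back to the classic lists:
 *
 *      enum lru_list lru = folio_lru_list(folio);
 *
 *      if (!lru_gen_add_folio(lruvec, folio, false)) {
 *              update_lru_size(lruvec, lru, folio_zonenum(folio),
 *                              folio_nr_pages(folio));
 *              list_add(&folio->lru, &lruvec->lists[lru]);
 *      }
 */
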
#else /* !CONFIG_LRU_GEN */

static inline bool lru_gen_enabled(void)
{
        return false;
}

static inline bool lru_gen_in_fault(void)
{
        return false;
}

static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
        return false;
}

static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
        return false;
}

#endif /* CONFIG_LRU_GEN */

static __always_inline
void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio)
{
        enum lru_list lru = folio_lru_list(folio);

        if (lru_gen_add_folio(lruvec, folio, false))
                return;

        update_lru_size(lruvec, lru, folio_zonenum(folio),
                        folio_nr_pages(folio));
        if (lru != LRU_UNEVICTABLE)
                list_add(&folio->lru, &lruvec->lists[lru]);
}

static __always_inline void add_page_to_lru_list(struct page *page,
                                struct lruvec *lruvec)
{
        lruvec_add_folio(lruvec, page_folio(page));
}

static __always_inline
void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio)
{
        enum lru_list lru = folio_lru_list(folio);

        if (lru_gen_add_folio(lruvec, folio, true))
                return;

        update_lru_size(lruvec, lru, folio_zonenum(folio),
                        folio_nr_pages(folio));
        /* This is not expected to be used on LRU_UNEVICTABLE */
        list_add_tail(&folio->lru, &lruvec->lists[lru]);
}

static __always_inline void add_page_to_lru_list_tail(struct page *page,
                                struct lruvec *lruvec)
{
        lruvec_add_folio_tail(lruvec, page_folio(page));
}

static __always_inline
void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio)
{
        enum lru_list lru = folio_lru_list(folio);

        if (lru_gen_del_folio(lruvec, folio, false))
                return;

        if (lru != LRU_UNEVICTABLE)
                list_del(&folio->lru);
        update_lru_size(lruvec, lru, folio_zonenum(folio),
                        -folio_nr_pages(folio));
}

static __always_inline void del_page_from_lru_list(struct page *page,
                                struct lruvec *lruvec)
{
        lruvec_del_folio(lruvec, page_folio(page));
}

#ifdef CONFIG_ANON_VMA_NAME
/*
 * mmap_lock should be read-locked when calling anon_vma_name(). The caller
 * should either keep holding the lock while using the returned pointer, or
 * raise the anon_vma_name refcount before releasing the lock.
 */
extern struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma);
extern struct anon_vma_name *anon_vma_name_alloc(const char *name);
extern void anon_vma_name_free(struct kref *kref);

/* mmap_lock should be read-locked */
static inline void anon_vma_name_get(struct anon_vma_name *anon_name)
{
        if (anon_name)
                kref_get(&anon_name->kref);
}

static inline void anon_vma_name_put(struct anon_vma_name *anon_name)
{
        if (anon_name)
                kref_put(&anon_name->kref, anon_vma_name_free);
}

static inline
struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name)
{
        /* Prevent anon_name refcount saturation early on */
        if (kref_read(&anon_name->kref) < REFCOUNT_MAX) {
                anon_vma_name_get(anon_name);
                return anon_name;
        }
        return anon_vma_name_alloc(anon_name->name);
}

static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
                                     struct vm_area_struct *new_vma)
{
        struct anon_vma_name *anon_name = anon_vma_name(orig_vma);

        if (anon_name)
                new_vma->anon_name = anon_vma_name_reuse(anon_name);
}

static inline void free_anon_vma_name(struct vm_area_struct *vma)
{
        /*
         * Not using anon_vma_name() because it generates a warning if
         * mmap_lock is not held, which might be the case here.
         */
        if (!vma->vm_file)
                anon_vma_name_put(vma->anon_name);
}

static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
                                    struct anon_vma_name *anon_name2)
{
        if (anon_name1 == anon_name2)
                return true;

        return anon_name1 && anon_name2 &&
               !strcmp(anon_name1->name, anon_name2->name);
}

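/*
 * Illustrative sketch (hypothetical snippet following the locking rule
 * stated above): to keep the name alive after dropping mmap_lock, take a
 * reference before unlocking and drop it when done:
 *
 *      mmap_read_lock(mm);
 *      anon_name = anon_vma_name(vma);
 *      anon_vma_name_get(anon_name);
 *      mmap_read_unlock(mm);
 *      ... use anon_name->name ...
 *      anon_vma_name_put(anon_name);
 *
 * Both get/put helpers tolerate a NULL anon_name, so unnamed VMAs need no
 * special casing.
 */
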
#else /* CONFIG_ANON_VMA_NAME */
static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
        return NULL;
}

static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
        return NULL;
}

static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {}
static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {}
static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
                                     struct vm_area_struct *new_vma) {}
static inline void free_anon_vma_name(struct vm_area_struct *vma) {}

static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
                                    struct anon_vma_name *anon_name2)
{
        return true;
}

#endif /* CONFIG_ANON_VMA_NAME */

static inline void init_tlb_flush_pending(struct mm_struct *mm)
{
        atomic_set(&mm->tlb_flush_pending, 0);
}

static inline void inc_tlb_flush_pending(struct mm_struct *mm)
{
        atomic_inc(&mm->tlb_flush_pending);
        /*
         * The only time this value is relevant is when there are indeed pages
         * to flush. And we'll only flush pages after changing them, which
         * requires the PTL.
         *
         * So the ordering here is:
         *
         *      atomic_inc(&mm->tlb_flush_pending);
         *      spin_lock(&ptl);
         *      ...
         *      set_pte_at();
         *      spin_unlock(&ptl);
         *
         *                              spin_lock(&ptl)
         *                              mm_tlb_flush_pending();
         *                              ....
         *                              spin_unlock(&ptl);
         *
         *      flush_tlb_range();
         *      atomic_dec(&mm->tlb_flush_pending);
         *
         * Because the increment is constrained by the PTL unlock, it
         * ensures that the increment is visible if the PTE modification is
         * visible. After all, if there is no PTE modification, nobody cares
         * about TLB flushes either.
         *
         * This very much relies on users (mm_tlb_flush_pending() and
         * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and
         * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc
         * locks (PPC) the unlock of one doesn't order against the lock of
         * another PTL.
         *
         * The decrement is ordered by the flush_tlb_range(), such that
         * mm_tlb_flush_pending() will not return false unless all flushes
         * have completed.
         */
}

static inline void dec_tlb_flush_pending(struct mm_struct *mm)
{
        /*
         * See inc_tlb_flush_pending().
         *
         * This cannot be smp_mb__before_atomic() because smp_mb() simply does
         * not order against TLB invalidate completion, which is what we need.
         *
         * Therefore we must rely on tlb_flush_*() to guarantee order.
         */
        atomic_dec(&mm->tlb_flush_pending);
}

static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
{
        /*
         * Must be called after having acquired the PTL; orders against that
         * PTL's release and therefore ensures that if we observe the modified
         * PTE we must also observe the increment from inc_tlb_flush_pending().
         *
         * That is, it only guarantees to return true if there is a flush
         * pending for _this_ PTL.
         */
        return atomic_read(&mm->tlb_flush_pending);
}

static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
{
        /*
         * Similar to mm_tlb_flush_pending(), we must have acquired the PTL
         * for which there is a TLB flush pending in order to guarantee
         * we've seen both that PTE modification and the increment.
         *
         * (no requirement on actually still holding the PTL, that is
         * irrelevant)
         */
        return atomic_read(&mm->tlb_flush_pending) > 1;
}

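/*
 * Illustrative sketch (hypothetical snippet following the ordering comment
 * in inc_tlb_flush_pending() above): a writer brackets the PTE change and
 * the flush with the counter, so readers under the same PTL can detect the
 * pending flush:
 *
 *      inc_tlb_flush_pending(mm);
 *      spin_lock(ptl);
 *      set_pte_at(mm, addr, pte, entry);
 *      spin_unlock(ptl);
 *      flush_tlb_range(vma, start, end);
 *      dec_tlb_flush_pending(mm);
 *
 * A concurrent reader that has taken the same PTL then consults
 * mm_tlb_flush_pending(mm) before trusting that stale TLB entries are gone.
 */
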
/*
 * If this pte is wr-protected by uffd-wp in any form, arm the special pte to
 * replace a none pte. NOTE! This should only be called when *pte is already
 * cleared so we will never accidentally replace something valuable. Meanwhile
 * a none pte also means we are not demoting the pte, so a TLB flush is not
 * needed; e.g., when the pte was cleared, the caller should have taken care
 * of the TLB flush.
 *
 * Must be called with the pgtable lock held so that no thread will see the
 * none pte, and if they see it, they'll fault and serialize at the pgtable
 * lock.
 *
 * This function is a no-op if PTE_MARKER_UFFD_WP is not enabled.
 */
static inline void
pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr,
                              pte_t *pte, pte_t pteval)
{
#ifdef CONFIG_PTE_MARKER_UFFD_WP
        bool arm_uffd_pte = false;

        /* The current status of the pte should be "cleared" before calling */
        WARN_ON_ONCE(!pte_none(*pte));

        if (vma_is_anonymous(vma) || !userfaultfd_wp(vma))
                return;

        /* A uffd-wp wr-protected normal pte */
        if (unlikely(pte_present(pteval) && pte_uffd_wp(pteval)))
                arm_uffd_pte = true;

        /*
         * A uffd-wp wr-protected swap pte. Note: this should even cover an
         * existing pte marker with uffd-wp bit set.
         */
        if (unlikely(pte_swp_uffd_wp_any(pteval)))
                arm_uffd_pte = true;

        if (unlikely(arm_uffd_pte))
                set_pte_at(vma->vm_mm, addr, pte,
                           make_pte_marker(PTE_MARKER_UFFD_WP));
#endif
}
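
/*
 * Illustrative sketch (hypothetical zap-style caller, mirroring how the
 * helper above is meant to be used): clear the pte while saving its old
 * value, do the teardown work, then re-arm the uffd-wp marker if the old
 * value carried it:
 *
 *      pteval = ptep_get_and_clear_full(mm, addr, pte, full);
 *      ... flush the TLB / release the old page as needed ...
 *      pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
 */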

#endif