Commit | Line | Data |
---|---|---|
b2441318 | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
5b99cd0e HC |
2 | #ifndef _LINUX_MM_TYPES_H |
3 | #define _LINUX_MM_TYPES_H | |
4 | ||
2e58f173 IM |
5 | #include <linux/mm_types_task.h> |
6 | ||
4f9a58d7 | 7 | #include <linux/auxvec.h> |
78db3412 | 8 | #include <linux/kref.h> |
5b99cd0e HC |
9 | #include <linux/list.h> |
10 | #include <linux/spinlock.h> | |
c92ff1bd | 11 | #include <linux/rbtree.h> |
d4af56c5 | 12 | #include <linux/maple_tree.h> |
c92ff1bd MS |
13 | #include <linux/rwsem.h> |
14 | #include <linux/completion.h> | |
cddb8a5c | 15 | #include <linux/cpumask.h> |
d4b3b638 | 16 | #include <linux/uprobes.h> |
8d491de6 | 17 | #include <linux/rcupdate.h> |
bbeae5b0 | 18 | #include <linux/page-flags-layout.h> |
ec8d7c14 | 19 | #include <linux/workqueue.h> |
57efa1fe | 20 | #include <linux/seqlock.h> |
f1a79412 | 21 | #include <linux/percpu_counter.h> |
2e58f173 | 22 | |
c92ff1bd | 23 | #include <asm/mmu.h> |
5b99cd0e | 24 | |
4f9a58d7 OH |
/* AT_VECTOR_SIZE_ARCH falls back to 0 unless it was defined beforehand. */
#ifndef AT_VECTOR_SIZE_ARCH
#define AT_VECTOR_SIZE_ARCH 0
#endif
/* Twice the sum of the arch entries, the base entries and one extra entry. */
#define AT_VECTOR_SIZE (2 * (AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))
29 | ||
82e69a12 | 30 | #define INIT_PASID 0 |
1c8f4220 | 31 | |
5b99cd0e | 32 | struct address_space; |
1306a85a | 33 | struct mem_cgroup; |
5b99cd0e HC |
34 | |
35 | /* | |
36 | * Each physical page in the system has a struct page associated with | |
37 | * it to keep track of whatever it is we are using the page for at the | |
38 | * moment. Note that we have no way to track which tasks are using | |
39 | * a page, though if it is a pagecache page, rmap structures can tell us | |
97b4a671 | 40 | * who is mapping it. |
be50015d | 41 | * |
97b4a671 MW |
42 | * If you allocate the page using alloc_pages(), you can use some of the |
43 | * space in struct page for your own purposes. The five words in the main | |
44 | * union are available, except for bit 0 of the first word which must be | |
45 | * kept clear. Many users use this word to store a pointer to an object | |
46 | * which is guaranteed to be aligned. If you use the same storage as | |
47 | * page->mapping, you must restore it to NULL before freeing the page. | |
be50015d | 48 | * |
97b4a671 MW |
49 | * If your page will not be mapped to userspace, you can also use the four |
50 | * bytes in the mapcount union, but you must call page_mapcount_reset() | |
51 | * before freeing it. | |
52 | * | |
53 | * If you want to use the refcount field, it must be used in such a way | |
54 | * that other CPUs temporarily incrementing and then decrementing the | |
55 | * refcount does not cause problems. On receiving the page from | |
56 | * alloc_pages(), the refcount will be positive. | |
57 | * | |
58 | * If you allocate pages of order > 0, you can use some of the fields | |
59 | * in each subpage, but you may need to restore some of their values | |
60 | * afterwards. | |
fc9bb8c7 | 61 | * |
d122019b MWO |
62 | * SLUB uses cmpxchg_double() to atomically update its freelist and counters. |
63 | * That requires that freelist & counters in struct slab be adjacent and | |
64 | * double-word aligned. Because struct slab currently just reinterprets the | |
65 | * bits of struct page, we align all struct pages to double-word boundaries, | |
66 | * and ensure that 'freelist' is aligned within struct slab. | |
5b99cd0e | 67 | */ |
e20df2c6 MW |
/*
 * Alignment attribute attached to struct page: two unsigned longs when
 * CONFIG_HAVE_ALIGNED_STRUCT_PAGE is defined, one unsigned long otherwise.
 */
#ifndef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
#define _struct_page_alignment	__aligned(sizeof(unsigned long))
#else
#define _struct_page_alignment	__aligned(2 * sizeof(unsigned long))
#endif
e20df2c6 | 73 | |
5b99cd0e HC |
74 | struct page { |
75 | unsigned long flags; /* Atomic flags, some possibly | |
76 | * updated asynchronously */ | |
b7ccc7f8 | 77 | /* |
4da1984e MW |
78 | * Five words (20/40 bytes) are available in this union. |
79 | * WARNING: bit 0 of the first word is used for PageTail(). That | |
80 | * means the other users of this union MUST NOT use the bit to | |
b7ccc7f8 MW |
81 | * avoid collision and false-positive PageTail(). |
82 | */ | |
8456a648 | 83 | union { |
66a6ffd2 | 84 | struct { /* Page cache and anonymous pages */ |
4da1984e MW |
85 | /** |
86 | * @lru: Pageout list, eg. active_list protected by | |
15b44736 | 87 | * lruvec->lru_lock. Sometimes used as a generic list |
4da1984e MW |
88 | * by the page owner. |
89 | */ | |
07ca7606 HD |
90 | union { |
91 | struct list_head lru; | |
bf75f200 | 92 | |
07ca7606 HD |
93 | /* Or, for the Unevictable "LRU list" slot */ |
94 | struct { | |
95 | /* Always even, to negate PageTail */ | |
96 | void *__filler; | |
97 | /* Count page's or folio's mlocks */ | |
98 | unsigned int mlock_count; | |
99 | }; | |
bf75f200 MG |
100 | |
101 | /* Or, free page */ | |
102 | struct list_head buddy_list; | |
103 | struct list_head pcp_list; | |
07ca7606 | 104 | }; |
66a6ffd2 MW |
105 | /* See page-flags.h for PAGE_MAPPING_FLAGS */ |
106 | struct address_space *mapping; | |
16900426 SR |
107 | union { |
108 | pgoff_t index; /* Our offset within mapping. */ | |
109 | unsigned long share; /* share count for fsdax */ | |
110 | }; | |
66a6ffd2 MW |
111 | /** |
112 | * @private: Mapping-private opaque data. | |
113 | * Usually used for buffer_heads if PagePrivate. | |
114 | * Used for swp_entry_t if PageSwapCache. | |
115 | * Indicates order in the buddy system if PageBuddy. | |
116 | */ | |
117 | unsigned long private; | |
118 | }; | |
c25fff71 | 119 | struct { /* page_pool used by netstack */ |
c07aea3e MC |
120 | /** |
121 | * @pp_magic: magic value to avoid recycling non | |
122 | * page_pool allocated pages. | |
123 | */ | |
124 | unsigned long pp_magic; | |
125 | struct page_pool *pp; | |
126 | unsigned long _pp_mapping_pad; | |
0e9d2a0a | 127 | unsigned long dma_addr; |
f915b75b YL |
128 | union { |
129 | /** | |
130 | * dma_addr_upper: might require a 64-bit | |
131 | * value on 32-bit architectures. | |
132 | */ | |
133 | unsigned long dma_addr_upper; | |
134 | /** | |
135 | * For frag page support, not supported in | |
136 | * 32-bit architectures with 64-bit DMA. | |
137 | */ | |
138 | atomic_long_t pp_frag_count; | |
139 | }; | |
c25fff71 | 140 | }; |
4da1984e MW |
141 | struct { /* Tail pages of compound page */ |
142 | unsigned long compound_head; /* Bit zero is set */ | |
dad6a5eb | 143 | }; |
50e7fbc3 MW |
144 | struct { /* ZONE_DEVICE pages */ |
145 | /** @pgmap: Points to the hosting device page map. */ | |
146 | struct dev_pagemap *pgmap; | |
8a164fef | 147 | void *zone_device_data; |
76470ccd RC |
148 | /* |
149 | * ZONE_DEVICE private pages are counted as being | |
150 | * mapped so the next 3 words hold the mapping, index, | |
151 | * and private fields from the source anonymous or | |
152 | * page cache page while the page is migrated to device | |
153 | * private memory. | |
154 | * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also | |
155 | * use the mapping, index, and private fields when | |
156 | * pmem backed DAX files are mapped. | |
157 | */ | |
50e7fbc3 | 158 | }; |
4da1984e MW |
159 | |
160 | /** @rcu_head: You can use this to free a page by RCU. */ | |
161 | struct rcu_head rcu_head; | |
7d27a04b MW |
162 | }; |
163 | ||
b21999da MW |
164 | union { /* This union is 4 bytes in size. */ |
165 | /* | |
166 | * If the page can be mapped to userspace, encodes the number | |
167 | * of times this page is referenced by a page table. | |
168 | */ | |
169 | atomic_t _mapcount; | |
170 | ||
6e292b9b MW |
171 | /* |
172 | * If the page is neither PageSlab nor mappable to userspace, | |
173 | * the value stored here may help determine what this page | |
174 | * is used for. See page-flags.h for a list of page types | |
175 | * which are currently stored here. | |
176 | */ | |
177 | unsigned int page_type; | |
81819f0f | 178 | }; |
fc9bb8c7 | 179 | |
b21999da MW |
180 | /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */ |
181 | atomic_t _refcount; | |
182 | ||
1306a85a | 183 | #ifdef CONFIG_MEMCG |
bcfe06bf | 184 | unsigned long memcg_data; |
1306a85a JW |
185 | #endif |
186 | ||
5b99cd0e HC |
187 | /* |
188 | * On machines where all RAM is mapped into kernel address space, | |
189 | * we can simply calculate the virtual address. On machines with | |
190 | * highmem some memory is mapped into kernel virtual memory | |
191 | * dynamically, so we need a place to store that address. | |
192 | * Note that this field could be 16 bits on x86 ... ;) | |
193 | * | |
194 | * Architectures with slow multiplication can define | |
195 | * WANT_PAGE_VIRTUAL in asm/page.h | |
196 | */ | |
197 | #if defined(WANT_PAGE_VIRTUAL) | |
198 | void *virtual; /* Kernel virtual address (NULL if | |
199 | not kmapped, ie. highmem) */ | |
200 | #endif /* WANT_PAGE_VIRTUAL */ | |
dfec072e | 201 | |
f80be457 AP |
202 | #ifdef CONFIG_KMSAN |
203 | /* | |
204 | * KMSAN metadata for this page: | |
205 | * - shadow page: every bit indicates whether the corresponding | |
206 | * bit of the original page is initialized (0) or not (1); | |
207 | * - origin page: every 4 bytes contain an id of the stack trace | |
208 | * where the uninitialized value was created. | |
209 | */ | |
210 | struct page *kmsan_shadow; | |
211 | struct page *kmsan_origin; | |
212 | #endif | |
213 | ||
90572890 PZ |
214 | #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS |
215 | int _last_cpupid; | |
57e0a030 | 216 | #endif |
e20df2c6 | 217 | } _struct_page_alignment; |
5b99cd0e | 218 | |
70fb4fdf LT |
219 | /* |
220 | * struct encoded_page - a nonexistent type marking this pointer | |
221 | * | |
222 | * An 'encoded_page' pointer is a pointer to a regular 'struct page', but | |
223 | * with the low bits of the pointer indicating extra context-dependent | |
224 | * information. Not super-common, but happens in mmu_gather and mlock | |
225 | * handling, and this acts as a type system check on that use. | |
226 | * | |
227 | * We only really have two guaranteed bits in general, although you could | |
228 | * play with 'struct page' alignment (see CONFIG_HAVE_ALIGNED_STRUCT_PAGE) | |
229 | * for more. | |
230 | * | |
231 | * Use the supplied helper functions to encode/decode the pointer and bits. | |
232 | */ | |
/* Declared but never defined here; only ever used through pointers. */
struct encoded_page;
/* Mask covering the low pointer bits that the encode/decode helpers use. */
#define ENCODE_PAGE_BITS 3ul
235 | static __always_inline struct encoded_page *encode_page(struct page *page, unsigned long flags) | |
236 | { | |
237 | BUILD_BUG_ON(flags > ENCODE_PAGE_BITS); | |
238 | return (struct encoded_page *)(flags | (unsigned long)page); | |
239 | } | |
240 | ||
241 | static inline unsigned long encoded_page_flags(struct encoded_page *page) | |
242 | { | |
243 | return ENCODE_PAGE_BITS & (unsigned long)page; | |
244 | } | |
245 | ||
246 | static inline struct page *encoded_page_ptr(struct encoded_page *page) | |
247 | { | |
248 | return (struct page *)(~ENCODE_PAGE_BITS & (unsigned long)page); | |
249 | } | |
250 | ||
85a13334 MW |
/*
 * A swap entry must fit into an "unsigned long", because the entry is
 * hidden in the "index" field of the swapper address space.
 */
typedef struct {
	unsigned long val;
} swp_entry_t;
258 | ||
7b230db3 MWO |
259 | /** |
260 | * struct folio - Represents a contiguous set of bytes. | |
261 | * @flags: Identical to the page flags. | |
262 | * @lru: Least Recently Used list; tracks how recently this folio was used. | |
334f6f53 | 263 | * @mlock_count: Number of times this folio has been pinned by mlock(). |
7b230db3 MWO |
264 | * @mapping: The file this page belongs to, or refers to the anon_vma for |
265 | * anonymous memory. | |
266 | * @index: Offset within the file, in units of pages. For anonymous memory, | |
267 | * this is the index from the beginning of the mmap. | |
268 | * @private: Filesystem per-folio data (see folio_attach_private()). | |
85a13334 | 269 | * @swap: Used for swp_entry_t if folio_test_swapcache(). |
7b230db3 MWO |
270 | * @_mapcount: Do not access this member directly. Use folio_mapcount() to |
271 | * find out how many times this folio is mapped by userspace. | |
272 | * @_refcount: Do not access this member directly. Use folio_ref_count() | |
273 | * to find how many references there are to this folio. | |
274 | * @memcg_data: Memory Control Group data. | |
b14224fb | 275 | * @_entire_mapcount: Do not use directly, call folio_entire_mapcount(). |
eec20426 | 276 | * @_nr_pages_mapped: Do not use directly, call folio_mapcount(). |
379708ff MWO |
277 | * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). |
278 | * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). | |
dad6a5eb HD |
279 | * @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h. |
280 | * @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h. | |
281 | * @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h. | |
282 | * @_hugetlb_hwpoison: Do not use directly, call raw_hwp_list_head(). | |
4375a553 | 283 | * @_deferred_list: Folios to be split under memory pressure. |
7b230db3 MWO |
284 | * |
285 | * A folio is a physically, virtually and logically contiguous set | |
286 | * of bytes. It is a power-of-two in size, and it is aligned to that | |
287 | * same power-of-two. It is at least as large as %PAGE_SIZE. If it is | |
288 | * in the page cache, it is at a file offset which is a multiple of that | |
289 | * power-of-two. It may be mapped into userspace at an address which is | |
290 | * at an arbitrary page offset, but its kernel virtual address is aligned | |
291 | * to its size. | |
292 | */ | |
293 | struct folio { | |
294 | /* private: don't document the anon union */ | |
295 | union { | |
296 | struct { | |
297 | /* public: */ | |
298 | unsigned long flags; | |
07ca7606 HD |
299 | union { |
300 | struct list_head lru; | |
334f6f53 | 301 | /* private: avoid cluttering the output */ |
07ca7606 HD |
302 | struct { |
303 | void *__filler; | |
334f6f53 | 304 | /* public: */ |
07ca7606 | 305 | unsigned int mlock_count; |
334f6f53 | 306 | /* private: */ |
07ca7606 | 307 | }; |
334f6f53 | 308 | /* public: */ |
07ca7606 | 309 | }; |
7b230db3 MWO |
310 | struct address_space *mapping; |
311 | pgoff_t index; | |
85a13334 MW |
312 | union { |
313 | void *private; | |
314 | swp_entry_t swap; | |
315 | }; | |
7b230db3 MWO |
316 | atomic_t _mapcount; |
317 | atomic_t _refcount; | |
318 | #ifdef CONFIG_MEMCG | |
319 | unsigned long memcg_data; | |
320 | #endif | |
321 | /* private: the union with struct page is transitional */ | |
322 | }; | |
323 | struct page page; | |
324 | }; | |
dad6a5eb HD |
325 | union { |
326 | struct { | |
327 | unsigned long _flags_1; | |
328 | unsigned long _head_1; | |
ebc1baf5 | 329 | unsigned long _folio_avail; |
a8d55327 | 330 | /* public: */ |
b14224fb | 331 | atomic_t _entire_mapcount; |
eec20426 | 332 | atomic_t _nr_pages_mapped; |
dad6a5eb | 333 | atomic_t _pincount; |
379708ff | 334 | #ifdef CONFIG_64BIT |
dad6a5eb | 335 | unsigned int _folio_nr_pages; |
b10ff04d | 336 | #endif |
cfeed8ff | 337 | /* private: the union with struct page is transitional */ |
dad6a5eb HD |
338 | }; |
339 | struct page __page_1; | |
340 | }; | |
341 | union { | |
342 | struct { | |
343 | unsigned long _flags_2; | |
344 | unsigned long _head_2; | |
a8d55327 | 345 | /* public: */ |
dad6a5eb HD |
346 | void *_hugetlb_subpool; |
347 | void *_hugetlb_cgroup; | |
348 | void *_hugetlb_cgroup_rsvd; | |
349 | void *_hugetlb_hwpoison; | |
4375a553 MWO |
350 | /* private: the union with struct page is transitional */ |
351 | }; | |
352 | struct { | |
353 | unsigned long _flags_2a; | |
354 | unsigned long _head_2a; | |
355 | /* public: */ | |
356 | struct list_head _deferred_list; | |
a8d55327 | 357 | /* private: the union with struct page is transitional */ |
dad6a5eb HD |
358 | }; |
359 | struct page __page_2; | |
360 | }; | |
7b230db3 MWO |
361 | }; |
362 | ||
7b230db3 MWO |
363 | #define FOLIO_MATCH(pg, fl) \ |
364 | static_assert(offsetof(struct page, pg) == offsetof(struct folio, fl)) | |
365 | FOLIO_MATCH(flags, flags); | |
366 | FOLIO_MATCH(lru, lru); | |
536f4217 | 367 | FOLIO_MATCH(mapping, mapping); |
7b230db3 MWO |
368 | FOLIO_MATCH(compound_head, lru); |
369 | FOLIO_MATCH(index, index); | |
370 | FOLIO_MATCH(private, private); | |
371 | FOLIO_MATCH(_mapcount, _mapcount); | |
372 | FOLIO_MATCH(_refcount, _refcount); | |
373 | #ifdef CONFIG_MEMCG | |
374 | FOLIO_MATCH(memcg_data, memcg_data); | |
375 | #endif | |
376 | #undef FOLIO_MATCH | |
379708ff MWO |
377 | #define FOLIO_MATCH(pg, fl) \ |
378 | static_assert(offsetof(struct folio, fl) == \ | |
379 | offsetof(struct page, pg) + sizeof(struct page)) | |
380 | FOLIO_MATCH(flags, _flags_1); | |
dad6a5eb | 381 | FOLIO_MATCH(compound_head, _head_1); |
379708ff | 382 | #undef FOLIO_MATCH |
dad6a5eb HD |
383 | #define FOLIO_MATCH(pg, fl) \ |
384 | static_assert(offsetof(struct folio, fl) == \ | |
385 | offsetof(struct page, pg) + 2 * sizeof(struct page)) | |
386 | FOLIO_MATCH(flags, _flags_2); | |
387 | FOLIO_MATCH(compound_head, _head_2); | |
b10ff04d MWO |
388 | FOLIO_MATCH(flags, _flags_2a); |
389 | FOLIO_MATCH(compound_head, _head_2a); | |
dad6a5eb | 390 | #undef FOLIO_MATCH |
7b230db3 | 391 | |
9a35de4f VMO |
392 | /** |
393 | * struct ptdesc - Memory descriptor for page tables. | |
394 | * @__page_flags: Same as page flags. Unused for page tables. | |
395 | * @pt_rcu_head: For freeing page table pages. | |
396 | * @pt_list: List of used page tables. Used for s390 and x86. | |
397 | * @_pt_pad_1: Padding that aliases with page's compound head. | |
398 | * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs. | |
399 | * @__page_mapping: Aliases with page->mapping. Unused for page tables. | |
400 | * @pt_mm: Used for x86 pgds. | |
401 | * @pt_frag_refcount: For fragmented page table tracking. Powerpc and s390 only. | |
402 | * @_pt_pad_2: Padding to ensure proper alignment. | |
403 | * @ptl: Lock for the page table. | |
404 | * @__page_type: Same as page->page_type. Unused for page tables. | |
405 | * @_refcount: Same as page refcount. Used for s390 page tables. | |
406 | * @pt_memcg_data: Memcg data. Tracked for page tables here. | |
407 | * | |
408 | * This struct overlays struct page for now. Do not modify without a good | |
409 | * understanding of the issues. | |
410 | */ | |
411 | struct ptdesc { | |
412 | unsigned long __page_flags; | |
413 | ||
414 | union { | |
415 | struct rcu_head pt_rcu_head; | |
416 | struct list_head pt_list; | |
417 | struct { | |
418 | unsigned long _pt_pad_1; | |
419 | pgtable_t pmd_huge_pte; | |
420 | }; | |
421 | }; | |
422 | unsigned long __page_mapping; | |
423 | ||
424 | union { | |
425 | struct mm_struct *pt_mm; | |
426 | atomic_t pt_frag_refcount; | |
427 | }; | |
428 | ||
429 | union { | |
430 | unsigned long _pt_pad_2; | |
431 | #if ALLOC_SPLIT_PTLOCKS | |
432 | spinlock_t *ptl; | |
433 | #else | |
434 | spinlock_t ptl; | |
435 | #endif | |
436 | }; | |
437 | unsigned int __page_type; | |
438 | atomic_t _refcount; | |
439 | #ifdef CONFIG_MEMCG | |
440 | unsigned long pt_memcg_data; | |
441 | #endif | |
442 | }; | |
443 | ||
444 | #define TABLE_MATCH(pg, pt) \ | |
445 | static_assert(offsetof(struct page, pg) == offsetof(struct ptdesc, pt)) | |
446 | TABLE_MATCH(flags, __page_flags); | |
447 | TABLE_MATCH(compound_head, pt_list); | |
448 | TABLE_MATCH(compound_head, _pt_pad_1); | |
9a35de4f | 449 | TABLE_MATCH(mapping, __page_mapping); |
9a35de4f VMO |
450 | TABLE_MATCH(rcu_head, pt_rcu_head); |
451 | TABLE_MATCH(page_type, __page_type); | |
452 | TABLE_MATCH(_refcount, _refcount); | |
453 | #ifdef CONFIG_MEMCG | |
454 | TABLE_MATCH(memcg_data, pt_memcg_data); | |
455 | #endif | |
456 | #undef TABLE_MATCH | |
457 | static_assert(sizeof(struct ptdesc) <= sizeof(struct page)); | |
458 | ||
bf2d4334 VMO |
/* Cast a ptdesc pointer to a struct page pointer, keeping const-ness. */
#define ptdesc_page(pt)			(_Generic((pt),			\
	struct ptdesc *:		(struct page *)(pt),		\
	const struct ptdesc *:		(const struct page *)(pt)))
462 | ||
/* Cast a ptdesc pointer to a struct folio pointer, keeping const-ness. */
#define ptdesc_folio(pt)		(_Generic((pt),			\
	struct ptdesc *:		(struct folio *)(pt),		\
	const struct ptdesc *:		(const struct folio *)(pt)))
466 | ||
/* Cast a struct page pointer to a ptdesc pointer, keeping const-ness. */
#define page_ptdesc(p)			(_Generic((p),			\
	struct page *:			(struct ptdesc *)(p),		\
	const struct page *:		(const struct ptdesc *)(p)))
470 | ||
d1402fc7 LG |
/*
 * order_base_2() of sizeof(struct page); used for sizing the vmemmap
 * region on some architectures.
 */
#define STRUCT_PAGE_MAX_SHIFT	(order_base_2(sizeof(struct page)))
475 | ||
b63ae8ca AD |
/*
 * Upper bound for a page fragment cache: 32768 passed through
 * __ALIGN_MASK() with ~PAGE_MASK (presumably rounding up to a whole number
 * of pages -- confirm against __ALIGN_MASK), and the matching order.
 */
#define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
#define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
478 | ||
85d0a2ed MWO |
/*
 * page_private() may be used on tail pages.  However, the VM only checks
 * PagePrivate on the head page, so page_private on a tail page should hold
 * data that is ancillary to the head page (eg attaching buffer heads to
 * tail pages after attaching buffer heads to the head page).
 */
#define page_private(page)		((page)->private)
60e65a6f GJ |
486 | |
487 | static inline void set_page_private(struct page *page, unsigned long private) | |
488 | { | |
489 | page->private = private; | |
490 | } | |
b03641af | 491 | |
85d0a2ed MWO |
492 | static inline void *folio_get_private(struct folio *folio) |
493 | { | |
494 | return folio->private; | |
495 | } | |
496 | ||
b63ae8ca AD |
497 | struct page_frag_cache { |
498 | void * va; | |
499 | #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) | |
500 | __u16 offset; | |
501 | __u16 size; | |
502 | #else | |
503 | __u32 offset; | |
504 | #endif | |
505 | /* we maintain a pagecount bias, so that we dont dirty cache line | |
0139aa7b | 506 | * containing page->_refcount every time we allocate a fragment. |
b63ae8ca AD |
507 | */ |
508 | unsigned int pagecnt_bias; | |
509 | bool pfmemalloc; | |
510 | }; | |
511 | ||
/* Integer type that holds VMA flags (see the vm_flags members). */
typedef unsigned long vm_flags_t;
ca16d140 | 513 | |
8feae131 DH |
514 | /* |
515 | * A region containing a mapping of a non-memory backed file under NOMMU | |
516 | * conditions. These are held in a global tree and are pinned by the VMAs that | |
517 | * map parts of them. | |
518 | */ | |
519 | struct vm_region { | |
520 | struct rb_node vm_rb; /* link in global region tree */ | |
ca16d140 | 521 | vm_flags_t vm_flags; /* VMA vm_flags */ |
8feae131 DH |
522 | unsigned long vm_start; /* start address of region */ |
523 | unsigned long vm_end; /* region initialised to here */ | |
dd8632a1 | 524 | unsigned long vm_top; /* region allocated to here */ |
8feae131 DH |
525 | unsigned long vm_pgoff; /* the offset in vm_file corresponding to vm_start */ |
526 | struct file *vm_file; /* the backing file or NULL */ | |
527 | ||
1e2ae599 | 528 | int vm_usage; /* region usage count (access under nommu_region_sem) */ |
cfe79c00 MF |
529 | bool vm_icache_flushed : 1; /* true if the icache has been flushed for |
530 | * this region */ | |
8feae131 DH |
531 | }; |
532 | ||
745f234b AA |
/*
 * Per-VMA userfaultfd context.  With CONFIG_USERFAULTFD it wraps a single
 * context pointer; without it the struct is empty.  NULL_VM_UFFD_CTX is
 * the matching "no context" value.
 */
#ifdef CONFIG_USERFAULTFD
struct vm_userfaultfd_ctx {
	struct userfaultfd_ctx *ctx;
};
#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, })
#else /* !CONFIG_USERFAULTFD */
struct vm_userfaultfd_ctx {};
#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {})
#endif /* CONFIG_USERFAULTFD */
542 | ||
78db3412 SB |
543 | struct anon_vma_name { |
544 | struct kref kref; | |
545 | /* The name needs to be at the end because it is dynamically sized. */ | |
546 | char name[]; | |
547 | }; | |
548 | ||
c7f8f31c SB |
549 | struct vma_lock { |
550 | struct rw_semaphore lock; | |
551 | }; | |
552 | ||
ef6a22b7 MG |
/* State referenced through vm_area_struct's numab_state pointer. */
struct vma_numab_state {
	unsigned long next_scan;
	unsigned long next_pid_reset;
	unsigned long access_pids[2];
};
558 | ||
c92ff1bd | 559 | /* |
552657b7 | 560 | * This struct describes a virtual memory area. There is one of these |
561 | * per VM-area/task. A VM area is any part of the process virtual memory | |
c92ff1bd MS |
562 | * space that has a special rule for the page-fault handlers (ie a shared |
563 | * library, the executable area etc). | |
564 | */ | |
565 | struct vm_area_struct { | |
e4c6bfd2 RR |
566 | /* The first cache line has the info for VMA tree walking. */ |
567 | ||
20cce633 ML |
568 | union { |
569 | struct { | |
570 | /* VMA covers [vm_start; vm_end) addresses within mm */ | |
571 | unsigned long vm_start; | |
572 | unsigned long vm_end; | |
573 | }; | |
574 | #ifdef CONFIG_PER_VMA_LOCK | |
575 | struct rcu_head vm_rcu; /* Used for deferred freeing. */ | |
576 | #endif | |
577 | }; | |
c92ff1bd | 578 | |
e4c6bfd2 | 579 | struct mm_struct *vm_mm; /* The address space we belong to. */ |
28d8b812 | 580 | pgprot_t vm_page_prot; /* Access permissions of this VMA. */ |
bc292ab0 SB |
581 | |
582 | /* | |
583 | * Flags, see mm.h. | |
584 | * To modify use vm_flags_{init|reset|set|clear|mod} functions. | |
585 | */ | |
586 | union { | |
587 | const vm_flags_t vm_flags; | |
588 | vm_flags_t __private __vm_flags; | |
589 | }; | |
e4c6bfd2 | 590 | |
5e31275c | 591 | #ifdef CONFIG_PER_VMA_LOCK |
b1f02b95 JH |
592 | /* |
593 | * Can only be written (using WRITE_ONCE()) while holding both: | |
594 | * - mmap_lock (in write mode) | |
595 | * - vm_lock->lock (in write mode) | |
596 | * Can be read reliably while holding one of: | |
597 | * - mmap_lock (in read or write mode) | |
598 | * - vm_lock->lock (in read or write mode) | |
599 | * Can be read unreliably (using READ_ONCE()) for pessimistic bailout | |
600 | * while holding nothing (except RCU to keep the VMA struct allocated). | |
601 | * | |
602 | * This sequence counter is explicitly allowed to overflow; sequence | |
603 | * counter reuse can only lead to occasional unnecessary use of the | |
604 | * slowpath. | |
605 | */ | |
5e31275c | 606 | int vm_lock_seq; |
c7f8f31c | 607 | struct vma_lock *vm_lock; |
457f67be SB |
608 | |
609 | /* Flag to indicate areas detached from the mm->mm_mt tree */ | |
610 | bool detached; | |
5e31275c SB |
611 | #endif |
612 | ||
c92ff1bd MS |
613 | /* |
614 | * For areas with an address space and backing store, | |
27ba0644 | 615 | * linkage into the address_space->i_mmap interval tree. |
9a10064f | 616 | * |
c92ff1bd | 617 | */ |
d09e8ca6 PT |
618 | struct { |
619 | struct rb_node rb; | |
620 | unsigned long rb_subtree_last; | |
621 | } shared; | |
c92ff1bd MS |
622 | |
623 | /* | |
624 | * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma | |
625 | * list, after a COW of one of the file pages. A MAP_SHARED vma | |
626 | * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack | |
627 | * or brk vma (with NULL file) can only be in an anon_vma list. | |
628 | */ | |
c1e8d7c6 | 629 | struct list_head anon_vma_chain; /* Serialized by mmap_lock & |
5beb4930 | 630 | * page_table_lock */ |
c92ff1bd MS |
631 | struct anon_vma *anon_vma; /* Serialized by page_table_lock */ |
632 | ||
633 | /* Function pointers to deal with this struct. */ | |
f0f37e2f | 634 | const struct vm_operations_struct *vm_ops; |
c92ff1bd MS |
635 | |
636 | /* Information about our backing store: */ | |
637 | unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE | |
ea1754a0 | 638 | units */ |
c92ff1bd MS |
639 | struct file * vm_file; /* File we map to (can be NULL). */ |
640 | void * vm_private_data; /* was vm_pte (shared mem) */ | |
c92ff1bd | 641 | |
d09e8ca6 PT |
642 | #ifdef CONFIG_ANON_VMA_NAME |
643 | /* | |
644 | * For private and shared anonymous mappings, a pointer to a null | |
645 | * terminated string containing the name given to the vma, or NULL if | |
8651a137 | 646 | * unnamed. Serialized by mmap_lock. Use anon_vma_name to access. |
d09e8ca6 PT |
647 | */ |
648 | struct anon_vma_name *anon_name; | |
649 | #endif | |
219f8a2e | 650 | #ifdef CONFIG_SWAP |
ec560175 | 651 | atomic_long_t swap_readahead_info; |
219f8a2e | 652 | #endif |
c92ff1bd | 653 | #ifndef CONFIG_MMU |
8feae131 | 654 | struct vm_region *vm_region; /* NOMMU mapping region */ |
c92ff1bd MS |
655 | #endif |
656 | #ifdef CONFIG_NUMA | |
657 | struct mempolicy *vm_policy; /* NUMA policy for the VMA */ | |
ef6a22b7 MG |
658 | #endif |
659 | #ifdef CONFIG_NUMA_BALANCING | |
660 | struct vma_numab_state *numab_state; /* NUMA Balancing state */ | |
c92ff1bd | 661 | #endif |
745f234b | 662 | struct vm_userfaultfd_ctx vm_userfaultfd_ctx; |
3859a271 | 663 | } __randomize_layout; |
c92ff1bd | 664 | |
223baf9d MD |
#ifdef CONFIG_SCHED_MM_CID
/* Element type of mm_struct's per-cpu pcpu_cid pointer. */
struct mm_cid {
	u64 time;
	int cid;
};
#endif /* CONFIG_SCHED_MM_CID */
671 | ||
db446a08 | 672 | struct kioctx_table; |
c92ff1bd | 673 | struct mm_struct { |
c1a2f7f0 | 674 | struct { |
c1753fd0 MD |
675 | /* |
676 | * Fields which are often written to are placed in a separate | |
677 | * cache line. | |
678 | */ | |
679 | struct { | |
680 | /** | |
681 | * @mm_count: The number of references to &struct | |
682 | * mm_struct (@mm_users count as 1). | |
683 | * | |
684 | * Use mmgrab()/mmdrop() to modify. When this drops to | |
685 | * 0, the &struct mm_struct is freed. | |
686 | */ | |
687 | atomic_t mm_count; | |
688 | } ____cacheline_aligned_in_smp; | |
689 | ||
d4af56c5 | 690 | struct maple_tree mm_mt; |
efc1a3b1 | 691 | #ifdef CONFIG_MMU |
c1a2f7f0 | 692 | unsigned long (*get_unmapped_area) (struct file *filp, |
c92ff1bd MS |
693 | unsigned long addr, unsigned long len, |
694 | unsigned long pgoff, unsigned long flags); | |
efc1a3b1 | 695 | #endif |
c1a2f7f0 RR |
696 | unsigned long mmap_base; /* base of mmap area */ |
697 | unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */ | |
1b028f78 | 698 | #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES |
041711ce | 699 | /* Base addresses for compatible mmap() */ |
c1a2f7f0 RR |
700 | unsigned long mmap_compat_base; |
701 | unsigned long mmap_compat_legacy_base; | |
1b028f78 | 702 | #endif |
c1a2f7f0 | 703 | unsigned long task_size; /* size of task vm space */ |
c1a2f7f0 RR |
704 | pgd_t * pgd; |
705 | ||
227a4aad MD |
706 | #ifdef CONFIG_MEMBARRIER |
707 | /** | |
708 | * @membarrier_state: Flags controlling membarrier behavior. | |
709 | * | |
710 | * This field is close to @pgd to hopefully fit in the same | |
711 | * cache-line, which needs to be touched by switch_mm(). | |
712 | */ | |
713 | atomic_t membarrier_state; | |
714 | #endif | |
715 | ||
c1a2f7f0 RR |
716 | /** |
717 | * @mm_users: The number of users including userspace. | |
718 | * | |
719 | * Use mmget()/mmget_not_zero()/mmput() to modify. When this | |
720 | * drops to 0 (i.e. when the task exits and there are no other | |
721 | * temporary reference holders), we also release a reference on | |
722 | * @mm_count (which may then free the &struct mm_struct if | |
723 | * @mm_count also drops to 0). | |
724 | */ | |
725 | atomic_t mm_users; | |
726 | ||
af7f588d MD |
727 | #ifdef CONFIG_SCHED_MM_CID |
728 | /** | |
223baf9d | 729 | * @pcpu_cid: Per-cpu current cid. |
af7f588d | 730 | * |
223baf9d MD |
731 | * Keep track of the currently allocated mm_cid for each cpu. |
732 | * The per-cpu mm_cid values are serialized by their respective | |
733 | * runqueue locks. | |
af7f588d | 734 | */ |
223baf9d MD |
735 | struct mm_cid __percpu *pcpu_cid; |
736 | /* | |
737 | * @mm_cid_next_scan: Next mm_cid scan (in jiffies). | |
738 | * | |
739 | * When the next mm_cid scan is due (in jiffies). | |
740 | */ | |
741 | unsigned long mm_cid_next_scan; | |
af7f588d | 742 | #endif |
c4812909 | 743 | #ifdef CONFIG_MMU |
3783e172 | 744 | atomic_long_t pgtables_bytes; /* size of all page tables */ |
5a3fbef3 | 745 | #endif |
c1a2f7f0 | 746 | int map_count; /* number of VMAs */ |
481b4bb5 | 747 | |
c1a2f7f0 RR |
748 | spinlock_t page_table_lock; /* Protects page tables and some |
749 | * counters | |
750 | */ | |
2e302543 FT |
751 | /* |
752 | * With some kernel config, the current mmap_lock's offset | |
753 | * inside 'mm_struct' is at 0x120, which is very optimal, as | |
754 | * its two hot fields 'count' and 'owner' sit in 2 different | |
755 | * cachelines, and when mmap_lock is highly contended, both | |
756 | * of the 2 fields will be accessed frequently, current layout | |
757 | * will help to reduce cache bouncing. | |
758 | * | |
759 | * So please be careful with adding new fields before | |
760 | * mmap_lock, which can easily push the 2 fields into one | |
761 | * cacheline. | |
762 | */ | |
da1c55f1 | 763 | struct rw_semaphore mmap_lock; |
c92ff1bd | 764 | |
c1a2f7f0 RR |
765 | struct list_head mmlist; /* List of maybe swapped mm's. These |
766 | * are globally strung together off | |
767 | * init_mm.mmlist, and are protected | |
768 | * by mmlist_lock | |
769 | */ | |
5e31275c | 770 | #ifdef CONFIG_PER_VMA_LOCK |
b1f02b95 JH |
771 | /* |
772 | * This field has lock-like semantics, meaning it is sometimes | |
773 | * accessed with ACQUIRE/RELEASE semantics. | |
774 | * Roughly speaking, incrementing the sequence number is | |
775 | * equivalent to releasing locks on VMAs; reading the sequence | |
776 | * number can be part of taking a read lock on a VMA. | |
777 | * | |
778 | * Can be modified under write mmap_lock using RELEASE | |
779 | * semantics. | |
780 | * Can be read with no other protection when holding write | |
781 | * mmap_lock. | |
782 | * Can be read with ACQUIRE semantics if not holding write | |
783 | * mmap_lock. | |
784 | */ | |
5e31275c SB |
785 | int mm_lock_seq; |
786 | #endif | |
c92ff1bd | 787 | |
c92ff1bd | 788 | |
c1a2f7f0 RR |
789 | unsigned long hiwater_rss; /* High-watermark of RSS usage */ |
790 | unsigned long hiwater_vm; /* High-water virtual memory usage */ | |
c92ff1bd | 791 | |
c1a2f7f0 RR |
792 | unsigned long total_vm; /* Total pages mapped */ |
793 | unsigned long locked_vm; /* Pages that have PG_mlocked set */ | |
70f8a3ca | 794 | atomic64_t pinned_vm; /* Refcount permanently increased */ |
c1a2f7f0 RR |
795 | unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ |
796 | unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ | |
797 | unsigned long stack_vm; /* VM_STACK */ | |
798 | unsigned long def_flags; | |
88aa7cc6 | 799 | |
2e302543 FT |
800 | /** |
801 | * @write_protect_seq: Locked when any thread is write | |
802 | * protecting pages mapped by this mm to enforce a later COW, | |
803 | * for instance during page table copying for fork(). | |
804 | */ | |
805 | seqcount_t write_protect_seq; | |
806 | ||
c1a2f7f0 | 807 | spinlock_t arg_lock; /* protect the below fields */ |
2e302543 | 808 | |
c1a2f7f0 RR |
809 | unsigned long start_code, end_code, start_data, end_data; |
810 | unsigned long start_brk, brk, start_stack; | |
811 | unsigned long arg_start, arg_end, env_start, env_end; | |
c92ff1bd | 812 | |
c1a2f7f0 | 813 | unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ |
c92ff1bd | 814 | |
f1a79412 | 815 | struct percpu_counter rss_stat[NR_MM_COUNTERS]; |
801460d0 | 816 | |
c1a2f7f0 | 817 | struct linux_binfmt *binfmt; |
6345d24d | 818 | |
c1a2f7f0 RR |
819 | /* Architecture-specific MM context */ |
820 | mm_context_t context; | |
c92ff1bd | 821 | |
c1a2f7f0 | 822 | unsigned long flags; /* Must use atomic bitops to access */ |
c92ff1bd | 823 | |
858f0993 | 824 | #ifdef CONFIG_AIO |
c1a2f7f0 RR |
825 | spinlock_t ioctx_lock; |
826 | struct kioctx_table __rcu *ioctx_table; | |
858f0993 | 827 | #endif |
f98bafa0 | 828 | #ifdef CONFIG_MEMCG |
c1a2f7f0 RR |
829 | /* |
830 | * "owner" points to a task that is regarded as the canonical | |
831 | * user/owner of this mm. All of the following must be true in | |
832 | * order for it to be changed: | |
833 | * | |
834 | * current == mm->owner | |
835 | * current->mm != mm | |
836 | * new_owner->mm == mm | |
837 | * new_owner->alloc_lock is held | |
838 | */ | |
839 | struct task_struct __rcu *owner; | |
78fb7466 | 840 | #endif |
c1a2f7f0 | 841 | struct user_namespace *user_ns; |
925d1c40 | 842 | |
c1a2f7f0 RR |
843 | /* store ref to file /proc/<pid>/exe symlink points to */ |
844 | struct file __rcu *exe_file; | |
cddb8a5c | 845 | #ifdef CONFIG_MMU_NOTIFIER |
984cfe4e | 846 | struct mmu_notifier_subscriptions *notifier_subscriptions; |
e7a00c45 | 847 | #endif |
e009bb30 | 848 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS |
c1a2f7f0 | 849 | pgtable_t pmd_huge_pte; /* protected by page_table_lock */ |
cbee9f88 PZ |
850 | #endif |
851 | #ifdef CONFIG_NUMA_BALANCING | |
c1a2f7f0 | 852 | /* |
7014887a DH |
853 | * numa_next_scan is the next time that PTEs will be remapped |
854 | * PROT_NONE to trigger NUMA hinting faults; such faults gather | |
855 | * statistics and migrate pages to new nodes if necessary. | |
c1a2f7f0 RR |
856 | */ |
857 | unsigned long numa_next_scan; | |
cbee9f88 | 858 | |
7014887a | 859 | /* Restart point for scanning and remapping PTEs. */ |
c1a2f7f0 | 860 | unsigned long numa_scan_offset; |
6e5fb223 | 861 | |
7014887a | 862 | /* numa_scan_seq prevents two threads remapping PTEs. */ |
c1a2f7f0 | 863 | int numa_scan_seq; |
20841405 | 864 | #endif |
c1a2f7f0 RR |
865 | /* |
866 | * An operation with batched TLB flushing is going on. Anything | |
867 | * that can move process memory needs to flush the TLB when | |
7014887a | 868 | * moving a PROT_NONE mapped page. |
c1a2f7f0 RR |
869 | */ |
870 | atomic_t tlb_flush_pending; | |
3ea27719 | 871 | #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH |
c1a2f7f0 | 872 | /* See flush_tlb_batched_pending() */ |
5ee2fa2f | 873 | atomic_t tlb_flush_batched; |
6345d24d | 874 | #endif |
c1a2f7f0 | 875 | struct uprobes_state uprobes_state; |
8d491de6 TG |
876 | #ifdef CONFIG_PREEMPT_RT |
877 | struct rcu_head delayed_drop; | |
878 | #endif | |
5d317b2b | 879 | #ifdef CONFIG_HUGETLB_PAGE |
c1a2f7f0 | 880 | atomic_long_t hugetlb_usage; |
5d317b2b | 881 | #endif |
c1a2f7f0 | 882 | struct work_struct async_put_work; |
52ad9bc6 | 883 | |
7a853c2d | 884 | #ifdef CONFIG_IOMMU_SVA |
52ad9bc6 | 885 | u32 pasid; |
76093853 | 886 | #endif |
887 | #ifdef CONFIG_KSM | |
888 | /* | |
889 | * Represent how many pages of this process are involved in KSM | |
6080d19f | 890 | * merging (not including ksm_zero_pages). |
76093853 | 891 | */ |
892 | unsigned long ksm_merging_pages; | |
cb4df4ca | 893 | /* |
894 | * Represent how many pages are checked for ksm merging | |
895 | * including merged and not merged. | |
896 | */ | |
897 | unsigned long ksm_rmap_items; | |
6080d19f | 898 | /* |
899 | * Represent how many empty pages are merged with kernel zero | |
900 | * pages when enabling KSM use_zero_pages. | |
901 | */ | |
902 | unsigned long ksm_zero_pages; | |
903 | #endif /* CONFIG_KSM */ | |
bd74fdae YZ |
904 | #ifdef CONFIG_LRU_GEN |
905 | struct { | |
906 | /* this mm_struct is on lru_gen_mm_list */ | |
907 | struct list_head list; | |
908 | /* | |
909 | * Set when switching to this mm_struct, as a hint of | |
910 | * whether it has been used since the last time per-node | |
911 | * page table walkers cleared the corresponding bits. | |
912 | */ | |
913 | unsigned long bitmap; | |
914 | #ifdef CONFIG_MEMCG | |
915 | /* points to the memcg of "owner" above */ | |
916 | struct mem_cgroup *memcg; | |
917 | #endif | |
918 | } lru_gen; | |
919 | #endif /* CONFIG_LRU_GEN */ | |
c1a2f7f0 RR |
920 | } __randomize_layout; |
921 | ||
922 | /* | |
923 | * The mm_cpumask needs to be at the end of mm_struct, because it | |
924 | * is dynamically sized based on nr_cpu_ids. | |
925 | */ | |
926 | unsigned long cpu_bitmap[]; | |
927 | }; | |
c92ff1bd | 928 | |
/*
 * Combination of the MT_FLAGS_ALLOC_RANGE, MT_FLAGS_LOCK_EXTERN and
 * MT_FLAGS_USE_RCU maple tree flags; see <linux/maple_tree.h> for what
 * each individual flag selects.
 */
#define MM_MT_FLAGS	(MT_FLAGS_ALLOC_RANGE | \
			 MT_FLAGS_LOCK_EXTERN | \
			 MT_FLAGS_USE_RCU)

/* Declaration only; the init_mm object itself is defined elsewhere. */
extern struct mm_struct init_mm;
932 | ||
c1a2f7f0 | 933 | /* Pointer magic because the dynamic array size confuses some compilers. */ |
6345d24d LT |
934 | static inline void mm_init_cpumask(struct mm_struct *mm) |
935 | { | |
c1a2f7f0 RR |
936 | unsigned long cpu_bitmap = (unsigned long)mm; |
937 | ||
938 | cpu_bitmap += offsetof(struct mm_struct, cpu_bitmap); | |
939 | cpumask_clear((struct cpumask *)cpu_bitmap); | |
6345d24d LT |
940 | } |
941 | ||
45e575ab | 942 | /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */ |
de03c72c KM |
943 | static inline cpumask_t *mm_cpumask(struct mm_struct *mm) |
944 | { | |
c1a2f7f0 | 945 | return (struct cpumask *)&mm->cpu_bitmap; |
de03c72c | 946 | } |
45e575ab | 947 | |
bd74fdae YZ |
948 | #ifdef CONFIG_LRU_GEN |
949 | ||
950 | struct lru_gen_mm_list { | |
951 | /* mm_struct list for page table walkers */ | |
952 | struct list_head fifo; | |
953 | /* protects the list above */ | |
954 | spinlock_t lock; | |
955 | }; | |
956 | ||
/* CONFIG_LRU_GEN: these hooks are implemented out of line. */
void lru_gen_add_mm(struct mm_struct *mm);
void lru_gen_del_mm(struct mm_struct *mm);
#ifdef CONFIG_MEMCG
/* Only declared when CONFIG_MEMCG is enabled as well. */
void lru_gen_migrate_mm(struct mm_struct *mm);
#endif
962 | ||
963 | static inline void lru_gen_init_mm(struct mm_struct *mm) | |
964 | { | |
965 | INIT_LIST_HEAD(&mm->lru_gen.list); | |
966 | mm->lru_gen.bitmap = 0; | |
967 | #ifdef CONFIG_MEMCG | |
968 | mm->lru_gen.memcg = NULL; | |
969 | #endif | |
970 | } | |
971 | ||
972 | static inline void lru_gen_use_mm(struct mm_struct *mm) | |
973 | { | |
974 | /* | |
975 | * When the bitmap is set, page reclaim knows this mm_struct has been | |
976 | * used since the last time it cleared the bitmap. So it might be worth | |
977 | * walking the page tables of this mm_struct to clear the accessed bit. | |
978 | */ | |
979 | WRITE_ONCE(mm->lru_gen.bitmap, -1); | |
980 | } | |
981 | ||
982 | #else /* !CONFIG_LRU_GEN */ | |
983 | ||
/* !CONFIG_LRU_GEN: every lru_gen mm hook is an empty stub. */
static inline void lru_gen_add_mm(struct mm_struct *mm) { }
static inline void lru_gen_del_mm(struct mm_struct *mm) { }

#ifdef CONFIG_MEMCG
static inline void lru_gen_migrate_mm(struct mm_struct *mm) { }
#endif

static inline void lru_gen_init_mm(struct mm_struct *mm) { }
static inline void lru_gen_use_mm(struct mm_struct *mm) { }
1005 | ||
1006 | #endif /* CONFIG_LRU_GEN */ | |
1007 | ||
f39af059 MWO |
1008 | struct vma_iterator { |
1009 | struct ma_state mas; | |
1010 | }; | |
1011 | ||
/*
 * Define a struct vma_iterator called @name whose maple state refers to
 * @__mm's mm_mt tree, has index @__addr and node MAS_START.
 */
#define VMA_ITERATOR(name, __mm, __addr)				\
	struct vma_iterator name = {					\
		.mas = {						\
			.tree	= &(__mm)->mm_mt,			\
			.index	= (__addr),				\
			.node	= MAS_START,				\
		},							\
	}
1020 | ||
1021 | static inline void vma_iter_init(struct vma_iterator *vmi, | |
1022 | struct mm_struct *mm, unsigned long addr) | |
1023 | { | |
b62b633e | 1024 | mas_init(&vmi->mas, &mm->mm_mt, addr); |
f39af059 MWO |
1025 | } |
1026 | ||
af7f588d | 1027 | #ifdef CONFIG_SCHED_MM_CID |
223baf9d MD |
1028 | |
/*
 * Special encodings of an mm_cid value.
 *
 * MM_CID_LAZY_PUT is bit 31. MM_CID_UNSET is all ones, so the unset
 * state has the lazy_put flag set too.
 */
enum mm_cid_state {
	MM_CID_UNSET	= -1U,
	MM_CID_LAZY_PUT	= (1U << 31),
};
1033 | ||
1034 | static inline bool mm_cid_is_unset(int cid) | |
1035 | { | |
1036 | return cid == MM_CID_UNSET; | |
1037 | } | |
1038 | ||
1039 | static inline bool mm_cid_is_lazy_put(int cid) | |
1040 | { | |
1041 | return !mm_cid_is_unset(cid) && (cid & MM_CID_LAZY_PUT); | |
1042 | } | |
1043 | ||
1044 | static inline bool mm_cid_is_valid(int cid) | |
1045 | { | |
1046 | return !(cid & MM_CID_LAZY_PUT); | |
1047 | } | |
1048 | ||
1049 | static inline int mm_cid_set_lazy_put(int cid) | |
1050 | { | |
1051 | return cid | MM_CID_LAZY_PUT; | |
1052 | } | |
1053 | ||
1054 | static inline int mm_cid_clear_lazy_put(int cid) | |
1055 | { | |
1056 | return cid & ~MM_CID_LAZY_PUT; | |
1057 | } | |
1058 | ||
af7f588d MD |
1059 | /* Accessor for struct mm_struct's cidmask. */ |
1060 | static inline cpumask_t *mm_cidmask(struct mm_struct *mm) | |
1061 | { | |
1062 | unsigned long cid_bitmap = (unsigned long)mm; | |
1063 | ||
1064 | cid_bitmap += offsetof(struct mm_struct, cpu_bitmap); | |
1065 | /* Skip cpu_bitmap */ | |
1066 | cid_bitmap += cpumask_size(); | |
1067 | return (struct cpumask *)cid_bitmap; | |
1068 | } | |
1069 | ||
1070 | static inline void mm_init_cid(struct mm_struct *mm) | |
1071 | { | |
223baf9d MD |
1072 | int i; |
1073 | ||
1074 | for_each_possible_cpu(i) { | |
1075 | struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i); | |
1076 | ||
1077 | pcpu_cid->cid = MM_CID_UNSET; | |
1078 | pcpu_cid->time = 0; | |
1079 | } | |
af7f588d MD |
1080 | cpumask_clear(mm_cidmask(mm)); |
1081 | } | |
1082 | ||
223baf9d MD |
1083 | static inline int mm_alloc_cid(struct mm_struct *mm) |
1084 | { | |
1085 | mm->pcpu_cid = alloc_percpu(struct mm_cid); | |
1086 | if (!mm->pcpu_cid) | |
1087 | return -ENOMEM; | |
1088 | mm_init_cid(mm); | |
1089 | return 0; | |
1090 | } | |
1091 | ||
1092 | static inline void mm_destroy_cid(struct mm_struct *mm) | |
1093 | { | |
1094 | free_percpu(mm->pcpu_cid); | |
1095 | mm->pcpu_cid = NULL; | |
1096 | } | |
1097 | ||
/* Size of the cidmask: the same as cpumask_size(). */
static inline unsigned int mm_cid_size(void)
{
	unsigned int size = cpumask_size();

	return size;
}
1102 | #else /* CONFIG_SCHED_MM_CID */ | |
/*
 * !CONFIG_SCHED_MM_CID stubs: nothing to set up or tear down,
 * allocation always reports success and the cidmask has size 0.
 */
static inline void mm_init_cid(struct mm_struct *mm)
{
}

static inline int mm_alloc_cid(struct mm_struct *mm)
{
	return 0;
}

static inline void mm_destroy_cid(struct mm_struct *mm)
{
}

static inline unsigned int mm_cid_size(void) { return 0; }
1110 | #endif /* CONFIG_SCHED_MM_CID */ | |
1111 | ||
/* Forward declarations; only pointers to these types are used below. */
struct mmu_gather;
struct vm_fault;

/* mmu_gather entry points, implemented outside this header. */
extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
extern void tlb_finish_mmu(struct mmu_gather *tlb);
1118 | ||
3d353901 SJ |
1119 | /** |
1120 | * typedef vm_fault_t - Return type for page fault handlers. | |
1121 | * | |
1122 | * Page fault handlers return a bitmask of %VM_FAULT values. | |
1123 | */ | |
1124 | typedef __bitwise unsigned int vm_fault_t; | |
1125 | ||
1126 | /** | |
1127 | * enum vm_fault_reason - Page fault handlers return a bitmask of | |
1128 | * these values to tell the core VM what happened when handling the | |
1129 | * fault. Used to decide whether a process gets delivered SIGBUS or | |
1130 | * just gets major/minor fault counters bumped up. | |
1131 | * | |
1132 | * @VM_FAULT_OOM: Out Of Memory | |
1133 | * @VM_FAULT_SIGBUS: Bad access | |
1134 | * @VM_FAULT_MAJOR: Page read from storage | |
3d353901 SJ |
1135 | * @VM_FAULT_HWPOISON: Hit poisoned small page |
1136 | * @VM_FAULT_HWPOISON_LARGE: Hit poisoned large page. Index encoded | |
1137 | * in upper bits | |
1138 | * @VM_FAULT_SIGSEGV: segmentation fault | |
1139 | * @VM_FAULT_NOPAGE: ->fault installed the pte, not return page | |
1140 | * @VM_FAULT_LOCKED: ->fault locked the returned page | |
1141 | * @VM_FAULT_RETRY: ->fault blocked, must retry | |
1142 | * @VM_FAULT_FALLBACK: huge page fault failed, fall back to small | |
1143 | * @VM_FAULT_DONE_COW: ->fault has fully handled COW | |
1144 | * @VM_FAULT_NEEDDSYNC: ->fault did not modify page tables and needs | |
1145 | * fsync() to complete (for synchronous page faults | |
1146 | * in DAX) | |
d9272525 | 1147 | * @VM_FAULT_COMPLETED: ->fault completed, meanwhile mmap lock released |
3d353901 SJ |
1148 | * @VM_FAULT_HINDEX_MASK: mask HINDEX value |
1149 | * | |
1150 | */ | |
1151 | enum vm_fault_reason { | |
1152 | VM_FAULT_OOM = (__force vm_fault_t)0x000001, | |
1153 | VM_FAULT_SIGBUS = (__force vm_fault_t)0x000002, | |
1154 | VM_FAULT_MAJOR = (__force vm_fault_t)0x000004, | |
3d353901 SJ |
1155 | VM_FAULT_HWPOISON = (__force vm_fault_t)0x000010, |
1156 | VM_FAULT_HWPOISON_LARGE = (__force vm_fault_t)0x000020, | |
1157 | VM_FAULT_SIGSEGV = (__force vm_fault_t)0x000040, | |
1158 | VM_FAULT_NOPAGE = (__force vm_fault_t)0x000100, | |
1159 | VM_FAULT_LOCKED = (__force vm_fault_t)0x000200, | |
1160 | VM_FAULT_RETRY = (__force vm_fault_t)0x000400, | |
1161 | VM_FAULT_FALLBACK = (__force vm_fault_t)0x000800, | |
1162 | VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000, | |
1163 | VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000, | |
d9272525 | 1164 | VM_FAULT_COMPLETED = (__force vm_fault_t)0x004000, |
3d353901 SJ |
1165 | VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000, |
1166 | }; | |
1167 | ||
/*
 * Encode/decode the hstate index for a hwpoisoned large page: the index
 * is kept in the four bits starting at bit 16 of the fault result.
 */
#define VM_FAULT_SET_HINDEX(x)	((__force vm_fault_t)((x) << 16))
#define VM_FAULT_GET_HINDEX(x)	(((__force unsigned int)(x) >> 16) & 0xf)

/* The set of fault result bits grouped together as errors. */
#define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS |	\
			 VM_FAULT_SIGSEGV | VM_FAULT_HWPOISON |	\
			 VM_FAULT_HWPOISON_LARGE | VM_FAULT_FALLBACK)

/* { value, "name" } pairs for the single-bit fault results. */
#define VM_FAULT_RESULT_TRACE \
	{ VM_FAULT_OOM,			"OOM" },		\
	{ VM_FAULT_SIGBUS,		"SIGBUS" },		\
	{ VM_FAULT_MAJOR,		"MAJOR" },		\
	{ VM_FAULT_HWPOISON,		"HWPOISON" },		\
	{ VM_FAULT_HWPOISON_LARGE,	"HWPOISON_LARGE" },	\
	{ VM_FAULT_SIGSEGV,		"SIGSEGV" },		\
	{ VM_FAULT_NOPAGE,		"NOPAGE" },		\
	{ VM_FAULT_LOCKED,		"LOCKED" },		\
	{ VM_FAULT_RETRY,		"RETRY" },		\
	{ VM_FAULT_FALLBACK,		"FALLBACK" },		\
	{ VM_FAULT_DONE_COW,		"DONE_COW" },		\
	{ VM_FAULT_NEEDDSYNC,		"NEEDDSYNC" },		\
	{ VM_FAULT_COMPLETED,		"COMPLETED" }
3d353901 | 1190 | |
f872f540 AL |
1191 | struct vm_special_mapping { |
1192 | const char *name; /* The name, e.g. "[vdso]". */ | |
1193 | ||
1194 | /* | |
1195 | * If .fault is not provided, this points to a | |
1196 | * NULL-terminated array of pages that back the special mapping. | |
1197 | * | |
1198 | * This must not be NULL unless .fault is provided. | |
1199 | */ | |
a62c34bd | 1200 | struct page **pages; |
f872f540 AL |
1201 | |
1202 | /* | |
1203 | * If non-NULL, then this is called to resolve page faults | |
1204 | * on the special mapping. If used, .pages is not checked. | |
1205 | */ | |
b3ec9f33 SJ |
1206 | vm_fault_t (*fault)(const struct vm_special_mapping *sm, |
1207 | struct vm_area_struct *vma, | |
1208 | struct vm_fault *vmf); | |
b059a453 DS |
1209 | |
1210 | int (*mremap)(const struct vm_special_mapping *sm, | |
1211 | struct vm_area_struct *new_vma); | |
a62c34bd AL |
1212 | }; |
1213 | ||
/*
 * Reasons for a TLB flush. Values are consecutive from 0, so
 * NR_TLB_FLUSH_REASONS is the number of reasons and must stay last.
 */
enum tlb_flush_reason {
	TLB_FLUSH_ON_TASK_SWITCH,
	TLB_REMOTE_SHOOTDOWN,
	TLB_LOCAL_SHOOTDOWN,
	TLB_LOCAL_MM_SHOOTDOWN,
	TLB_REMOTE_SEND_IPI,

	NR_TLB_FLUSH_REASONS,
};
1222 | ||
/**
 * enum fault_flag - Fault flag definitions.
 * @FAULT_FLAG_WRITE: Fault was a write fault.
 * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE.
 * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked.
 * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying.
 * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region.
 * @FAULT_FLAG_TRIED: The fault has been tried once.
 * @FAULT_FLAG_USER: The fault originated in userspace.
 * @FAULT_FLAG_REMOTE: The fault is not for current task/mm.
 * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch.
 * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals.
 * @FAULT_FLAG_UNSHARE: The fault is an unsharing request to break COW in a
 *                      COW mapping, making sure that an exclusive anon page is
 *                      mapped after the fault.
 * @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached.
 *                        We should only access orig_pte if this flag set.
 * @FAULT_FLAG_VMA_LOCK: The fault is handled under VMA lock.
 *
 * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
 * whether we would allow page faults to retry by specifying these two
 * fault flags correctly.  Currently there can be three legal combinations:
 *
 * (a) ALLOW_RETRY and !TRIED:  this means the page fault allows retry, and
 *                              this is the first try
 *
 * (b) ALLOW_RETRY and TRIED:   this means the page fault allows retry, and
 *                              we've already tried at least once
 *
 * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry
 *
 * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never
 * be used.  Note that page faults can be allowed to retry for multiple times,
 * in which case we'll have an initial fault with flags (a) then later on
 * continuous faults with flags (b).  We should always try to detect pending
 * signals before a retry to make sure the continuous page faults can still be
 * interrupted if necessary.
 *
 * The combination FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE is illegal.
 * FAULT_FLAG_UNSHARE is ignored and treated like an ordinary read fault when
 * applied to mappings that are not COW mappings.
 */
enum fault_flag {
	FAULT_FLAG_WRITE		= 1 << 0,
	FAULT_FLAG_MKWRITE		= 1 << 1,
	FAULT_FLAG_ALLOW_RETRY		= 1 << 2,
	FAULT_FLAG_RETRY_NOWAIT		= 1 << 3,
	FAULT_FLAG_KILLABLE		= 1 << 4,
	FAULT_FLAG_TRIED		= 1 << 5,
	FAULT_FLAG_USER			= 1 << 6,
	FAULT_FLAG_REMOTE		= 1 << 7,
	FAULT_FLAG_INSTRUCTION		= 1 << 8,
	FAULT_FLAG_INTERRUPTIBLE	= 1 << 9,
	FAULT_FLAG_UNSHARE		= 1 << 10,
	FAULT_FLAG_ORIG_PTE_VALID	= 1 << 11,
	FAULT_FLAG_VMA_LOCK		= 1 << 12,
};
1280 | ||
05e90bd0 PX |
1281 | typedef unsigned int __bitwise zap_flags_t; |
1282 | ||
/*
 * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each
 * other. Here is what they mean, and how to use them:
 *
 * FIXME: For pages which are part of a filesystem, mappings are subject to the
 * lifetime enforced by the filesystem and we need guarantees that longterm
 * users like RDMA and V4L2 only establish mappings which coordinate usage with
 * the filesystem.  Ideas for this coordination include revoking the longterm
 * pin, delaying writeback, bounce buffer page writeback, etc.  As FS DAX was
 * added after the problem with filesystems was found, FS DAX VMAs are
 * specifically failed.  Filesystem pages are still subject to bugs and use of
 * FOLL_LONGTERM should be avoided on those pages.
 *
 * In the CMA case: long term pins in a CMA region would unnecessarily fragment
 * that region.  And so, CMA attempts to migrate the page before pinning, when
 * FOLL_LONGTERM is specified.
 *
 * FOLL_PIN indicates that a special kind of tracking (not just page->_refcount,
 * but an additional pin counting system) will be invoked. This is intended for
 * anything that gets a page reference and then touches page data (for example,
 * Direct IO). This lets the filesystem know that some non-file-system entity is
 * potentially changing the pages' data. In contrast to FOLL_GET (whose pages
 * are released via put_page()), FOLL_PIN pages must be released, ultimately, by
 * a call to unpin_user_page().
 *
 * FOLL_PIN is similar to FOLL_GET: both of these pin pages. They use different
 * and separate refcounting mechanisms, however, and that means that each has
 * its own acquire and release mechanisms:
 *
 *     FOLL_GET: get_user_pages*() to acquire, and put_page() to release.
 *
 *     FOLL_PIN: pin_user_pages*() to acquire, and unpin_user_pages to release.
 *
 * FOLL_PIN and FOLL_GET are mutually exclusive for a given function call.
 * (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based
 * calls applied to them, and that's perfectly OK. This is a constraint on the
 * callers, not on the pages.)
 *
 * FOLL_PIN should be set internally by the pin_user_pages*() APIs, never
 * directly by the caller. That's in order to help avoid mismatches when
 * releasing pages: get_user_pages*() pages must be released via put_page(),
 * while pin_user_pages*() pages must be released via unpin_user_page().
 *
 * Please see Documentation/core-api/pin_user_pages.rst for more information.
 */

enum {
	/* check pte is writable */
	FOLL_WRITE		= 1 << 0,
	/* do get_page on page */
	FOLL_GET		= 1 << 1,
	/* give error on hole if it would be zero */
	FOLL_DUMP		= 1 << 2,
	/* get_user_pages read/write w/o permission */
	FOLL_FORCE		= 1 << 3,
	/*
	 * if a disk transfer is needed, start the IO and return without
	 * waiting upon it
	 */
	FOLL_NOWAIT		= 1 << 4,
	/* do not fault in pages */
	FOLL_NOFAULT		= 1 << 5,
	/* check page is hwpoisoned */
	FOLL_HWPOISON		= 1 << 6,
	/* don't do file mappings */
	FOLL_ANON		= 1 << 7,
	/*
	 * FOLL_LONGTERM indicates that the page will be held for an indefinite
	 * time period _often_ under userspace control.  This is in contrast to
	 * iov_iter_get_pages(), whose usages are transient.
	 */
	FOLL_LONGTERM		= 1 << 8,
	/* split huge pmd before returning */
	FOLL_SPLIT_PMD		= 1 << 9,
	/* allow returning PCI P2PDMA pages */
	FOLL_PCI_P2PDMA		= 1 << 10,
	/* allow interrupts from generic signals */
	FOLL_INTERRUPTIBLE	= 1 << 11,
	/*
	 * Always honor (trigger) NUMA hinting faults.
	 *
	 * FOLL_WRITE implicitly honors NUMA hinting faults because a
	 * PROT_NONE-mapped page is not writable (exceptions with FOLL_FORCE
	 * apply). get_user_pages_fast_only() always implicitly honors NUMA
	 * hinting faults.
	 */
	FOLL_HONOR_NUMA_FAULT	= 1 << 12,

	/* See also internal only FOLL flags in mm/internal.h */
};
b5054174 | 1374 | |
5b99cd0e | 1375 | #endif /* _LINUX_MM_TYPES_H */ |