Commit | Line | Data |
---|---|---|
b2441318 | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
5b99cd0e HC |
2 | #ifndef _LINUX_MM_TYPES_H |
3 | #define _LINUX_MM_TYPES_H | |
4 | ||
2e58f173 IM |
5 | #include <linux/mm_types_task.h> |
6 | ||
4f9a58d7 | 7 | #include <linux/auxvec.h> |
78db3412 | 8 | #include <linux/kref.h> |
5b99cd0e HC |
9 | #include <linux/list.h> |
10 | #include <linux/spinlock.h> | |
c92ff1bd | 11 | #include <linux/rbtree.h> |
d4af56c5 | 12 | #include <linux/maple_tree.h> |
c92ff1bd MS |
13 | #include <linux/rwsem.h> |
14 | #include <linux/completion.h> | |
cddb8a5c | 15 | #include <linux/cpumask.h> |
d4b3b638 | 16 | #include <linux/uprobes.h> |
8d491de6 | 17 | #include <linux/rcupdate.h> |
bbeae5b0 | 18 | #include <linux/page-flags-layout.h> |
ec8d7c14 | 19 | #include <linux/workqueue.h> |
57efa1fe | 20 | #include <linux/seqlock.h> |
f1a79412 | 21 | #include <linux/percpu_counter.h> |
2e58f173 | 22 | |
c92ff1bd | 23 | #include <asm/mmu.h> |
5b99cd0e | 24 | |
4f9a58d7 OH |
25 | #ifndef AT_VECTOR_SIZE_ARCH |
26 | #define AT_VECTOR_SIZE_ARCH 0 | |
27 | #endif | |
28 | #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) | |
29 | ||
82e69a12 | 30 | #define INIT_PASID 0 |
1c8f4220 | 31 | |
5b99cd0e | 32 | struct address_space; |
1306a85a | 33 | struct mem_cgroup; |
5b99cd0e HC |
34 | |
35 | /* | |
36 | * Each physical page in the system has a struct page associated with | |
37 | * it to keep track of whatever it is we are using the page for at the | |
38 | * moment. Note that we have no way to track which tasks are using | |
39 | * a page, though if it is a pagecache page, rmap structures can tell us | |
97b4a671 | 40 | * who is mapping it. |
be50015d | 41 | * |
97b4a671 MW |
42 | * If you allocate the page using alloc_pages(), you can use some of the |
43 | * space in struct page for your own purposes. The five words in the main | |
44 | * union are available, except for bit 0 of the first word which must be | |
45 | * kept clear. Many users use this word to store a pointer to an object | |
46 | * which is guaranteed to be aligned. If you use the same storage as | |
47 | * page->mapping, you must restore it to NULL before freeing the page. | |
be50015d | 48 | * |
97b4a671 MW |
49 | * If your page will not be mapped to userspace, you can also use the four |
50 | * bytes in the mapcount union, but you must call page_mapcount_reset() | |
51 | * before freeing it. | |
52 | * | |
53 | * If you want to use the refcount field, it must be used in such a way | |
54 | * that other CPUs temporarily incrementing and then decrementing the | |
55 | * refcount does not cause problems. On receiving the page from | |
56 | * alloc_pages(), the refcount will be positive. | |
57 | * | |
58 | * If you allocate pages of order > 0, you can use some of the fields | |
59 | * in each subpage, but you may need to restore some of their values | |
60 | * afterwards. | |
fc9bb8c7 | 61 | * |
d122019b MWO |
62 | * SLUB uses cmpxchg_double() to atomically update its freelist and counters. |
63 | * That requires that freelist & counters in struct slab be adjacent and | |
64 | * double-word aligned. Because struct slab currently just reinterprets the | |
65 | * bits of struct page, we align all struct pages to double-word boundaries, | |
66 | * and ensure that 'freelist' is aligned within struct slab. | |
5b99cd0e | 67 | */ |
e20df2c6 MW |
68 | #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE |
69 | #define _struct_page_alignment __aligned(2 * sizeof(unsigned long)) | |
70 | #else | |
70fb4fdf | 71 | #define _struct_page_alignment __aligned(sizeof(unsigned long)) |
7d27a04b | 72 | #endif |
e20df2c6 | 73 | |
5b99cd0e HC |
74 | struct page { |
75 | unsigned long flags; /* Atomic flags, some possibly | |
76 | * updated asynchronously */ | |
b7ccc7f8 | 77 | /* |
4da1984e MW |
78 | * Five words (20/40 bytes) are available in this union. |
79 | * WARNING: bit 0 of the first word is used for PageTail(). That | |
80 | * means the other users of this union MUST NOT use the bit to | |
b7ccc7f8 MW |
81 | * avoid collision and false-positive PageTail(). |
82 | */ | |
8456a648 | 83 | union { |
66a6ffd2 | 84 | struct { /* Page cache and anonymous pages */ |
4da1984e MW |
85 | /** |
86 | * @lru: Pageout list, eg. active_list protected by | |
15b44736 | 87 | * lruvec->lru_lock. Sometimes used as a generic list |
4da1984e MW |
88 | * by the page owner. |
89 | */ | |
07ca7606 HD |
90 | union { |
91 | struct list_head lru; | |
bf75f200 | 92 | |
07ca7606 HD |
93 | /* Or, for the Unevictable "LRU list" slot */ |
94 | struct { | |
95 | /* Always even, to negate PageTail */ | |
96 | void *__filler; | |
97 | /* Count page's or folio's mlocks */ | |
98 | unsigned int mlock_count; | |
99 | }; | |
bf75f200 MG |
100 | |
101 | /* Or, free page */ | |
102 | struct list_head buddy_list; | |
103 | struct list_head pcp_list; | |
07ca7606 | 104 | }; |
66a6ffd2 MW |
105 | /* See page-flags.h for PAGE_MAPPING_FLAGS */ |
106 | struct address_space *mapping; | |
16900426 SR |
107 | union { |
108 | pgoff_t index; /* Our offset within mapping. */ | |
109 | unsigned long share; /* share count for fsdax */ | |
110 | }; | |
66a6ffd2 MW |
111 | /** |
112 | * @private: Mapping-private opaque data. | |
113 | * Usually used for buffer_heads if PagePrivate. | |
114 | * Used for swp_entry_t if PageSwapCache. | |
115 | * Indicates order in the buddy system if PageBuddy. | |
116 | */ | |
117 | unsigned long private; | |
118 | }; | |
c25fff71 | 119 | struct { /* page_pool used by netstack */ |
c07aea3e MC |
120 | /** |
121 | * @pp_magic: magic value to avoid recycling non | |
122 | * page_pool allocated pages. | |
123 | */ | |
124 | unsigned long pp_magic; | |
125 | struct page_pool *pp; | |
126 | unsigned long _pp_mapping_pad; | |
0e9d2a0a | 127 | unsigned long dma_addr; |
f915b75b YL |
128 | union { |
129 | /** | |
130 | * dma_addr_upper: might require a 64-bit | |
131 | * value on 32-bit architectures. | |
132 | */ | |
133 | unsigned long dma_addr_upper; | |
134 | /** | |
135 | * For frag page support, not supported in | |
136 | * 32-bit architectures with 64-bit DMA. | |
137 | */ | |
138 | atomic_long_t pp_frag_count; | |
139 | }; | |
c25fff71 | 140 | }; |
4da1984e MW |
141 | struct { /* Tail pages of compound page */ |
142 | unsigned long compound_head; /* Bit zero is set */ | |
dad6a5eb | 143 | }; |
66a6ffd2 | 144 | struct { /* Page table pages */ |
4da1984e MW |
145 | unsigned long _pt_pad_1; /* compound_head */ |
146 | pgtable_t pmd_huge_pte; /* protected by page->ptl */ | |
66a6ffd2 | 147 | unsigned long _pt_pad_2; /* mapping */ |
4231aba0 NP |
148 | union { |
149 | struct mm_struct *pt_mm; /* x86 pgds only */ | |
150 | atomic_t pt_frag_refcount; /* powerpc */ | |
151 | }; | |
7d27a04b | 152 | #if ALLOC_SPLIT_PTLOCKS |
66a6ffd2 | 153 | spinlock_t *ptl; |
7d27a04b | 154 | #else |
66a6ffd2 | 155 | spinlock_t ptl; |
7d27a04b | 156 | #endif |
7d27a04b | 157 | }; |
50e7fbc3 MW |
158 | struct { /* ZONE_DEVICE pages */ |
159 | /** @pgmap: Points to the hosting device page map. */ | |
160 | struct dev_pagemap *pgmap; | |
8a164fef | 161 | void *zone_device_data; |
76470ccd RC |
162 | /* |
163 | * ZONE_DEVICE private pages are counted as being | |
164 | * mapped so the next 3 words hold the mapping, index, | |
165 | * and private fields from the source anonymous or | |
166 | * page cache page while the page is migrated to device | |
167 | * private memory. | |
168 | * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also | |
169 | * use the mapping, index, and private fields when | |
170 | * pmem backed DAX files are mapped. | |
171 | */ | |
50e7fbc3 | 172 | }; |
4da1984e MW |
173 | |
174 | /** @rcu_head: You can use this to free a page by RCU. */ | |
175 | struct rcu_head rcu_head; | |
7d27a04b MW |
176 | }; |
177 | ||
b21999da MW |
178 | union { /* This union is 4 bytes in size. */ |
179 | /* | |
180 | * If the page can be mapped to userspace, encodes the number | |
181 | * of times this page is referenced by a page table. | |
182 | */ | |
183 | atomic_t _mapcount; | |
184 | ||
6e292b9b MW |
185 | /* |
186 | * If the page is neither PageSlab nor mappable to userspace, | |
187 | * the value stored here may help determine what this page | |
188 | * is used for. See page-flags.h for a list of page types | |
189 | * which are currently stored here. | |
190 | */ | |
191 | unsigned int page_type; | |
81819f0f | 192 | }; |
fc9bb8c7 | 193 | |
b21999da MW |
194 | /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */ |
195 | atomic_t _refcount; | |
196 | ||
1306a85a | 197 | #ifdef CONFIG_MEMCG |
bcfe06bf | 198 | unsigned long memcg_data; |
1306a85a JW |
199 | #endif |
200 | ||
5b99cd0e HC |
201 | /* |
202 | * On machines where all RAM is mapped into kernel address space, | |
203 | * we can simply calculate the virtual address. On machines with | |
204 | * highmem some memory is mapped into kernel virtual memory | |
205 | * dynamically, so we need a place to store that address. | |
206 | * Note that this field could be 16 bits on x86 ... ;) | |
207 | * | |
208 | * Architectures with slow multiplication can define | |
209 | * WANT_PAGE_VIRTUAL in asm/page.h | |
210 | */ | |
211 | #if defined(WANT_PAGE_VIRTUAL) | |
212 | void *virtual; /* Kernel virtual address (NULL if | |
213 | not kmapped, ie. highmem) */ | |
214 | #endif /* WANT_PAGE_VIRTUAL */ | |
dfec072e | 215 | |
f80be457 AP |
216 | #ifdef CONFIG_KMSAN |
217 | /* | |
218 | * KMSAN metadata for this page: | |
219 | * - shadow page: every bit indicates whether the corresponding | |
220 | * bit of the original page is initialized (0) or not (1); | |
221 | * - origin page: every 4 bytes contain an id of the stack trace | |
222 | * where the uninitialized value was created. | |
223 | */ | |
224 | struct page *kmsan_shadow; | |
225 | struct page *kmsan_origin; | |
226 | #endif | |
227 | ||
90572890 PZ |
228 | #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS |
229 | int _last_cpupid; | |
57e0a030 | 230 | #endif |
e20df2c6 | 231 | } _struct_page_alignment; |
5b99cd0e | 232 | |
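/*
 * Illustrative sketch (editor's addition, not part of mm_types.h): one way a
 * driver that gets its pages from alloc_pages() can reuse the union above,
 * following the rules in the comment before struct page. 'struct my_ctx' and
 * the function names are hypothetical; alloc_pages()/__free_pages() come from
 * <linux/gfp.h>.
 */
struct my_ctx { int id; };

static struct page *my_ctx_alloc_page(struct my_ctx *ctx)
{
	struct page *page = alloc_pages(GFP_KERNEL, 0);

	if (!page)
		return NULL;
	/* ctx is word-aligned, so bit 0 of the first union word stays clear. */
	page->lru.next = (struct list_head *)ctx;
	return page;
}

static struct my_ctx *my_ctx_from_page(struct page *page)
{
	return (struct my_ctx *)page->lru.next;
}

static void my_ctx_free_page(struct page *page)
{
	/*
	 * Had the storage overlapping page->mapping been reused, it would
	 * have to be set back to NULL here, before the page is freed.
	 */
	__free_pages(page, 0);
}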
70fb4fdf LT |
233 | /* |
234 | * struct encoded_page - a nonexistent type marking this pointer | |
235 | * | |
236 | * An 'encoded_page' pointer is a pointer to a regular 'struct page', but | |
237 | * with the low bits of the pointer indicating extra context-dependent | |
238 | * information. Not super-common, but happens in mmu_gather and mlock | |
239 | * handling, and this acts as a type system check on that use. | |
240 | * | |
241 | * We only really have two guaranteed bits in general, although you could | |
242 | * play with 'struct page' alignment (see CONFIG_HAVE_ALIGNED_STRUCT_PAGE) | |
243 | * for more. | |
244 | * | |
245 | * Use the supplied helper functions to encode/decode the pointer and bits. |
246 | */ | |
247 | struct encoded_page; | |
248 | #define ENCODE_PAGE_BITS 3ul | |
249 | static __always_inline struct encoded_page *encode_page(struct page *page, unsigned long flags) | |
250 | { | |
251 | BUILD_BUG_ON(flags > ENCODE_PAGE_BITS); | |
252 | return (struct encoded_page *)(flags | (unsigned long)page); | |
253 | } | |
254 | ||
255 | static inline unsigned long encoded_page_flags(struct encoded_page *page) | |
256 | { | |
257 | return ENCODE_PAGE_BITS & (unsigned long)page; | |
258 | } | |
259 | ||
260 | static inline struct page *encoded_page_ptr(struct encoded_page *page) | |
261 | { | |
262 | return (struct page *)(~ENCODE_PAGE_BITS & (unsigned long)page); | |
263 | } | |
264 | ||
7b230db3 MWO |
265 | /** |
266 | * struct folio - Represents a contiguous set of bytes. | |
267 | * @flags: Identical to the page flags. | |
268 | * @lru: Least Recently Used list; tracks how recently this folio was used. | |
334f6f53 | 269 | * @mlock_count: Number of times this folio has been pinned by mlock(). |
7b230db3 MWO |
270 | * @mapping: The file this page belongs to, or refers to the anon_vma for |
271 | * anonymous memory. | |
272 | * @index: Offset within the file, in units of pages. For anonymous memory, | |
273 | * this is the index from the beginning of the mmap. | |
274 | * @private: Filesystem per-folio data (see folio_attach_private()). | |
275 | * Used for swp_entry_t if folio_test_swapcache(). | |
276 | * @_mapcount: Do not access this member directly. Use folio_mapcount() to | |
277 | * find out how many times this folio is mapped by userspace. | |
278 | * @_refcount: Do not access this member directly. Use folio_ref_count() | |
279 | * to find how many references there are to this folio. | |
280 | * @memcg_data: Memory Control Group data. | |
379708ff MWO |
281 | * @_folio_dtor: Which destructor to use for this folio. |
282 | * @_folio_order: Do not use directly, call folio_order(). | |
b14224fb | 283 | * @_entire_mapcount: Do not use directly, call folio_entire_mapcount(). |
eec20426 | 284 | * @_nr_pages_mapped: Do not use directly, call folio_mapcount(). |
379708ff MWO |
285 | * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). |
286 | * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). | |
dad6a5eb HD |
287 | * @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h. |
288 | * @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h. | |
289 | * @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h. | |
290 | * @_hugetlb_hwpoison: Do not use directly, call raw_hwp_list_head(). | |
4375a553 | 291 | * @_deferred_list: Folios to be split under memory pressure. |
7b230db3 MWO |
292 | * |
293 | * A folio is a physically, virtually and logically contiguous set | |
294 | * of bytes. It is a power-of-two in size, and it is aligned to that | |
295 | * same power-of-two. It is at least as large as %PAGE_SIZE. If it is | |
296 | * in the page cache, it is at a file offset which is a multiple of that | |
297 | * power-of-two. It may be mapped into userspace at an address which is | |
298 | * at an arbitrary page offset, but its kernel virtual address is aligned | |
299 | * to its size. | |
300 | */ | |
301 | struct folio { | |
302 | /* private: don't document the anon union */ | |
303 | union { | |
304 | struct { | |
305 | /* public: */ | |
306 | unsigned long flags; | |
07ca7606 HD |
307 | union { |
308 | struct list_head lru; | |
334f6f53 | 309 | /* private: avoid cluttering the output */ |
07ca7606 HD |
310 | struct { |
311 | void *__filler; | |
334f6f53 | 312 | /* public: */ |
07ca7606 | 313 | unsigned int mlock_count; |
334f6f53 | 314 | /* private: */ |
07ca7606 | 315 | }; |
334f6f53 | 316 | /* public: */ |
07ca7606 | 317 | }; |
7b230db3 MWO |
318 | struct address_space *mapping; |
319 | pgoff_t index; | |
320 | void *private; | |
321 | atomic_t _mapcount; | |
322 | atomic_t _refcount; | |
323 | #ifdef CONFIG_MEMCG | |
324 | unsigned long memcg_data; | |
325 | #endif | |
326 | /* private: the union with struct page is transitional */ | |
327 | }; | |
328 | struct page page; | |
329 | }; | |
dad6a5eb HD |
330 | union { |
331 | struct { | |
332 | unsigned long _flags_1; | |
333 | unsigned long _head_1; | |
a8d55327 | 334 | /* public: */ |
dad6a5eb HD |
335 | unsigned char _folio_dtor; |
336 | unsigned char _folio_order; | |
b14224fb | 337 | atomic_t _entire_mapcount; |
eec20426 | 338 | atomic_t _nr_pages_mapped; |
dad6a5eb | 339 | atomic_t _pincount; |
379708ff | 340 | #ifdef CONFIG_64BIT |
dad6a5eb | 341 | unsigned int _folio_nr_pages; |
379708ff | 342 | #endif |
a8d55327 | 343 | /* private: the union with struct page is transitional */ |
dad6a5eb HD |
344 | }; |
345 | struct page __page_1; | |
346 | }; | |
347 | union { | |
348 | struct { | |
349 | unsigned long _flags_2; | |
350 | unsigned long _head_2; | |
a8d55327 | 351 | /* public: */ |
dad6a5eb HD |
352 | void *_hugetlb_subpool; |
353 | void *_hugetlb_cgroup; | |
354 | void *_hugetlb_cgroup_rsvd; | |
355 | void *_hugetlb_hwpoison; | |
4375a553 MWO |
356 | /* private: the union with struct page is transitional */ |
357 | }; | |
358 | struct { | |
359 | unsigned long _flags_2a; | |
360 | unsigned long _head_2a; | |
361 | /* public: */ | |
362 | struct list_head _deferred_list; | |
a8d55327 | 363 | /* private: the union with struct page is transitional */ |
dad6a5eb HD |
364 | }; |
365 | struct page __page_2; | |
366 | }; | |
7b230db3 MWO |
367 | }; |
368 | ||
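/*
 * Illustrative sketch (editor's addition): the accessors named in the
 * kernel-doc above are the supported way to read folio state; the raw fields
 * are never touched directly. folio_order(), folio_nr_pages(),
 * folio_mapcount() and folio_ref_count() are declared in <linux/mm.h>,
 * page_folio() in <linux/page-flags.h>.
 */
static inline void folio_dump_example(struct page *page)
{
	struct folio *folio = page_folio(page);	/* head folio for any page */

	pr_debug("order=%u nr_pages=%ld mapcount=%d refs=%d\n",
		 folio_order(folio), folio_nr_pages(folio),
		 folio_mapcount(folio), folio_ref_count(folio));
}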
7b230db3 MWO |
369 | #define FOLIO_MATCH(pg, fl) \ |
370 | static_assert(offsetof(struct page, pg) == offsetof(struct folio, fl)) | |
371 | FOLIO_MATCH(flags, flags); | |
372 | FOLIO_MATCH(lru, lru); | |
536f4217 | 373 | FOLIO_MATCH(mapping, mapping); |
7b230db3 MWO |
374 | FOLIO_MATCH(compound_head, lru); |
375 | FOLIO_MATCH(index, index); | |
376 | FOLIO_MATCH(private, private); | |
377 | FOLIO_MATCH(_mapcount, _mapcount); | |
378 | FOLIO_MATCH(_refcount, _refcount); | |
379 | #ifdef CONFIG_MEMCG | |
380 | FOLIO_MATCH(memcg_data, memcg_data); | |
381 | #endif | |
382 | #undef FOLIO_MATCH | |
379708ff MWO |
383 | #define FOLIO_MATCH(pg, fl) \ |
384 | static_assert(offsetof(struct folio, fl) == \ | |
385 | offsetof(struct page, pg) + sizeof(struct page)) | |
386 | FOLIO_MATCH(flags, _flags_1); | |
dad6a5eb | 387 | FOLIO_MATCH(compound_head, _head_1); |
379708ff | 388 | #undef FOLIO_MATCH |
dad6a5eb HD |
389 | #define FOLIO_MATCH(pg, fl) \ |
390 | static_assert(offsetof(struct folio, fl) == \ | |
391 | offsetof(struct page, pg) + 2 * sizeof(struct page)) | |
392 | FOLIO_MATCH(flags, _flags_2); | |
393 | FOLIO_MATCH(compound_head, _head_2); | |
dad6a5eb | 394 | #undef FOLIO_MATCH |
7b230db3 | 395 | |
d1402fc7 LG |
396 | /* |
397 | * Used for sizing the vmemmap region on some architectures | |
398 | */ | |
399 | #define STRUCT_PAGE_MAX_SHIFT (order_base_2(sizeof(struct page))) | |
400 | ||
b63ae8ca AD |
401 | #define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) |
402 | #define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) | |
403 | ||
85d0a2ed MWO |
404 | /* |
405 | * page_private can be used on tail pages. However, PagePrivate is only | |
406 | * checked by the VM on the head page. So page_private on the tail pages | |
407 | * should be used for data that's ancillary to the head page (eg attaching | |
408 | * buffer heads to tail pages after attaching buffer heads to the head page) | |
409 | */ | |
b03641af | 410 | #define page_private(page) ((page)->private) |
60e65a6f GJ |
411 | |
412 | static inline void set_page_private(struct page *page, unsigned long private) | |
413 | { | |
414 | page->private = private; | |
415 | } | |
b03641af | 416 | |
85d0a2ed MWO |
417 | static inline void *folio_get_private(struct folio *folio) |
418 | { | |
419 | return folio->private; | |
420 | } | |
421 | ||
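/*
 * Illustrative sketch (editor's addition): filesystems normally manage the
 * @private field through folio_attach_private()/folio_detach_private() from
 * <linux/pagemap.h>, which also set PG_private and take/drop a folio
 * reference; folio_get_private() above is the read side. 'struct my_fs_state'
 * is hypothetical.
 */
struct my_fs_state { unsigned long bits; };

static inline void my_fs_attach(struct folio *folio, struct my_fs_state *state)
{
	folio_attach_private(folio, state);
}

static inline struct my_fs_state *my_fs_state_of(struct folio *folio)
{
	return folio_get_private(folio);
}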
b63ae8ca AD |
422 | struct page_frag_cache { |
423 | void * va; | |
424 | #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) | |
425 | __u16 offset; | |
426 | __u16 size; | |
427 | #else | |
428 | __u32 offset; | |
429 | #endif | |
430 | /* we maintain a pagecount bias, so that we don't dirty the cache line
0139aa7b | 431 | * containing page->_refcount every time we allocate a fragment. |
b63ae8ca AD |
432 | */ |
433 | unsigned int pagecnt_bias; | |
434 | bool pfmemalloc; | |
435 | }; | |
436 | ||
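/*
 * Illustrative sketch (editor's addition): a page_frag_cache is typically a
 * zero-initialised per-cpu or per-device object; fragments are carved out with
 * page_frag_alloc() and released with page_frag_free(), both declared in
 * <linux/gfp.h>. The pagecnt_bias trick above is what lets the allocation
 * fast path avoid touching page->_refcount.
 */
static struct page_frag_cache my_frag_cache;	/* all-zero == empty cache */

static inline void *my_frag_alloc(unsigned int size)
{
	return page_frag_alloc(&my_frag_cache, size, GFP_ATOMIC);
}

static inline void my_frag_free(void *data)
{
	page_frag_free(data);
}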
64b990d2 | 437 | typedef unsigned long vm_flags_t; |
ca16d140 | 438 | |
8feae131 DH |
439 | /* |
440 | * A region containing a mapping of a non-memory backed file under NOMMU | |
441 | * conditions. These are held in a global tree and are pinned by the VMAs that | |
442 | * map parts of them. | |
443 | */ | |
444 | struct vm_region { | |
445 | struct rb_node vm_rb; /* link in global region tree */ | |
ca16d140 | 446 | vm_flags_t vm_flags; /* VMA vm_flags */ |
8feae131 DH |
447 | unsigned long vm_start; /* start address of region */ |
448 | unsigned long vm_end; /* region initialised to here */ | |
dd8632a1 | 449 | unsigned long vm_top; /* region allocated to here */ |
8feae131 DH |
450 | unsigned long vm_pgoff; /* the offset in vm_file corresponding to vm_start */ |
451 | struct file *vm_file; /* the backing file or NULL */ | |
452 | ||
1e2ae599 | 453 | int vm_usage; /* region usage count (access under nommu_region_sem) */ |
cfe79c00 MF |
454 | bool vm_icache_flushed : 1; /* true if the icache has been flushed for |
455 | * this region */ | |
8feae131 DH |
456 | }; |
457 | ||
745f234b AA |
458 | #ifdef CONFIG_USERFAULTFD |
459 | #define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, }) | |
460 | struct vm_userfaultfd_ctx { | |
461 | struct userfaultfd_ctx *ctx; | |
462 | }; | |
463 | #else /* CONFIG_USERFAULTFD */ | |
464 | #define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {}) | |
465 | struct vm_userfaultfd_ctx {}; | |
466 | #endif /* CONFIG_USERFAULTFD */ | |
467 | ||
78db3412 SB |
468 | struct anon_vma_name { |
469 | struct kref kref; | |
470 | /* The name needs to be at the end because it is dynamically sized. */ | |
471 | char name[]; | |
472 | }; | |
473 | ||
c7f8f31c SB |
474 | struct vma_lock { |
475 | struct rw_semaphore lock; | |
476 | }; | |
477 | ||
ef6a22b7 MG |
478 | struct vma_numab_state { |
479 | unsigned long next_scan; | |
20f58648 R |
480 | unsigned long next_pid_reset; |
481 | unsigned long access_pids[2]; | |
ef6a22b7 MG |
482 | }; |
483 | ||
c92ff1bd | 484 | /* |
552657b7 | 485 | * This struct describes a virtual memory area. There is one of these |
486 | * per VM-area/task. A VM area is any part of the process virtual memory | |
c92ff1bd MS |
487 | * space that has a special rule for the page-fault handlers (ie a shared |
488 | * library, the executable area etc). | |
489 | */ | |
490 | struct vm_area_struct { | |
e4c6bfd2 RR |
491 | /* The first cache line has the info for VMA tree walking. */ |
492 | ||
20cce633 ML |
493 | union { |
494 | struct { | |
495 | /* VMA covers [vm_start; vm_end) addresses within mm */ | |
496 | unsigned long vm_start; | |
497 | unsigned long vm_end; | |
498 | }; | |
499 | #ifdef CONFIG_PER_VMA_LOCK | |
500 | struct rcu_head vm_rcu; /* Used for deferred freeing. */ | |
501 | #endif | |
502 | }; | |
c92ff1bd | 503 | |
e4c6bfd2 | 504 | struct mm_struct *vm_mm; /* The address space we belong to. */ |
28d8b812 | 505 | pgprot_t vm_page_prot; /* Access permissions of this VMA. */ |
bc292ab0 SB |
506 | |
507 | /* | |
508 | * Flags, see mm.h. | |
509 | * To modify use vm_flags_{init|reset|set|clear|mod} functions. | |
510 | */ | |
511 | union { | |
512 | const vm_flags_t vm_flags; | |
513 | vm_flags_t __private __vm_flags; | |
514 | }; | |
e4c6bfd2 | 515 | |
5e31275c SB |
516 | #ifdef CONFIG_PER_VMA_LOCK |
517 | int vm_lock_seq; | |
c7f8f31c | 518 | struct vma_lock *vm_lock; |
457f67be SB |
519 | |
520 | /* Flag to indicate areas detached from the mm->mm_mt tree */ | |
521 | bool detached; | |
5e31275c SB |
522 | #endif |
523 | ||
c92ff1bd MS |
524 | /* |
525 | * For areas with an address space and backing store, | |
27ba0644 | 526 | * linkage into the address_space->i_mmap interval tree. |
9a10064f | 527 | * |
c92ff1bd | 528 | */ |
d09e8ca6 PT |
529 | struct { |
530 | struct rb_node rb; | |
531 | unsigned long rb_subtree_last; | |
532 | } shared; | |
c92ff1bd MS |
533 | |
534 | /* | |
535 | * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma | |
536 | * list, after a COW of one of the file pages. A MAP_SHARED vma | |
537 | * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack | |
538 | * or brk vma (with NULL file) can only be in an anon_vma list. | |
539 | */ | |
c1e8d7c6 | 540 | struct list_head anon_vma_chain; /* Serialized by mmap_lock & |
5beb4930 | 541 | * page_table_lock */ |
c92ff1bd MS |
542 | struct anon_vma *anon_vma; /* Serialized by page_table_lock */ |
543 | ||
544 | /* Function pointers to deal with this struct. */ | |
f0f37e2f | 545 | const struct vm_operations_struct *vm_ops; |
c92ff1bd MS |
546 | |
547 | /* Information about our backing store: */ | |
548 | unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE | |
ea1754a0 | 549 | units */ |
c92ff1bd MS |
550 | struct file * vm_file; /* File we map to (can be NULL). */ |
551 | void * vm_private_data; /* was vm_pte (shared mem) */ | |
c92ff1bd | 552 | |
d09e8ca6 PT |
553 | #ifdef CONFIG_ANON_VMA_NAME |
554 | /* | |
555 | * For private and shared anonymous mappings, a pointer to a null | |
556 | * terminated string containing the name given to the vma, or NULL if | |
8651a137 | 557 | * unnamed. Serialized by mmap_lock. Use anon_vma_name to access. |
d09e8ca6 PT |
558 | */ |
559 | struct anon_vma_name *anon_name; | |
560 | #endif | |
219f8a2e | 561 | #ifdef CONFIG_SWAP |
ec560175 | 562 | atomic_long_t swap_readahead_info; |
219f8a2e | 563 | #endif |
c92ff1bd | 564 | #ifndef CONFIG_MMU |
8feae131 | 565 | struct vm_region *vm_region; /* NOMMU mapping region */ |
c92ff1bd MS |
566 | #endif |
567 | #ifdef CONFIG_NUMA | |
568 | struct mempolicy *vm_policy; /* NUMA policy for the VMA */ | |
ef6a22b7 MG |
569 | #endif |
570 | #ifdef CONFIG_NUMA_BALANCING | |
571 | struct vma_numab_state *numab_state; /* NUMA Balancing state */ | |
c92ff1bd | 572 | #endif |
745f234b | 573 | struct vm_userfaultfd_ctx vm_userfaultfd_ctx; |
3859a271 | 574 | } __randomize_layout; |
c92ff1bd | 575 | |
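/*
 * Illustrative sketch (editor's addition): finding the VMA that covers an
 * address and reading the fields above. vma_lookup() and the VM_* flag bits
 * are declared in <linux/mm.h>, mmap_read_lock()/mmap_read_unlock() in
 * <linux/mmap_lock.h>; the VMA is only stable while mmap_lock is held.
 */
static inline bool my_addr_is_writable(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma;
	bool writable = false;

	mmap_read_lock(mm);
	vma = vma_lookup(mm, addr);	/* NULL if no VMA contains addr */
	if (vma)
		writable = !!(vma->vm_flags & VM_WRITE);
	mmap_read_unlock(mm);
	return writable;
}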
223baf9d MD |
576 | #ifdef CONFIG_SCHED_MM_CID |
577 | struct mm_cid { | |
578 | u64 time; | |
579 | int cid; | |
580 | }; | |
581 | #endif | |
582 | ||
db446a08 | 583 | struct kioctx_table; |
c92ff1bd | 584 | struct mm_struct { |
c1a2f7f0 | 585 | struct { |
d4af56c5 | 586 | struct maple_tree mm_mt; |
efc1a3b1 | 587 | #ifdef CONFIG_MMU |
c1a2f7f0 | 588 | unsigned long (*get_unmapped_area) (struct file *filp, |
c92ff1bd MS |
589 | unsigned long addr, unsigned long len, |
590 | unsigned long pgoff, unsigned long flags); | |
efc1a3b1 | 591 | #endif |
c1a2f7f0 RR |
592 | unsigned long mmap_base; /* base of mmap area */ |
593 | unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */ | |
1b028f78 | 594 | #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES |
041711ce | 595 | /* Base addresses for compatible mmap() */ |
c1a2f7f0 RR |
596 | unsigned long mmap_compat_base; |
597 | unsigned long mmap_compat_legacy_base; | |
1b028f78 | 598 | #endif |
c1a2f7f0 | 599 | unsigned long task_size; /* size of task vm space */ |
c1a2f7f0 RR |
600 | pgd_t * pgd; |
601 | ||
227a4aad MD |
602 | #ifdef CONFIG_MEMBARRIER |
603 | /** | |
604 | * @membarrier_state: Flags controlling membarrier behavior. | |
605 | * | |
606 | * This field is close to @pgd to hopefully fit in the same | |
607 | * cache-line, which needs to be touched by switch_mm(). | |
608 | */ | |
609 | atomic_t membarrier_state; | |
610 | #endif | |
611 | ||
c1a2f7f0 RR |
612 | /** |
613 | * @mm_users: The number of users including userspace. | |
614 | * | |
615 | * Use mmget()/mmget_not_zero()/mmput() to modify. When this | |
616 | * drops to 0 (i.e. when the task exits and there are no other | |
617 | * temporary reference holders), we also release a reference on | |
618 | * @mm_count (which may then free the &struct mm_struct if | |
619 | * @mm_count also drops to 0). | |
620 | */ | |
621 | atomic_t mm_users; | |
622 | ||
623 | /** | |
624 | * @mm_count: The number of references to &struct mm_struct | |
625 | * (@mm_users count as 1). | |
626 | * | |
627 | * Use mmgrab()/mmdrop() to modify. When this drops to 0, the | |
628 | * &struct mm_struct is freed. | |
629 | */ | |
630 | atomic_t mm_count; | |
af7f588d MD |
631 | #ifdef CONFIG_SCHED_MM_CID |
632 | /** | |
223baf9d | 633 | * @pcpu_cid: Per-cpu current cid. |
af7f588d | 634 | * |
223baf9d MD |
635 | * Keep track of the currently allocated mm_cid for each cpu. |
636 | * The per-cpu mm_cid values are serialized by their respective | |
637 | * runqueue locks. | |
af7f588d | 638 | */ |
223baf9d MD |
639 | struct mm_cid __percpu *pcpu_cid; |
640 | /* | |
641 | * @mm_cid_next_scan: Next mm_cid scan (in jiffies). | |
642 | * | |
643 | * When the next mm_cid scan is due (in jiffies). | |
644 | */ | |
645 | unsigned long mm_cid_next_scan; | |
af7f588d | 646 | #endif |
c4812909 | 647 | #ifdef CONFIG_MMU |
3783e172 | 648 | atomic_long_t pgtables_bytes; /* size of all page tables */ |
5a3fbef3 | 649 | #endif |
c1a2f7f0 | 650 | int map_count; /* number of VMAs */ |
481b4bb5 | 651 | |
c1a2f7f0 RR |
652 | spinlock_t page_table_lock; /* Protects page tables and some |
653 | * counters | |
654 | */ | |
2e302543 FT |
655 | /* |
656 | * With some kernel config, the current mmap_lock's offset | |
657 | * inside 'mm_struct' is at 0x120, which is very optimal, as | |
658 | * its two hot fields 'count' and 'owner' sit in 2 different | |
659 | * cachelines, and when mmap_lock is highly contended, both | |
660 | * of the 2 fields will be accessed frequently, current layout | |
661 | * will help to reduce cache bouncing. | |
662 | * | |
663 | * So please be careful with adding new fields before | |
664 | * mmap_lock, which can easily push the 2 fields into one | |
665 | * cacheline. | |
666 | */ | |
da1c55f1 | 667 | struct rw_semaphore mmap_lock; |
c92ff1bd | 668 | |
c1a2f7f0 RR |
669 | struct list_head mmlist; /* List of maybe swapped mm's. These |
670 | * are globally strung together off | |
671 | * init_mm.mmlist, and are protected | |
672 | * by mmlist_lock | |
673 | */ | |
5e31275c SB |
674 | #ifdef CONFIG_PER_VMA_LOCK |
675 | int mm_lock_seq; | |
676 | #endif | |
c92ff1bd | 677 | |
c92ff1bd | 678 | |
c1a2f7f0 RR |
679 | unsigned long hiwater_rss; /* High-watermark of RSS usage */ |
680 | unsigned long hiwater_vm; /* High-water virtual memory usage */ | |
c92ff1bd | 681 | |
c1a2f7f0 RR |
682 | unsigned long total_vm; /* Total pages mapped */ |
683 | unsigned long locked_vm; /* Pages that have PG_mlocked set */ | |
70f8a3ca | 684 | atomic64_t pinned_vm; /* Refcount permanently increased */ |
c1a2f7f0 RR |
685 | unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ |
686 | unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ | |
687 | unsigned long stack_vm; /* VM_STACK */ | |
688 | unsigned long def_flags; | |
88aa7cc6 | 689 | |
2e302543 FT |
690 | /** |
691 | * @write_protect_seq: Locked when any thread is write | |
692 | * protecting pages mapped by this mm to enforce a later COW, | |
693 | * for instance during page table copying for fork(). | |
694 | */ | |
695 | seqcount_t write_protect_seq; | |
696 | ||
c1a2f7f0 | 697 | spinlock_t arg_lock; /* protect the below fields */ |
2e302543 | 698 | |
c1a2f7f0 RR |
699 | unsigned long start_code, end_code, start_data, end_data; |
700 | unsigned long start_brk, brk, start_stack; | |
701 | unsigned long arg_start, arg_end, env_start, env_end; | |
c92ff1bd | 702 | |
c1a2f7f0 | 703 | unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ |
c92ff1bd | 704 | |
f1a79412 | 705 | struct percpu_counter rss_stat[NR_MM_COUNTERS]; |
801460d0 | 706 | |
c1a2f7f0 | 707 | struct linux_binfmt *binfmt; |
6345d24d | 708 | |
c1a2f7f0 RR |
709 | /* Architecture-specific MM context */ |
710 | mm_context_t context; | |
c92ff1bd | 711 | |
c1a2f7f0 | 712 | unsigned long flags; /* Must use atomic bitops to access */ |
c92ff1bd | 713 | |
858f0993 | 714 | #ifdef CONFIG_AIO |
c1a2f7f0 RR |
715 | spinlock_t ioctx_lock; |
716 | struct kioctx_table __rcu *ioctx_table; | |
858f0993 | 717 | #endif |
f98bafa0 | 718 | #ifdef CONFIG_MEMCG |
c1a2f7f0 RR |
719 | /* |
720 | * "owner" points to a task that is regarded as the canonical | |
721 | * user/owner of this mm. All of the following must be true in | |
722 | * order for it to be changed: | |
723 | * | |
724 | * current == mm->owner | |
725 | * current->mm != mm | |
726 | * new_owner->mm == mm | |
727 | * new_owner->alloc_lock is held | |
728 | */ | |
729 | struct task_struct __rcu *owner; | |
78fb7466 | 730 | #endif |
c1a2f7f0 | 731 | struct user_namespace *user_ns; |
925d1c40 | 732 | |
c1a2f7f0 RR |
733 | /* store ref to file /proc/<pid>/exe symlink points to */ |
734 | struct file __rcu *exe_file; | |
cddb8a5c | 735 | #ifdef CONFIG_MMU_NOTIFIER |
984cfe4e | 736 | struct mmu_notifier_subscriptions *notifier_subscriptions; |
e7a00c45 | 737 | #endif |
e009bb30 | 738 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS |
c1a2f7f0 | 739 | pgtable_t pmd_huge_pte; /* protected by page_table_lock */ |
cbee9f88 PZ |
740 | #endif |
741 | #ifdef CONFIG_NUMA_BALANCING | |
c1a2f7f0 | 742 | /* |
7014887a DH |
743 | * numa_next_scan is the next time that PTEs will be remapped |
744 | * PROT_NONE to trigger NUMA hinting faults; such faults gather | |
745 | * statistics and migrate pages to new nodes if necessary. | |
c1a2f7f0 RR |
746 | */ |
747 | unsigned long numa_next_scan; | |
cbee9f88 | 748 | |
7014887a | 749 | /* Restart point for scanning and remapping PTEs. */ |
c1a2f7f0 | 750 | unsigned long numa_scan_offset; |
6e5fb223 | 751 | |
7014887a | 752 | /* numa_scan_seq prevents two threads remapping PTEs. */ |
c1a2f7f0 | 753 | int numa_scan_seq; |
20841405 | 754 | #endif |
c1a2f7f0 RR |
755 | /* |
756 | * An operation with batched TLB flushing is going on. Anything | |
757 | * that can move process memory needs to flush the TLB when | |
7014887a | 758 | * moving a PROT_NONE mapped page. |
c1a2f7f0 RR |
759 | */ |
760 | atomic_t tlb_flush_pending; | |
3ea27719 | 761 | #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH |
c1a2f7f0 | 762 | /* See flush_tlb_batched_pending() */ |
5ee2fa2f | 763 | atomic_t tlb_flush_batched; |
6345d24d | 764 | #endif |
c1a2f7f0 | 765 | struct uprobes_state uprobes_state; |
8d491de6 TG |
766 | #ifdef CONFIG_PREEMPT_RT |
767 | struct rcu_head delayed_drop; | |
768 | #endif | |
5d317b2b | 769 | #ifdef CONFIG_HUGETLB_PAGE |
c1a2f7f0 | 770 | atomic_long_t hugetlb_usage; |
5d317b2b | 771 | #endif |
c1a2f7f0 | 772 | struct work_struct async_put_work; |
52ad9bc6 | 773 | |
7a853c2d | 774 | #ifdef CONFIG_IOMMU_SVA |
52ad9bc6 | 775 | u32 pasid; |
76093853 | 776 | #endif |
777 | #ifdef CONFIG_KSM | |
778 | /* | |
779 | * Represent how many pages of this process are involved in KSM | |
780 | * merging. | |
781 | */ | |
782 | unsigned long ksm_merging_pages; | |
cb4df4ca | 783 | /* |
784 | * Represent how many pages are checked for ksm merging | |
785 | * including merged and not merged. | |
786 | */ | |
787 | unsigned long ksm_rmap_items; | |
52ad9bc6 | 788 | #endif |
bd74fdae YZ |
789 | #ifdef CONFIG_LRU_GEN |
790 | struct { | |
791 | /* this mm_struct is on lru_gen_mm_list */ | |
792 | struct list_head list; | |
793 | /* | |
794 | * Set when switching to this mm_struct, as a hint of | |
795 | * whether it has been used since the last time per-node | |
796 | * page table walkers cleared the corresponding bits. | |
797 | */ | |
798 | unsigned long bitmap; | |
799 | #ifdef CONFIG_MEMCG | |
800 | /* points to the memcg of "owner" above */ | |
801 | struct mem_cgroup *memcg; | |
802 | #endif | |
803 | } lru_gen; | |
804 | #endif /* CONFIG_LRU_GEN */ | |
c1a2f7f0 RR |
805 | } __randomize_layout; |
806 | ||
807 | /* | |
808 | * The mm_cpumask needs to be at the end of mm_struct, because it | |
809 | * is dynamically sized based on nr_cpu_ids. | |
810 | */ | |
811 | unsigned long cpu_bitmap[]; | |
812 | }; | |
c92ff1bd | 813 | |
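/*
 * Illustrative sketch (editor's addition): the two reference counters
 * documented at @mm_users and @mm_count above are only ever touched through
 * the helpers from <linux/sched/mm.h>. Typical patterns:
 */
static inline void my_use_foreign_mm(struct mm_struct *mm)
{
	if (!mmget_not_zero(mm))	/* pin @mm_users unless it already hit 0 */
		return;
	/* ... the VMAs and page tables of @mm may be used here ... */
	mmput(mm);			/* drop @mm_users; mappings may be torn down */
}

static inline void my_keep_mm_struct(struct mm_struct *mm)
{
	mmgrab(mm);	/* pin @mm_count: the struct stays, the mappings may not */
	/* ... */
	mmdrop(mm);
}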
3dd44325 LH |
814 | #define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | \ |
815 | MT_FLAGS_USE_RCU) | |
abe722a1 IM |
816 | extern struct mm_struct init_mm; |
817 | ||
c1a2f7f0 | 818 | /* Pointer magic because the dynamic array size confuses some compilers. */ |
6345d24d LT |
819 | static inline void mm_init_cpumask(struct mm_struct *mm) |
820 | { | |
c1a2f7f0 RR |
821 | unsigned long cpu_bitmap = (unsigned long)mm; |
822 | ||
823 | cpu_bitmap += offsetof(struct mm_struct, cpu_bitmap); | |
824 | cpumask_clear((struct cpumask *)cpu_bitmap); | |
6345d24d LT |
825 | } |
826 | ||
45e575ab | 827 | /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */ |
de03c72c KM |
828 | static inline cpumask_t *mm_cpumask(struct mm_struct *mm) |
829 | { | |
c1a2f7f0 | 830 | return (struct cpumask *)&mm->cpu_bitmap; |
de03c72c | 831 | } |
45e575ab | 832 | |
bd74fdae YZ |
833 | #ifdef CONFIG_LRU_GEN |
834 | ||
835 | struct lru_gen_mm_list { | |
836 | /* mm_struct list for page table walkers */ | |
837 | struct list_head fifo; | |
838 | /* protects the list above */ | |
839 | spinlock_t lock; | |
840 | }; | |
841 | ||
842 | void lru_gen_add_mm(struct mm_struct *mm); | |
843 | void lru_gen_del_mm(struct mm_struct *mm); | |
844 | #ifdef CONFIG_MEMCG | |
845 | void lru_gen_migrate_mm(struct mm_struct *mm); | |
846 | #endif | |
847 | ||
848 | static inline void lru_gen_init_mm(struct mm_struct *mm) | |
849 | { | |
850 | INIT_LIST_HEAD(&mm->lru_gen.list); | |
851 | mm->lru_gen.bitmap = 0; | |
852 | #ifdef CONFIG_MEMCG | |
853 | mm->lru_gen.memcg = NULL; | |
854 | #endif | |
855 | } | |
856 | ||
857 | static inline void lru_gen_use_mm(struct mm_struct *mm) | |
858 | { | |
859 | /* | |
860 | * When the bitmap is set, page reclaim knows this mm_struct has been | |
861 | * used since the last time it cleared the bitmap. So it might be worth | |
862 | * walking the page tables of this mm_struct to clear the accessed bit. | |
863 | */ | |
864 | WRITE_ONCE(mm->lru_gen.bitmap, -1); | |
865 | } | |
866 | ||
867 | #else /* !CONFIG_LRU_GEN */ | |
868 | ||
869 | static inline void lru_gen_add_mm(struct mm_struct *mm) | |
870 | { | |
871 | } | |
872 | ||
873 | static inline void lru_gen_del_mm(struct mm_struct *mm) | |
874 | { | |
875 | } | |
876 | ||
877 | #ifdef CONFIG_MEMCG | |
878 | static inline void lru_gen_migrate_mm(struct mm_struct *mm) | |
879 | { | |
880 | } | |
881 | #endif | |
882 | ||
883 | static inline void lru_gen_init_mm(struct mm_struct *mm) | |
884 | { | |
885 | } | |
886 | ||
887 | static inline void lru_gen_use_mm(struct mm_struct *mm) | |
888 | { | |
889 | } | |
890 | ||
891 | #endif /* CONFIG_LRU_GEN */ | |
892 | ||
f39af059 MWO |
893 | struct vma_iterator { |
894 | struct ma_state mas; | |
895 | }; | |
896 | ||
897 | #define VMA_ITERATOR(name, __mm, __addr) \ | |
898 | struct vma_iterator name = { \ | |
899 | .mas = { \ | |
900 | .tree = &(__mm)->mm_mt, \ | |
901 | .index = __addr, \ | |
902 | .node = MAS_START, \ | |
903 | }, \ | |
904 | } | |
905 | ||
906 | static inline void vma_iter_init(struct vma_iterator *vmi, | |
907 | struct mm_struct *mm, unsigned long addr) | |
908 | { | |
b62b633e | 909 | mas_init(&vmi->mas, &mm->mm_mt, addr); |
f39af059 MWO |
910 | } |
911 | ||
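/*
 * Illustrative sketch (editor's addition): walking all VMAs of an mm with the
 * iterator above. for_each_vma() is declared in <linux/mm.h> and the caller
 * must hold mmap_lock (here for read).
 */
static inline unsigned long my_count_vmas(struct mm_struct *mm)
{
	VMA_ITERATOR(vmi, mm, 0);
	struct vm_area_struct *vma;
	unsigned long nr = 0;

	mmap_read_lock(mm);
	for_each_vma(vmi, vma)
		nr++;
	mmap_read_unlock(mm);
	return nr;
}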
af7f588d | 912 | #ifdef CONFIG_SCHED_MM_CID |
223baf9d MD |
913 | |
914 | enum mm_cid_state { | |
915 | MM_CID_UNSET = -1U, /* Unset state has lazy_put flag set. */ | |
916 | MM_CID_LAZY_PUT = (1U << 31), | |
917 | }; | |
918 | ||
919 | static inline bool mm_cid_is_unset(int cid) | |
920 | { | |
921 | return cid == MM_CID_UNSET; | |
922 | } | |
923 | ||
924 | static inline bool mm_cid_is_lazy_put(int cid) | |
925 | { | |
926 | return !mm_cid_is_unset(cid) && (cid & MM_CID_LAZY_PUT); | |
927 | } | |
928 | ||
929 | static inline bool mm_cid_is_valid(int cid) | |
930 | { | |
931 | return !(cid & MM_CID_LAZY_PUT); | |
932 | } | |
933 | ||
934 | static inline int mm_cid_set_lazy_put(int cid) | |
935 | { | |
936 | return cid | MM_CID_LAZY_PUT; | |
937 | } | |
938 | ||
939 | static inline int mm_cid_clear_lazy_put(int cid) | |
940 | { | |
941 | return cid & ~MM_CID_LAZY_PUT; | |
942 | } | |
943 | ||
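/*
 * Illustrative sketch (editor's addition): MM_CID_LAZY_PUT is ORed into an
 * otherwise valid cid, so a value is either unset, valid, or valid-but-marked
 * for lazy release, and the helpers above distinguish the three.
 */
static inline void mm_cid_state_example(void)
{
	int cid = 3;				/* a valid cid */
	int lazy = mm_cid_set_lazy_put(cid);	/* same cid, tagged for lazy put */

	WARN_ON(!mm_cid_is_valid(cid));
	WARN_ON(!mm_cid_is_lazy_put(lazy));
	WARN_ON(mm_cid_clear_lazy_put(lazy) != cid);
	WARN_ON(!mm_cid_is_unset(MM_CID_UNSET));
}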
af7f588d MD |
944 | /* Accessor for struct mm_struct's cidmask. */ |
945 | static inline cpumask_t *mm_cidmask(struct mm_struct *mm) | |
946 | { | |
947 | unsigned long cid_bitmap = (unsigned long)mm; | |
948 | ||
949 | cid_bitmap += offsetof(struct mm_struct, cpu_bitmap); | |
950 | /* Skip cpu_bitmap */ | |
951 | cid_bitmap += cpumask_size(); | |
952 | return (struct cpumask *)cid_bitmap; | |
953 | } | |
954 | ||
955 | static inline void mm_init_cid(struct mm_struct *mm) | |
956 | { | |
223baf9d MD |
957 | int i; |
958 | ||
959 | for_each_possible_cpu(i) { | |
960 | struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i); | |
961 | ||
962 | pcpu_cid->cid = MM_CID_UNSET; | |
963 | pcpu_cid->time = 0; | |
964 | } | |
af7f588d MD |
965 | cpumask_clear(mm_cidmask(mm)); |
966 | } | |
967 | ||
223baf9d MD |
968 | static inline int mm_alloc_cid(struct mm_struct *mm) |
969 | { | |
970 | mm->pcpu_cid = alloc_percpu(struct mm_cid); | |
971 | if (!mm->pcpu_cid) | |
972 | return -ENOMEM; | |
973 | mm_init_cid(mm); | |
974 | return 0; | |
975 | } | |
976 | ||
977 | static inline void mm_destroy_cid(struct mm_struct *mm) | |
978 | { | |
979 | free_percpu(mm->pcpu_cid); | |
980 | mm->pcpu_cid = NULL; | |
981 | } | |
982 | ||
af7f588d MD |
983 | static inline unsigned int mm_cid_size(void) |
984 | { | |
985 | return cpumask_size(); | |
986 | } | |
987 | #else /* CONFIG_SCHED_MM_CID */ | |
988 | static inline void mm_init_cid(struct mm_struct *mm) { } | |
223baf9d MD |
989 | static inline int mm_alloc_cid(struct mm_struct *mm) { return 0; } |
990 | static inline void mm_destroy_cid(struct mm_struct *mm) { } | |
af7f588d MD |
991 | static inline unsigned int mm_cid_size(void) |
992 | { | |
993 | return 0; | |
994 | } | |
995 | #endif /* CONFIG_SCHED_MM_CID */ | |
996 | ||
56236a59 | 997 | struct mmu_gather; |
a72afd87 | 998 | extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); |
d8b45053 | 999 | extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); |
ae8eba8b | 1000 | extern void tlb_finish_mmu(struct mmu_gather *tlb); |
56236a59 | 1001 | |
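/*
 * Illustrative sketch (editor's addition): unmap paths bracket their work with
 * the mmu_gather API declared above so that TLB flushing and page freeing can
 * be batched. The actual zapping in the middle is elided; the full struct
 * mmu_gather definition comes from <asm/tlb.h>.
 */
static inline void my_unmap_example(struct mm_struct *mm)
{
	struct mmu_gather tlb;

	tlb_gather_mmu(&tlb, mm);
	/* ... zap page table entries, feeding the freed pages into &tlb ... */
	tlb_finish_mmu(&tlb);	/* flush the TLB and free the batched pages */
}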
f872f540 AL |
1002 | struct vm_fault; |
1003 | ||
3d353901 SJ |
1004 | /** |
1005 | * typedef vm_fault_t - Return type for page fault handlers. | |
1006 | * | |
1007 | * Page fault handlers return a bitmask of %VM_FAULT values. | |
1008 | */ | |
1009 | typedef __bitwise unsigned int vm_fault_t; | |
1010 | ||
1011 | /** | |
1012 | * enum vm_fault_reason - Page fault handlers return a bitmask of | |
1013 | * these values to tell the core VM what happened when handling the | |
1014 | * fault. Used to decide whether a process gets delivered SIGBUS or | |
1015 | * just gets major/minor fault counters bumped up. | |
1016 | * | |
1017 | * @VM_FAULT_OOM: Out Of Memory | |
1018 | * @VM_FAULT_SIGBUS: Bad access | |
1019 | * @VM_FAULT_MAJOR: Page read from storage | |
3d353901 SJ |
1020 | * @VM_FAULT_HWPOISON: Hit poisoned small page |
1021 | * @VM_FAULT_HWPOISON_LARGE: Hit poisoned large page. Index encoded | |
1022 | * in upper bits | |
1023 | * @VM_FAULT_SIGSEGV: segmentation fault | |
1024 | * @VM_FAULT_NOPAGE: ->fault installed the pte, not return page | |
1025 | * @VM_FAULT_LOCKED: ->fault locked the returned page | |
1026 | * @VM_FAULT_RETRY: ->fault blocked, must retry | |
1027 | * @VM_FAULT_FALLBACK: huge page fault failed, fall back to small | |
1028 | * @VM_FAULT_DONE_COW: ->fault has fully handled COW | |
1029 | * @VM_FAULT_NEEDDSYNC: ->fault did not modify page tables and needs | |
1030 | * fsync() to complete (for synchronous page faults | |
1031 | * in DAX) | |
d9272525 | 1032 | * @VM_FAULT_COMPLETED: ->fault completed, meanwhile mmap lock released |
3d353901 SJ |
1033 | * @VM_FAULT_HINDEX_MASK: mask HINDEX value |
1034 | * | |
1035 | */ | |
1036 | enum vm_fault_reason { | |
1037 | VM_FAULT_OOM = (__force vm_fault_t)0x000001, | |
1038 | VM_FAULT_SIGBUS = (__force vm_fault_t)0x000002, | |
1039 | VM_FAULT_MAJOR = (__force vm_fault_t)0x000004, | |
3d353901 SJ |
1040 | VM_FAULT_HWPOISON = (__force vm_fault_t)0x000010, |
1041 | VM_FAULT_HWPOISON_LARGE = (__force vm_fault_t)0x000020, | |
1042 | VM_FAULT_SIGSEGV = (__force vm_fault_t)0x000040, | |
1043 | VM_FAULT_NOPAGE = (__force vm_fault_t)0x000100, | |
1044 | VM_FAULT_LOCKED = (__force vm_fault_t)0x000200, | |
1045 | VM_FAULT_RETRY = (__force vm_fault_t)0x000400, | |
1046 | VM_FAULT_FALLBACK = (__force vm_fault_t)0x000800, | |
1047 | VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000, | |
1048 | VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000, | |
d9272525 | 1049 | VM_FAULT_COMPLETED = (__force vm_fault_t)0x004000, |
3d353901 SJ |
1050 | VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000, |
1051 | }; | |
1052 | ||
1053 | /* Encode hstate index for a hwpoisoned large page */ | |
1054 | #define VM_FAULT_SET_HINDEX(x) ((__force vm_fault_t)((x) << 16)) | |
fcae96ff | 1055 | #define VM_FAULT_GET_HINDEX(x) (((__force unsigned int)(x) >> 16) & 0xf) |
3d353901 SJ |
1056 | |
1057 | #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | \ | |
1058 | VM_FAULT_SIGSEGV | VM_FAULT_HWPOISON | \ | |
1059 | VM_FAULT_HWPOISON_LARGE | VM_FAULT_FALLBACK) | |
1060 | ||
1061 | #define VM_FAULT_RESULT_TRACE \ | |
1062 | { VM_FAULT_OOM, "OOM" }, \ | |
1063 | { VM_FAULT_SIGBUS, "SIGBUS" }, \ | |
1064 | { VM_FAULT_MAJOR, "MAJOR" }, \ | |
3d353901 SJ |
1065 | { VM_FAULT_HWPOISON, "HWPOISON" }, \ |
1066 | { VM_FAULT_HWPOISON_LARGE, "HWPOISON_LARGE" }, \ | |
1067 | { VM_FAULT_SIGSEGV, "SIGSEGV" }, \ | |
1068 | { VM_FAULT_NOPAGE, "NOPAGE" }, \ | |
1069 | { VM_FAULT_LOCKED, "LOCKED" }, \ | |
1070 | { VM_FAULT_RETRY, "RETRY" }, \ | |
1071 | { VM_FAULT_FALLBACK, "FALLBACK" }, \ | |
1072 | { VM_FAULT_DONE_COW, "DONE_COW" }, \ | |
1073 | { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" } | |
1074 | ||
f872f540 AL |
1075 | struct vm_special_mapping { |
1076 | const char *name; /* The name, e.g. "[vdso]". */ | |
1077 | ||
1078 | /* | |
1079 | * If .fault is not provided, this points to a | |
1080 | * NULL-terminated array of pages that back the special mapping. | |
1081 | * | |
1082 | * This must not be NULL unless .fault is provided. | |
1083 | */ | |
a62c34bd | 1084 | struct page **pages; |
f872f540 AL |
1085 | |
1086 | /* | |
1087 | * If non-NULL, then this is called to resolve page faults | |
1088 | * on the special mapping. If used, .pages is not checked. | |
1089 | */ | |
b3ec9f33 SJ |
1090 | vm_fault_t (*fault)(const struct vm_special_mapping *sm, |
1091 | struct vm_area_struct *vma, | |
1092 | struct vm_fault *vmf); | |
b059a453 DS |
1093 | |
1094 | int (*mremap)(const struct vm_special_mapping *sm, | |
1095 | struct vm_area_struct *new_vma); | |
a62c34bd AL |
1096 | }; |
1097 | ||
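/*
 * Illustrative sketch (editor's addition): a minimal special mapping backed by
 * a single page, in the style of vdso/vvar areas. The page and names are
 * hypothetical; such a mapping would be installed with
 * _install_special_mapping() from <linux/mm.h>.
 */
static struct page *my_special_page;	/* assumed allocated elsewhere */

static vm_fault_t my_special_fault(const struct vm_special_mapping *sm,
				   struct vm_area_struct *vma,
				   struct vm_fault *vmf)
{
	if (vmf->pgoff != 0)
		return VM_FAULT_SIGBUS;
	get_page(my_special_page);
	vmf->page = my_special_page;
	return 0;
}

static const struct vm_special_mapping my_special_mapping = {
	.name	= "[my_special]",
	.fault	= my_special_fault,
};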
d17d8f9d DH |
1098 | enum tlb_flush_reason { |
1099 | TLB_FLUSH_ON_TASK_SWITCH, | |
1100 | TLB_REMOTE_SHOOTDOWN, | |
1101 | TLB_LOCAL_SHOOTDOWN, | |
1102 | TLB_LOCAL_MM_SHOOTDOWN, | |
5b74283a | 1103 | TLB_REMOTE_SEND_IPI, |
d17d8f9d DH |
1104 | NR_TLB_FLUSH_REASONS, |
1105 | }; | |
1106 | ||
bd6dace7 TH |
1107 | /* |
1108 | * A swap entry has to fit into a "unsigned long", as the entry is hidden | |
1109 | * in the "index" field of the swapper address space. | |
1110 | */ | |
1111 | typedef struct { | |
1112 | unsigned long val; | |
1113 | } swp_entry_t; | |
1114 | ||
36090def AB |
1115 | /** |
1116 | * enum fault_flag - Fault flag definitions. | |
1117 | * @FAULT_FLAG_WRITE: Fault was a write fault. | |
1118 | * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE. | |
1119 | * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked. | |
1120 | * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying. | |
1121 | * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region. | |
1122 | * @FAULT_FLAG_TRIED: The fault has been tried once. | |
1123 | * @FAULT_FLAG_USER: The fault originated in userspace. | |
1124 | * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. | |
1125 | * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. | |
1126 | * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. | |
8d6a0ac0 DH |
1127 | * @FAULT_FLAG_UNSHARE: The fault is an unsharing request to break COW in a |
1128 | * COW mapping, making sure that an exclusive anon page is | |
1129 | * mapped after the fault. | |
f46f2ade PX |
1130 | * @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached. |
1131 | * We should only access orig_pte if this flag set. | |
55324e46 | 1132 | * @FAULT_FLAG_VMA_LOCK: The fault is handled under VMA lock. |
36090def AB |
1133 | * |
1134 | * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify | |
1135 | * whether we would allow page faults to retry by specifying these two | |
1136 | * fault flags correctly. Currently there can be three legal combinations: | |
1137 | * | |
1138 | * (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and | |
1139 | * this is the first try | |
1140 | * | |
1141 | * (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and | |
1142 | * we've already tried at least once | |
1143 | * | |
1144 | * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry | |
1145 | * | |
1146 | * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never | |
1147 | * be used. Note that page faults can be allowed to retry for multiple times, | |
1148 | * in which case we'll have an initial fault with flags (a) then later on | |
1149 | * continuous faults with flags (b). We should always try to detect pending | |
1150 | * signals before a retry to make sure the continuous page faults can still be | |
1151 | * interrupted if necessary. | |
c89357e2 DH |
1152 | * |
1153 | * The combination FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE is illegal. | |
1154 | * FAULT_FLAG_UNSHARE is ignored and treated like an ordinary read fault when | |
8d6a0ac0 | 1155 | * applied to mappings that are not COW mappings. |
36090def AB |
1156 | */ |
1157 | enum fault_flag { | |
1158 | FAULT_FLAG_WRITE = 1 << 0, | |
1159 | FAULT_FLAG_MKWRITE = 1 << 1, | |
1160 | FAULT_FLAG_ALLOW_RETRY = 1 << 2, | |
1161 | FAULT_FLAG_RETRY_NOWAIT = 1 << 3, | |
1162 | FAULT_FLAG_KILLABLE = 1 << 4, | |
1163 | FAULT_FLAG_TRIED = 1 << 5, | |
1164 | FAULT_FLAG_USER = 1 << 6, | |
1165 | FAULT_FLAG_REMOTE = 1 << 7, | |
1166 | FAULT_FLAG_INSTRUCTION = 1 << 8, | |
1167 | FAULT_FLAG_INTERRUPTIBLE = 1 << 9, | |
c89357e2 | 1168 | FAULT_FLAG_UNSHARE = 1 << 10, |
f46f2ade | 1169 | FAULT_FLAG_ORIG_PTE_VALID = 1 << 11, |
55324e46 | 1170 | FAULT_FLAG_VMA_LOCK = 1 << 12, |
36090def AB |
1171 | }; |
1172 | ||
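/*
 * Illustrative sketch (editor's addition): the retry rules above reduce to a
 * single predicate; (!ALLOW_RETRY && TRIED) is the one forbidden combination.
 */
static inline bool my_fault_retry_flags_are_legal(enum fault_flag flags)
{
	return (flags & FAULT_FLAG_ALLOW_RETRY) || !(flags & FAULT_FLAG_TRIED);
}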
05e90bd0 PX |
1173 | typedef unsigned int __bitwise zap_flags_t; |
1174 | ||
b5054174 DH |
1175 | /* |
1176 | * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each | |
1177 | * other. Here is what they mean, and how to use them: | |
1178 | * | |
b5054174 DH |
1179 | * |
1180 | * FIXME: For pages which are part of a filesystem, mappings are subject to the | |
1181 | * lifetime enforced by the filesystem and we need guarantees that longterm | |
1182 | * users like RDMA and V4L2 only establish mappings which coordinate usage with | |
1183 | * the filesystem. Ideas for this coordination include revoking the longterm | |
1184 | * pin, delaying writeback, bounce buffer page writeback, etc. As FS DAX was | |
1185 | * added after the problem with filesystems was found, FS DAX VMAs are |
1186 | * specifically failed. Filesystem pages are still subject to bugs and use of | |
1187 | * FOLL_LONGTERM should be avoided on those pages. | |
1188 | * | |
b5054174 DH |
1189 | * In the CMA case: long term pins in a CMA region would unnecessarily fragment |
1190 | * that region. And so, CMA attempts to migrate the page before pinning, when | |
1191 | * FOLL_LONGTERM is specified. | |
1192 | * | |
1193 | * FOLL_PIN indicates that a special kind of tracking (not just page->_refcount, | |
1194 | * but an additional pin counting system) will be invoked. This is intended for | |
1195 | * anything that gets a page reference and then touches page data (for example, | |
1196 | * Direct IO). This lets the filesystem know that some non-file-system entity is | |
1197 | * potentially changing the pages' data. In contrast to FOLL_GET (whose pages | |
1198 | * are released via put_page()), FOLL_PIN pages must be released, ultimately, by | |
1199 | * a call to unpin_user_page(). | |
1200 | * | |
1201 | * FOLL_PIN is similar to FOLL_GET: both of these pin pages. They use different | |
1202 | * and separate refcounting mechanisms, however, and that means that each has | |
1203 | * its own acquire and release mechanisms: | |
1204 | * | |
1205 | * FOLL_GET: get_user_pages*() to acquire, and put_page() to release. | |
1206 | * | |
1207 | * FOLL_PIN: pin_user_pages*() to acquire, and unpin_user_pages to release. | |
1208 | * | |
1209 | * FOLL_PIN and FOLL_GET are mutually exclusive for a given function call. | |
1210 | * (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based | |
1211 | * calls applied to them, and that's perfectly OK. This is a constraint on the | |
1212 | * callers, not on the pages.) | |
1213 | * | |
1214 | * FOLL_PIN should be set internally by the pin_user_pages*() APIs, never | |
1215 | * directly by the caller. That's in order to help avoid mismatches when | |
1216 | * releasing pages: get_user_pages*() pages must be released via put_page(), | |
1217 | * while pin_user_pages*() pages must be released via unpin_user_page(). | |
1218 | * | |
1219 | * Please see Documentation/core-api/pin_user_pages.rst for more information. | |
1220 | */ | |
1221 | ||
2c224108 JG |
1222 | enum { |
1223 | /* check pte is writable */ | |
1224 | FOLL_WRITE = 1 << 0, | |
1225 | /* do get_page on page */ | |
1226 | FOLL_GET = 1 << 1, | |
1227 | /* give error on hole if it would be zero */ | |
1228 | FOLL_DUMP = 1 << 2, | |
1229 | /* get_user_pages read/write w/o permission */ | |
1230 | FOLL_FORCE = 1 << 3, | |
1231 | /* | |
1232 | * if a disk transfer is needed, start the IO and return without waiting | |
1233 | * upon it | |
1234 | */ | |
1235 | FOLL_NOWAIT = 1 << 4, | |
1236 | /* do not fault in pages */ | |
1237 | FOLL_NOFAULT = 1 << 5, | |
1238 | /* check page is hwpoisoned */ | |
1239 | FOLL_HWPOISON = 1 << 6, | |
1240 | /* don't do file mappings */ | |
1241 | FOLL_ANON = 1 << 7, | |
1242 | /* | |
1243 | * FOLL_LONGTERM indicates that the page will be held for an indefinite | |
1244 | * time period _often_ under userspace control. This is in contrast to | |
1245 | * iov_iter_get_pages(), whose usages are transient. | |
1246 | */ | |
1247 | FOLL_LONGTERM = 1 << 8, | |
1248 | /* split huge pmd before returning */ | |
1249 | FOLL_SPLIT_PMD = 1 << 9, | |
1250 | /* allow returning PCI P2PDMA pages */ | |
1251 | FOLL_PCI_P2PDMA = 1 << 10, | |
1252 | /* allow interrupts from generic signals */ | |
1253 | FOLL_INTERRUPTIBLE = 1 << 11, | |
1254 | ||
1255 | /* See also internal only FOLL flags in mm/internal.h */ | |
1256 | }; | |
b5054174 | 1257 | |
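/*
 * Illustrative sketch (editor's addition): the FOLL_PIN rules above in
 * practice. pin_user_pages_fast() and unpin_user_pages() are declared in
 * <linux/mm.h>; FOLL_PIN itself is set internally by the pin_user_pages*()
 * API, the caller only passes flags such as FOLL_WRITE or FOLL_LONGTERM.
 */
static inline int my_pin_one_page(unsigned long uaddr, struct page **page)
{
	int ret = pin_user_pages_fast(uaddr, 1, FOLL_WRITE, page);

	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;
	/* ... direct I/O or DMA against *page ... */
	unpin_user_pages(page, 1);	/* never put_page() on pinned pages */
	return 0;
}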
5b99cd0e | 1258 | #endif /* _LINUX_MM_TYPES_H */ |