proc: pass a folio to smaps_page_accumulate()
[linux-2.6-block.git] / fs/proc/task_mmu.c
b2441318 1// SPDX-License-Identifier: GPL-2.0
a520110e 2#include <linux/pagewalk.h>
17fca131 3#include <linux/mm_inline.h>
1da177e4 4#include <linux/hugetlb.h>
22e057c5 5#include <linux/huge_mm.h>
1da177e4 6#include <linux/mount.h>
8b479335 7#include <linux/ksm.h>
1da177e4 8#include <linux/seq_file.h>
e070ad49 9#include <linux/highmem.h>
5096add8 10#include <linux/ptrace.h>
5a0e3ad6 11#include <linux/slab.h>
6e21c8f1
CL
12#include <linux/pagemap.h>
13#include <linux/mempolicy.h>
22e057c5 14#include <linux/rmap.h>
85863e47 15#include <linux/swap.h>
6e84f315 16#include <linux/sched/mm.h>
85863e47 17#include <linux/swapops.h>
0f8975ec 18#include <linux/mmu_notifier.h>
33c3fc71 19#include <linux/page_idle.h>
6a15a370 20#include <linux/shmem_fs.h>
b3a81d08 21#include <linux/uaccess.h>
27cca866 22#include <linux/pkeys.h>
52526ca7
MUA
23#include <linux/minmax.h>
24#include <linux/overflow.h>
e070ad49 25
1da177e4 26#include <asm/elf.h>
b3a81d08 27#include <asm/tlb.h>
e070ad49 28#include <asm/tlbflush.h>
1da177e4
LT
29#include "internal.h"
30
d1be35cb
AV
31#define SEQ_PUT_DEC(str, val) \
32 seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8)
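SEQ_PUT_DEC() prints a page count in kilobytes: shifting left by PAGE_SHIFT would give bytes and shifting right by 10 gives kB, so the two collapse into one shift by (PAGE_SHIFT - 10). A stand-alone check of that identity, assuming 4 KiB pages (illustrative, not part of this file):

/* Hypothetical user-space check; assumes PAGE_SHIFT == 12 (4 KiB pages). */
#include <assert.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned long long pages = 300;

	/* pages << (PAGE_SHIFT - 10)  ==  pages * 4096 / 1024 */
	assert((pages << (PAGE_SHIFT - 10)) == pages * 4096 / 1024);
	return 0;
}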
df5f8314 33void task_mem(struct seq_file *m, struct mm_struct *mm)
1da177e4 34{
af5b0f6a 35 unsigned long text, lib, swap, anon, file, shmem;
365e9c87
HD
36 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
37
8cee852e
JM
38 anon = get_mm_counter(mm, MM_ANONPAGES);
39 file = get_mm_counter(mm, MM_FILEPAGES);
40 shmem = get_mm_counter(mm, MM_SHMEMPAGES);
41
365e9c87
HD
42 /*
43 * Note: to minimize their overhead, mm maintains hiwater_vm and
44 * hiwater_rss only when about to *lower* total_vm or rss. Any
45 * collector of these hiwater stats must therefore get total_vm
46 * and rss too, which will usually be the higher. Barriers? not
47 * worth the effort, such snapshots can always be inconsistent.
48 */
49 hiwater_vm = total_vm = mm->total_vm;
50 if (hiwater_vm < mm->hiwater_vm)
51 hiwater_vm = mm->hiwater_vm;
8cee852e 52 hiwater_rss = total_rss = anon + file + shmem;
365e9c87
HD
53 if (hiwater_rss < mm->hiwater_rss)
54 hiwater_rss = mm->hiwater_rss;
1da177e4 55
8526d84f
KK
56 /* split executable areas between text and lib */
57 text = PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK);
58 text = min(text, mm->exec_vm << PAGE_SHIFT);
59 lib = (mm->exec_vm << PAGE_SHIFT) - text;
60
b084d435 61 swap = get_mm_counter(mm, MM_SWAPENTS);
d1be35cb
AV
62 SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
63 SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
64 SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
70f8a3ca 65 SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm));
d1be35cb
AV
66 SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss);
67 SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss);
68 SEQ_PUT_DEC(" kB\nRssAnon:\t", anon);
69 SEQ_PUT_DEC(" kB\nRssFile:\t", file);
70 SEQ_PUT_DEC(" kB\nRssShmem:\t", shmem);
71 SEQ_PUT_DEC(" kB\nVmData:\t", mm->data_vm);
72 SEQ_PUT_DEC(" kB\nVmStk:\t", mm->stack_vm);
73 seq_put_decimal_ull_width(m,
74 " kB\nVmExe:\t", text >> 10, 8);
75 seq_put_decimal_ull_width(m,
76 " kB\nVmLib:\t", lib >> 10, 8);
77 seq_put_decimal_ull_width(m,
78 " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8);
79 SEQ_PUT_DEC(" kB\nVmSwap:\t", swap);
80 seq_puts(m, " kB\n");
5d317b2b 81 hugetlb_report_usage(m, mm);
1da177e4 82}
d1be35cb 83#undef SEQ_PUT_DEC
1da177e4
LT
84
85unsigned long task_vsize(struct mm_struct *mm)
86{
87 return PAGE_SIZE * mm->total_vm;
88}
89
a2ade7b6
AD
90unsigned long task_statm(struct mm_struct *mm,
91 unsigned long *shared, unsigned long *text,
92 unsigned long *data, unsigned long *resident)
1da177e4 93{
eca56ff9
JM
94 *shared = get_mm_counter(mm, MM_FILEPAGES) +
95 get_mm_counter(mm, MM_SHMEMPAGES);
1da177e4
LT
96 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
97 >> PAGE_SHIFT;
84638335 98 *data = mm->data_vm + mm->stack_vm;
d559db08 99 *resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
1da177e4
LT
100 return mm->total_vm;
101}
102
9e781440
KH
103#ifdef CONFIG_NUMA
104/*
498f2371 105 * Save get_task_policy() for show_numa_map().
9e781440
KH
106 */
107static void hold_task_mempolicy(struct proc_maps_private *priv)
108{
109 struct task_struct *task = priv->task;
110
111 task_lock(task);
498f2371 112 priv->task_mempolicy = get_task_policy(task);
9e781440
KH
113 mpol_get(priv->task_mempolicy);
114 task_unlock(task);
115}
116static void release_task_mempolicy(struct proc_maps_private *priv)
117{
118 mpol_put(priv->task_mempolicy);
119}
120#else
121static void hold_task_mempolicy(struct proc_maps_private *priv)
122{
123}
124static void release_task_mempolicy(struct proc_maps_private *priv)
125{
126}
127#endif
128
c4c84f06
MWO
129static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
130 loff_t *ppos)
131{
132 struct vm_area_struct *vma = vma_next(&priv->iter);
133
134 if (vma) {
135 *ppos = vma->vm_start;
136 } else {
137 *ppos = -2UL;
138 vma = get_gate_vma(priv->mm);
139 }
140
141 return vma;
142}
143
0c255321 144static void *m_start(struct seq_file *m, loff_t *ppos)
e070ad49 145{
a6198797 146 struct proc_maps_private *priv = m->private;
4781f2c3 147 unsigned long last_addr = *ppos;
a6198797 148 struct mm_struct *mm;
a6198797 149
c2e88d22 150 /* See m_next(). Zero at the start or after lseek. */
b8c20a9b
ON
151 if (last_addr == -1UL)
152 return NULL;
153
2c03376d 154 priv->task = get_proc_task(priv->inode);
a6198797 155 if (!priv->task)
ec6fd8a4 156 return ERR_PTR(-ESRCH);
a6198797 157
29a40ace 158 mm = priv->mm;
d07ded61
MWO
159 if (!mm || !mmget_not_zero(mm)) {
160 put_task_struct(priv->task);
161 priv->task = NULL;
29a40ace 162 return NULL;
d07ded61 163 }
a6198797 164
d8ed45c5 165 if (mmap_read_lock_killable(mm)) {
8a713e7d 166 mmput(mm);
d07ded61
MWO
167 put_task_struct(priv->task);
168 priv->task = NULL;
8a713e7d
KK
169 return ERR_PTR(-EINTR);
170 }
171
c4c84f06 172 vma_iter_init(&priv->iter, mm, last_addr);
9e781440 173 hold_task_mempolicy(priv);
c4c84f06
MWO
174 if (last_addr == -2UL)
175 return get_gate_vma(mm);
59b4bf12 176
c4c84f06 177 return proc_get_vma(priv, ppos);
a6198797
MM
178}
179
4781f2c3 180static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
a6198797 181{
c4c84f06
MWO
182 if (*ppos == -2UL) {
183 *ppos = -1UL;
184 return NULL;
185 }
186 return proc_get_vma(m->private, ppos);
a6198797
MM
187}
188
189static void m_stop(struct seq_file *m, void *v)
190{
191 struct proc_maps_private *priv = m->private;
d07ded61 192 struct mm_struct *mm = priv->mm;
a6198797 193
d07ded61
MWO
194 if (!priv->task)
195 return;
196
197 release_task_mempolicy(priv);
d8ed45c5 198 mmap_read_unlock(mm);
d07ded61
MWO
199 mmput(mm);
200 put_task_struct(priv->task);
201 priv->task = NULL;
a6198797
MM
202}
203
4db7d0ee
ON
204static int proc_maps_open(struct inode *inode, struct file *file,
205 const struct seq_operations *ops, int psize)
206{
207 struct proc_maps_private *priv = __seq_open_private(file, ops, psize);
208
209 if (!priv)
210 return -ENOMEM;
211
2c03376d 212 priv->inode = inode;
29a40ace
ON
213 priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
214 if (IS_ERR(priv->mm)) {
215 int err = PTR_ERR(priv->mm);
216
217 seq_release_private(inode, file);
218 return err;
219 }
220
4db7d0ee
ON
221 return 0;
222}
223
29a40ace
ON
224static int proc_map_release(struct inode *inode, struct file *file)
225{
226 struct seq_file *seq = file->private_data;
227 struct proc_maps_private *priv = seq->private;
228
229 if (priv->mm)
230 mmdrop(priv->mm);
231
232 return seq_release_private(inode, file);
233}
234
a6198797 235static int do_maps_open(struct inode *inode, struct file *file,
03a44825 236 const struct seq_operations *ops)
a6198797 237{
4db7d0ee
ON
238 return proc_maps_open(inode, file, ops,
239 sizeof(struct proc_maps_private));
a6198797 240}
e070ad49 241
493b0e9d
DC
242static void show_vma_header_prefix(struct seq_file *m,
243 unsigned long start, unsigned long end,
244 vm_flags_t flags, unsigned long long pgoff,
245 dev_t dev, unsigned long ino)
246{
247 seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
0e3dc019
AV
248 seq_put_hex_ll(m, NULL, start, 8);
249 seq_put_hex_ll(m, "-", end, 8);
250 seq_putc(m, ' ');
251 seq_putc(m, flags & VM_READ ? 'r' : '-');
252 seq_putc(m, flags & VM_WRITE ? 'w' : '-');
253 seq_putc(m, flags & VM_EXEC ? 'x' : '-');
254 seq_putc(m, flags & VM_MAYSHARE ? 's' : 'p');
255 seq_put_hex_ll(m, " ", pgoff, 8);
256 seq_put_hex_ll(m, " ", MAJOR(dev), 2);
257 seq_put_hex_ll(m, ":", MINOR(dev), 2);
258 seq_put_decimal_ull(m, " ", ino);
259 seq_putc(m, ' ');
493b0e9d
DC
260}
261
b7643757 262static void
871305bb 263show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
1da177e4 264{
d09e8ca6 265 struct anon_vma_name *anon_name = NULL;
e070ad49
ML
266 struct mm_struct *mm = vma->vm_mm;
267 struct file *file = vma->vm_file;
ca16d140 268 vm_flags_t flags = vma->vm_flags;
1da177e4 269 unsigned long ino = 0;
6260a4b0 270 unsigned long long pgoff = 0;
a09a79f6 271 unsigned long start, end;
1da177e4 272 dev_t dev = 0;
b7643757 273 const char *name = NULL;
1da177e4
LT
274
275 if (file) {
3efdc78f
AV
276 const struct inode *inode = file_user_inode(vma->vm_file);
277
1da177e4
LT
278 dev = inode->i_sb->s_dev;
279 ino = inode->i_ino;
6260a4b0 280 pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
1da177e4
LT
281 }
282
d7824370 283 start = vma->vm_start;
a09a79f6 284 end = vma->vm_end;
493b0e9d 285 show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
d09e8ca6
PT
286 if (mm)
287 anon_name = anon_vma_name(vma);
1da177e4
LT
288
289 /*
290 * Print the dentry name for named mappings, and a
291 * special [heap] marker for the heap:
292 */
e070ad49 293 if (file) {
652586df 294 seq_pad(m, ' ');
d09e8ca6
PT
295 /*
296 * If the user named this anon shared memory via
297 * prctl(PR_SET_VMA, ...), use the provided name.
298 */
299 if (anon_name)
300 seq_printf(m, "[anon_shmem:%s]", anon_name->name);
301 else
08582d67 302 seq_path(m, file_user_path(file), "\n");
b7643757
SP
303 goto done;
304 }
305
78d683e8
AL
306 if (vma->vm_ops && vma->vm_ops->name) {
307 name = vma->vm_ops->name(vma);
308 if (name)
309 goto done;
310 }
311
b7643757
SP
312 name = arch_vma_name(vma);
313 if (!name) {
b7643757
SP
314 if (!mm) {
315 name = "[vdso]";
316 goto done;
317 }
318
11250fd1 319 if (vma_is_initial_heap(vma)) {
b7643757
SP
320 name = "[heap]";
321 goto done;
322 }
323
11250fd1 324 if (vma_is_initial_stack(vma)) {
65376df5 325 name = "[stack]";
9a10064f
CC
326 goto done;
327 }
328
9a10064f
CC
329 if (anon_name) {
330 seq_pad(m, ' ');
5c26f6ac 331 seq_printf(m, "[anon:%s]", anon_name->name);
9a10064f 332 }
b7643757
SP
333 }
334
335done:
336 if (name) {
652586df 337 seq_pad(m, ' ');
b7643757 338 seq_puts(m, name);
1da177e4
LT
339 }
340 seq_putc(m, '\n');
7c88db0c
JK
341}
342
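For orientation, show_vma_header_prefix() and show_map_vma() together emit one /proc/<pid>/maps line per mapping; a sketch of the resulting layout, with invented values, follows.

/*
 * Illustrative output line (all values invented):
 *
 *   7f2c4d600000-7f2c4d7b0000 r-xp 00000000 08:02 1048601   /usr/lib/libc.so.6
 *
 * i.e. start-end, permissions (r/w/x plus s(hared) or p(rivate)), file
 * offset, major:minor device, inode, then the backing path or one of the
 * [heap]/[stack]/[vdso]/[anon:<name>] markers assigned above.
 */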
871305bb 343static int show_map(struct seq_file *m, void *v)
7c88db0c 344{
871305bb 345 show_map_vma(m, v);
1da177e4
LT
346 return 0;
347}
348
03a44825 349static const struct seq_operations proc_pid_maps_op = {
a6198797
MM
350 .start = m_start,
351 .next = m_next,
352 .stop = m_stop,
871305bb 353 .show = show_map
a6198797
MM
354};
355
b7643757 356static int pid_maps_open(struct inode *inode, struct file *file)
a6198797
MM
357{
358 return do_maps_open(inode, file, &proc_pid_maps_op);
359}
360
b7643757
SP
361const struct file_operations proc_pid_maps_operations = {
362 .open = pid_maps_open,
363 .read = seq_read,
364 .llseek = seq_lseek,
29a40ace 365 .release = proc_map_release,
b7643757
SP
366};
367
a6198797
MM
368/*
369 * Proportional Set Size (PSS): my share of RSS.
370 *
371 * PSS of a process is the count of pages it has in memory, where each
372 * page is divided by the number of processes sharing it. So if a
373 * process has 1000 pages all to itself, and 1000 shared with one other
374 * process, its PSS will be 1500.
375 *
376 * To keep accumulated division errors low, we use a 64-bit
377 * fixed-point pss counter, so (pss >> PSS_SHIFT)
378 * is the real byte count.
379 *
380 * A shift of 12 before division means (assuming 4K page size):
381 * - 1M 3-user-pages add up to 8KB errors;
382 * - supports mapcount up to 2^24, or 16M;
383 * - supports PSS up to 2^52 bytes, or 4PB.
384 */
385#define PSS_SHIFT 12
386
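The example in the comment above can be checked numerically. A minimal user-space model of the fixed-point accumulation, assuming 4 KiB pages (illustrative, not part of this file):

#include <stdio.h>

#define PSS_SHIFT 12

int main(void)
{
	unsigned long long pss = 0;
	int i;

	/* 1000 private pages: each contributes a full page. */
	for (i = 0; i < 1000; i++)
		pss += 4096ULL << PSS_SHIFT;
	/* 1000 pages shared with one other process: half a page each. */
	for (i = 0; i < 1000; i++)
		pss += (4096ULL << PSS_SHIFT) / 2;

	/* 1500 pages' worth of PSS: prints "Pss: 6000 kB" */
	printf("Pss: %llu kB\n", (pss >> PSS_SHIFT) >> 10);
	return 0;
}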
1e883281 387#ifdef CONFIG_PROC_PAGE_MONITOR
214e471f 388struct mem_size_stats {
a6198797
MM
389 unsigned long resident;
390 unsigned long shared_clean;
391 unsigned long shared_dirty;
392 unsigned long private_clean;
393 unsigned long private_dirty;
394 unsigned long referenced;
b40d4f84 395 unsigned long anonymous;
cf8496ea 396 unsigned long lazyfree;
4031a219 397 unsigned long anonymous_thp;
65c45377 398 unsigned long shmem_thp;
60fbf0ab 399 unsigned long file_thp;
214e471f 400 unsigned long swap;
25ee01a2
NH
401 unsigned long shared_hugetlb;
402 unsigned long private_hugetlb;
8b479335 403 unsigned long ksm;
a6198797 404 u64 pss;
ee2ad71b
LS
405 u64 pss_anon;
406 u64 pss_file;
407 u64 pss_shmem;
30934843 408 u64 pss_dirty;
493b0e9d 409 u64 pss_locked;
8334b962 410 u64 swap_pss;
a6198797
MM
411};
412
ee2ad71b 413static void smaps_page_accumulate(struct mem_size_stats *mss,
27bb0a70 414 struct folio *folio, unsigned long size, unsigned long pss,
ee2ad71b
LS
415 bool dirty, bool locked, bool private)
416{
417 mss->pss += pss;
418
cfc96da4 419 if (folio_test_anon(folio))
ee2ad71b 420 mss->pss_anon += pss;
cfc96da4 421 else if (folio_test_swapbacked(folio))
ee2ad71b
LS
422 mss->pss_shmem += pss;
423 else
424 mss->pss_file += pss;
425
426 if (locked)
427 mss->pss_locked += pss;
428
cfc96da4 429 if (dirty || folio_test_dirty(folio)) {
30934843 430 mss->pss_dirty += pss;
ee2ad71b
LS
431 if (private)
432 mss->private_dirty += size;
433 else
434 mss->shared_dirty += size;
435 } else {
436 if (private)
437 mss->private_clean += size;
438 else
439 mss->shared_clean += size;
440 }
441}
442
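Since the commit named at the top passes the folio straight into smaps_page_accumulate(), it helps to spell out what a single call contributes. A worked example, written as a comment (illustrative, not part of the file):

/*
 * Example: for one private, dirty, unshared 4 KiB anonymous page,
 * smaps_account() below calls this helper with size = 4096 and
 * pss = 4096 << PSS_SHIFT, so private_dirty grows by 4096 while pss,
 * pss_anon and pss_dirty each grow by 4096 << PSS_SHIFT. Had the folio
 * been swap-backed (shmem) rather than anonymous, the fixed-point share
 * would have gone to pss_shmem, and otherwise to pss_file.
 */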
c164e038 443static void smaps_account(struct mem_size_stats *mss, struct page *page,
24d7275c
YS
444 bool compound, bool young, bool dirty, bool locked,
445 bool migration)
c164e038 446{
6c977f36 447 struct folio *folio = page_folio(page);
d8c6546b 448 int i, nr = compound ? compound_nr(page) : 1;
afd9883f 449 unsigned long size = nr * PAGE_SIZE;
c164e038 450
ee2ad71b
LS
451 /*
452 * First accumulate quantities that depend only on |size| and the type
453 * of the compound page.
454 */
6c977f36 455 if (folio_test_anon(folio)) {
c164e038 456 mss->anonymous += size;
6c977f36
MWO
457 if (!folio_test_swapbacked(folio) && !dirty &&
458 !folio_test_dirty(folio))
cf8496ea
SL
459 mss->lazyfree += size;
460 }
c164e038 461
6c977f36 462 if (folio_test_ksm(folio))
8b479335
SR
463 mss->ksm += size;
464
c164e038
KS
465 mss->resident += size;
466 /* Accumulate the size in pages that have been accessed. */
6c977f36 467 if (young || folio_test_young(folio) || folio_test_referenced(folio))
c164e038 468 mss->referenced += size;
c164e038 469
afd9883f 470 /*
ee2ad71b
LS
471 * Then accumulate quantities that may depend on sharing, or that may
472 * differ page-by-page.
473 *
6c977f36 474 * refcount == 1 guarantees the page is mapped exactly once.
afd9883f 475 * If any subpage of the compound page is mapped by PTE, it would
6c977f36 476 * elevate the refcount.
24d7275c
YS
477 *
478 * The page_mapcount() is called to get a snapshot of the mapcount.
479 * Without holding the page lock this snapshot can be slightly wrong as
480 * we cannot always read the mapcount atomically. It is not safe to
481 * call page_mapcount() even with PTL held if the page is not mapped,
482 * especially for migration entries. Treat regular migration entries
483 * as mapcount == 1.
afd9883f 484 */
6c977f36 485 if ((folio_ref_count(folio) == 1) || migration) {
27bb0a70
MWO
486 smaps_page_accumulate(mss, folio, size, size << PSS_SHIFT,
487 dirty, locked, true);
afd9883f
KS
488 return;
489 }
afd9883f
KS
490 for (i = 0; i < nr; i++, page++) {
491 int mapcount = page_mapcount(page);
ee2ad71b
LS
492 unsigned long pss = PAGE_SIZE << PSS_SHIFT;
493 if (mapcount >= 2)
494 pss /= mapcount;
27bb0a70
MWO
495 smaps_page_accumulate(mss, folio, PAGE_SIZE, pss,
496 dirty, locked, mapcount < 2);
c164e038
KS
497 }
498}
ae11c4d9 499
c261e7d9 500#ifdef CONFIG_SHMEM
c261e7d9 501static int smaps_pte_hole(unsigned long addr, unsigned long end,
b7a16c7a 502 __always_unused int depth, struct mm_walk *walk)
c261e7d9
VB
503{
504 struct mem_size_stats *mss = walk->private;
10c848c8 505 struct vm_area_struct *vma = walk->vma;
c261e7d9 506
10c848c8
PX
507 mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping,
508 linear_page_index(vma, addr),
509 linear_page_index(vma, end));
c261e7d9
VB
510
511 return 0;
512}
7b86ac33
CH
513#else
514#define smaps_pte_hole NULL
515#endif /* CONFIG_SHMEM */
c261e7d9 516
23010032
PX
517static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk)
518{
519#ifdef CONFIG_SHMEM
520 if (walk->ops->pte_hole) {
521 /* depth is not used */
522 smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk);
523 }
524#endif
525}
526
c164e038
KS
527static void smaps_pte_entry(pte_t *pte, unsigned long addr,
528 struct mm_walk *walk)
ae11c4d9
DH
529{
530 struct mem_size_stats *mss = walk->private;
14eb6fdd 531 struct vm_area_struct *vma = walk->vma;
27dd768e 532 bool locked = !!(vma->vm_flags & VM_LOCKED);
b1d4d9e0 533 struct page *page = NULL;
efd41493 534 bool migration = false, young = false, dirty = false;
c33c7948 535 pte_t ptent = ptep_get(pte);
ae11c4d9 536
c33c7948
RR
537 if (pte_present(ptent)) {
538 page = vm_normal_page(vma, addr, ptent);
539 young = pte_young(ptent);
540 dirty = pte_dirty(ptent);
541 } else if (is_swap_pte(ptent)) {
542 swp_entry_t swpent = pte_to_swp_entry(ptent);
ae11c4d9 543
8334b962
MK
544 if (!non_swap_entry(swpent)) {
545 int mapcount;
546
c164e038 547 mss->swap += PAGE_SIZE;
8334b962
MK
548 mapcount = swp_swapcount(swpent);
549 if (mapcount >= 2) {
550 u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
551
552 do_div(pss_delta, mapcount);
553 mss->swap_pss += pss_delta;
554 } else {
555 mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
556 }
24d7275c
YS
557 } else if (is_pfn_swap_entry(swpent)) {
558 if (is_migration_entry(swpent))
559 migration = true;
af5cdaf8 560 page = pfn_swap_entry_to_page(swpent);
24d7275c 561 }
23010032
PX
562 } else {
563 smaps_pte_hole_lookup(addr, walk);
48131e03 564 return;
b1d4d9e0 565 }
ae11c4d9 566
ae11c4d9
DH
567 if (!page)
568 return;
afd9883f 569
efd41493 570 smaps_account(mss, page, false, young, dirty, locked, migration);
ae11c4d9
DH
571}
572
c164e038
KS
573#ifdef CONFIG_TRANSPARENT_HUGEPAGE
574static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
575 struct mm_walk *walk)
576{
577 struct mem_size_stats *mss = walk->private;
14eb6fdd 578 struct vm_area_struct *vma = walk->vma;
27dd768e 579 bool locked = !!(vma->vm_flags & VM_LOCKED);
c94b6923 580 struct page *page = NULL;
24d7275c 581 bool migration = false;
c94b6923
HY
582
583 if (pmd_present(*pmd)) {
8b9c1cc0 584 page = vm_normal_page_pmd(vma, addr, *pmd);
c94b6923
HY
585 } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
586 swp_entry_t entry = pmd_to_swp_entry(*pmd);
c164e038 587
24d7275c
YS
588 if (is_migration_entry(entry)) {
589 migration = true;
af5cdaf8 590 page = pfn_swap_entry_to_page(entry);
24d7275c 591 }
c94b6923 592 }
c164e038
KS
593 if (IS_ERR_OR_NULL(page))
594 return;
65c45377
KS
595 if (PageAnon(page))
596 mss->anonymous_thp += HPAGE_PMD_SIZE;
597 else if (PageSwapBacked(page))
598 mss->shmem_thp += HPAGE_PMD_SIZE;
ca120cf6
DW
599 else if (is_zone_device_page(page))
600 /* pass */;
65c45377 601 else
60fbf0ab 602 mss->file_thp += HPAGE_PMD_SIZE;
24d7275c
YS
603
604 smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
605 locked, migration);
c164e038
KS
606}
607#else
608static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
609 struct mm_walk *walk)
610{
611}
612#endif
613
b3ae5acb 614static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
2165009b 615 struct mm_walk *walk)
e070ad49 616{
14eb6fdd 617 struct vm_area_struct *vma = walk->vma;
ae11c4d9 618 pte_t *pte;
705e87c0 619 spinlock_t *ptl;
e070ad49 620
b6ec57f4
KS
621 ptl = pmd_trans_huge_lock(pmd, vma);
622 if (ptl) {
c94b6923 623 smaps_pmd_entry(pmd, addr, walk);
bf929152 624 spin_unlock(ptl);
14038302 625 goto out;
22e057c5 626 }
1a5a9906 627
705e87c0 628 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
7780d040
HD
629 if (!pte) {
630 walk->action = ACTION_AGAIN;
631 return 0;
632 }
ae11c4d9 633 for (; addr != end; pte++, addr += PAGE_SIZE)
c164e038 634 smaps_pte_entry(pte, addr, walk);
705e87c0 635 pte_unmap_unlock(pte - 1, ptl);
14038302 636out:
705e87c0 637 cond_resched();
b3ae5acb 638 return 0;
e070ad49
ML
639}
640
834f82e2
CG
641static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
642{
643 /*
644 * Don't forget to update Documentation/ on changes.
645 */
646 static const char mnemonics[BITS_PER_LONG][2] = {
647 /*
648 * In case we meet a flag we don't know about.
649 */
650 [0 ... (BITS_PER_LONG-1)] = "??",
651
652 [ilog2(VM_READ)] = "rd",
653 [ilog2(VM_WRITE)] = "wr",
654 [ilog2(VM_EXEC)] = "ex",
655 [ilog2(VM_SHARED)] = "sh",
656 [ilog2(VM_MAYREAD)] = "mr",
657 [ilog2(VM_MAYWRITE)] = "mw",
658 [ilog2(VM_MAYEXEC)] = "me",
659 [ilog2(VM_MAYSHARE)] = "ms",
660 [ilog2(VM_GROWSDOWN)] = "gd",
661 [ilog2(VM_PFNMAP)] = "pf",
834f82e2
CG
662 [ilog2(VM_LOCKED)] = "lo",
663 [ilog2(VM_IO)] = "io",
664 [ilog2(VM_SEQ_READ)] = "sr",
665 [ilog2(VM_RAND_READ)] = "rr",
666 [ilog2(VM_DONTCOPY)] = "dc",
667 [ilog2(VM_DONTEXPAND)] = "de",
8614d6c5 668 [ilog2(VM_LOCKONFAULT)] = "lf",
834f82e2
CG
669 [ilog2(VM_ACCOUNT)] = "ac",
670 [ilog2(VM_NORESERVE)] = "nr",
671 [ilog2(VM_HUGETLB)] = "ht",
b6fb293f 672 [ilog2(VM_SYNC)] = "sf",
834f82e2 673 [ilog2(VM_ARCH_1)] = "ar",
d2cd9ede 674 [ilog2(VM_WIPEONFORK)] = "wf",
834f82e2 675 [ilog2(VM_DONTDUMP)] = "dd",
424037b7
DK
676#ifdef CONFIG_ARM64_BTI
677 [ilog2(VM_ARM64_BTI)] = "bt",
678#endif
ec8e41ae
NH
679#ifdef CONFIG_MEM_SOFT_DIRTY
680 [ilog2(VM_SOFTDIRTY)] = "sd",
681#endif
834f82e2
CG
682 [ilog2(VM_MIXEDMAP)] = "mm",
683 [ilog2(VM_HUGEPAGE)] = "hg",
684 [ilog2(VM_NOHUGEPAGE)] = "nh",
685 [ilog2(VM_MERGEABLE)] = "mg",
16ba6f81
AA
686 [ilog2(VM_UFFD_MISSING)]= "um",
687 [ilog2(VM_UFFD_WP)] = "uw",
9f341931
CM
688#ifdef CONFIG_ARM64_MTE
689 [ilog2(VM_MTE)] = "mt",
690 [ilog2(VM_MTE_ALLOWED)] = "",
691#endif
5212213a 692#ifdef CONFIG_ARCH_HAS_PKEYS
c1192f84
DH
693 /* These come out via ProtectionKey: */
694 [ilog2(VM_PKEY_BIT0)] = "",
695 [ilog2(VM_PKEY_BIT1)] = "",
696 [ilog2(VM_PKEY_BIT2)] = "",
697 [ilog2(VM_PKEY_BIT3)] = "",
2c9e0a6f
RP
698#if VM_PKEY_BIT4
699 [ilog2(VM_PKEY_BIT4)] = "",
c1192f84 700#endif
5212213a 701#endif /* CONFIG_ARCH_HAS_PKEYS */
7677f7fd
AR
702#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
703 [ilog2(VM_UFFD_MINOR)] = "ui",
704#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
54007f81
YY
705#ifdef CONFIG_X86_USER_SHADOW_STACK
706 [ilog2(VM_SHADOW_STACK)] = "ss",
707#endif
834f82e2
CG
708 };
709 size_t i;
710
711 seq_puts(m, "VmFlags: ");
712 for (i = 0; i < BITS_PER_LONG; i++) {
c1192f84
DH
713 if (!mnemonics[i][0])
714 continue;
834f82e2 715 if (vma->vm_flags & (1UL << i)) {
f6640663
AV
716 seq_putc(m, mnemonics[i][0]);
717 seq_putc(m, mnemonics[i][1]);
718 seq_putc(m, ' ');
834f82e2
CG
719 }
720 }
721 seq_putc(m, '\n');
722}
723
25ee01a2
NH
724#ifdef CONFIG_HUGETLB_PAGE
725static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
726 unsigned long addr, unsigned long end,
727 struct mm_walk *walk)
728{
729 struct mem_size_stats *mss = walk->private;
730 struct vm_area_struct *vma = walk->vma;
731 struct page *page = NULL;
c33c7948 732 pte_t ptent = ptep_get(pte);
25ee01a2 733
c33c7948
RR
734 if (pte_present(ptent)) {
735 page = vm_normal_page(vma, addr, ptent);
736 } else if (is_swap_pte(ptent)) {
737 swp_entry_t swpent = pte_to_swp_entry(ptent);
25ee01a2 738
af5cdaf8
AP
739 if (is_pfn_swap_entry(swpent))
740 page = pfn_swap_entry_to_page(swpent);
25ee01a2
NH
741 }
742 if (page) {
3489dbb6 743 if (page_mapcount(page) >= 2 || hugetlb_pmd_shared(pte))
25ee01a2
NH
744 mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
745 else
746 mss->private_hugetlb += huge_page_size(hstate_vma(vma));
747 }
748 return 0;
749}
7b86ac33
CH
750#else
751#define smaps_hugetlb_range NULL
25ee01a2
NH
752#endif /* HUGETLB_PAGE */
753
7b86ac33
CH
754static const struct mm_walk_ops smaps_walk_ops = {
755 .pmd_entry = smaps_pte_range,
756 .hugetlb_entry = smaps_hugetlb_range,
49b06385 757 .walk_lock = PGWALK_RDLOCK,
7b86ac33
CH
758};
759
760static const struct mm_walk_ops smaps_shmem_walk_ops = {
761 .pmd_entry = smaps_pte_range,
762 .hugetlb_entry = smaps_hugetlb_range,
763 .pte_hole = smaps_pte_hole,
49b06385 764 .walk_lock = PGWALK_RDLOCK,
7b86ac33
CH
765};
766
03b4b114
CC
767/*
768 * Gather mem stats from @vma with the indicated beginning
769 * address @start, and keep them in @mss.
770 *
771 * Use vm_start of @vma as the beginning address if @start is 0.
772 */
8e68d689 773static void smap_gather_stats(struct vm_area_struct *vma,
03b4b114 774 struct mem_size_stats *mss, unsigned long start)
e070ad49 775{
03b4b114
CC
776 const struct mm_walk_ops *ops = &smaps_walk_ops;
777
778 /* Invalid start */
779 if (start >= vma->vm_end)
780 return;
781
c261e7d9 782 if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
6a15a370
VB
783 /*
784 * For shared or readonly shmem mappings we know that all
785 * swapped out pages belong to the shmem object, and we can
786 * obtain the swap value much more efficiently. For private
787 * writable mappings, we might have COW pages that are
788 * not affected by the parent swapped out pages of the shmem
789 * object, so we have to distinguish them during the page walk.
790 * Unless we know that the shmem object (or the part mapped by
791 * our VMA) has no swapped out pages at all.
792 */
793 unsigned long shmem_swapped = shmem_swap_usage(vma);
794
03b4b114
CC
795 if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
796 !(vma->vm_flags & VM_WRITE))) {
fa76da46 797 mss->swap += shmem_swapped;
6a15a370 798 } else {
03b4b114 799 ops = &smaps_shmem_walk_ops;
6a15a370 800 }
c261e7d9 801 }
b4aca547 802
c1e8d7c6 803 /* mmap_lock is held in m_start */
03b4b114
CC
804 if (!start)
805 walk_page_vma(vma, ops, mss);
806 else
807 walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss);
8e68d689
VB
808}
809
810#define SEQ_PUT_DEC(str, val) \
811 seq_put_decimal_ull_width(m, str, (val) >> 10, 8)
f1547959
VB
812
813/* Show the contents common for smaps and smaps_rollup */
ee2ad71b
LS
814static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
815 bool rollup_mode)
f1547959
VB
816{
817 SEQ_PUT_DEC("Rss: ", mss->resident);
818 SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT);
30934843 819 SEQ_PUT_DEC(" kB\nPss_Dirty: ", mss->pss_dirty >> PSS_SHIFT);
ee2ad71b
LS
820 if (rollup_mode) {
821 /*
822 * These are meaningful only for smaps_rollup, otherwise two of
823 * them are zero, and the other one is the same as Pss.
824 */
825 SEQ_PUT_DEC(" kB\nPss_Anon: ",
826 mss->pss_anon >> PSS_SHIFT);
827 SEQ_PUT_DEC(" kB\nPss_File: ",
828 mss->pss_file >> PSS_SHIFT);
829 SEQ_PUT_DEC(" kB\nPss_Shmem: ",
830 mss->pss_shmem >> PSS_SHIFT);
831 }
f1547959
VB
832 SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean);
833 SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty);
834 SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean);
835 SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty);
836 SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced);
837 SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous);
8b479335 838 SEQ_PUT_DEC(" kB\nKSM: ", mss->ksm);
f1547959
VB
839 SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree);
840 SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp);
841 SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
471e78cc 842 SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp);
f1547959
VB
843 SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb);
844 seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ",
845 mss->private_hugetlb >> 10, 7);
846 SEQ_PUT_DEC(" kB\nSwap: ", mss->swap);
847 SEQ_PUT_DEC(" kB\nSwapPss: ",
848 mss->swap_pss >> PSS_SHIFT);
849 SEQ_PUT_DEC(" kB\nLocked: ",
850 mss->pss_locked >> PSS_SHIFT);
851 seq_puts(m, " kB\n");
852}
853
8e68d689
VB
854static int show_smap(struct seq_file *m, void *v)
855{
8e68d689 856 struct vm_area_struct *vma = v;
860a2e7f 857 struct mem_size_stats mss = {};
258f669e 858
03b4b114 859 smap_gather_stats(vma, &mss, 0);
258f669e
VB
860
861 show_map_vma(m, vma);
862
863 SEQ_PUT_DEC("Size: ", vma->vm_end - vma->vm_start);
864 SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma));
865 SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma));
866 seq_puts(m, " kB\n");
867
ee2ad71b 868 __show_smap(m, &mss, false);
258f669e 869
daa60ae6 870 seq_printf(m, "THPeligible: %8u\n",
3485b883
RR
871 !!thp_vma_allowable_orders(vma, vma->vm_flags, true, false,
872 true, THP_ORDERS_ALL));
7635d9cb 873
258f669e
VB
874 if (arch_pkeys_enabled())
875 seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
876 show_smap_vma_flags(m, vma);
877
258f669e
VB
878 return 0;
879}
880
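Putting show_map_vma(), the three size lines and __show_smap() together, each VMA yields one smaps entry. An abbreviated sketch of the output, with invented values:

/*
 * Illustrative only (values invented, most counters elided):
 *
 *   7f2c4d600000-7f2c4d7b0000 r-xp 00000000 08:02 1048601   /usr/lib/libc.so.6
 *   Size:               1728 kB
 *   KernelPageSize:        4 kB
 *   MMUPageSize:           4 kB
 *   Rss:                1500 kB
 *   Pss:                 320 kB
 *   ...                  (remaining counters in the order __show_smap() prints them)
 *   THPeligible:            0
 *   VmFlags: rd ex mr mw me sd
 */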
881static int show_smaps_rollup(struct seq_file *m, void *v)
882{
883 struct proc_maps_private *priv = m->private;
860a2e7f 884 struct mem_size_stats mss = {};
c4c84f06 885 struct mm_struct *mm = priv->mm;
258f669e 886 struct vm_area_struct *vma;
c4c84f06 887 unsigned long vma_start = 0, last_vma_end = 0;
8e68d689 888 int ret = 0;
250cb40f 889 VMA_ITERATOR(vmi, mm, 0);
8e68d689 890
258f669e
VB
891 priv->task = get_proc_task(priv->inode);
892 if (!priv->task)
893 return -ESRCH;
493b0e9d 894
258f669e
VB
895 if (!mm || !mmget_not_zero(mm)) {
896 ret = -ESRCH;
897 goto out_put_task;
493b0e9d 898 }
4752c369 899
d8ed45c5 900 ret = mmap_read_lock_killable(mm);
a26a9781
KK
901 if (ret)
902 goto out_put_mm;
903
258f669e 904 hold_task_mempolicy(priv);
250cb40f 905 vma = vma_next(&vmi);
f1547959 906
c4c84f06
MWO
907 if (unlikely(!vma))
908 goto empty_set;
909
910 vma_start = vma->vm_start;
911 do {
03b4b114 912 smap_gather_stats(vma, &mss, 0);
258f669e 913 last_vma_end = vma->vm_end;
ff9f47f6
CC
914
915 /*
916 * Release mmap_lock temporarily if someone wants to
917 * access it for write request.
918 */
919 if (mmap_lock_is_contended(mm)) {
250cb40f 920 vma_iter_invalidate(&vmi);
ff9f47f6
CC
921 mmap_read_unlock(mm);
922 ret = mmap_read_lock_killable(mm);
923 if (ret) {
924 release_task_mempolicy(priv);
925 goto out_put_mm;
926 }
927
928 /*
929 * After dropping the lock, there are four cases to
930 * consider. See the following example for explanation.
931 *
932 * +------+------+-----------+
933 * | VMA1 | VMA2 | VMA3 |
934 * +------+------+-----------+
935 * | | | |
936 * 4k 8k 16k 400k
937 *
938 * Suppose we drop the lock after reading VMA2 due to
939 * contention, then we get:
940 *
941 * last_vma_end = 16k
942 *
943 * 1) VMA2 is freed, but VMA3 exists:
944 *
250cb40f 945 * vma_next(vmi) will return VMA3.
ff9f47f6
CC
946 * In this case, just continue from VMA3.
947 *
948 * 2) VMA2 still exists:
949 *
250cb40f
LH
950 * vma_next(vmi) will return VMA3.
951 * In this case, just continue from VMA3.
ff9f47f6
CC
952 *
953 * 3) No more VMAs can be found:
954 *
250cb40f 955 * vma_next(vmi) will return NULL.
ff9f47f6
CC
956 * No more things to do, just break.
957 *
958 * 4) (last_vma_end - 1) is the middle of a vma (VMA'):
959 *
250cb40f 960 * vma_next(vmi) will return VMA' whose range
ff9f47f6
CC
961 * contains last_vma_end.
962 * Iterate VMA' from last_vma_end.
963 */
250cb40f 964 vma = vma_next(&vmi);
ff9f47f6
CC
965 /* Case 3 above */
966 if (!vma)
967 break;
968
250cb40f 969 /* Case 1 and 2 above */
ff9f47f6
CC
970 if (vma->vm_start >= last_vma_end)
971 continue;
972
973 /* Case 4 above */
974 if (vma->vm_end > last_vma_end)
975 smap_gather_stats(vma, &mss, last_vma_end);
976 }
250cb40f 977 } for_each_vma(vmi, vma);
258f669e 978
c4c84f06
MWO
979empty_set:
980 show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0);
258f669e
VB
981 seq_pad(m, ' ');
982 seq_puts(m, "[rollup]\n");
983
ee2ad71b 984 __show_smap(m, &mss, true);
258f669e
VB
985
986 release_task_mempolicy(priv);
d8ed45c5 987 mmap_read_unlock(mm);
258f669e 988
a26a9781
KK
989out_put_mm:
990 mmput(mm);
258f669e
VB
991out_put_task:
992 put_task_struct(priv->task);
993 priv->task = NULL;
994
493b0e9d 995 return ret;
e070ad49 996}
d1be35cb 997#undef SEQ_PUT_DEC
e070ad49 998
03a44825 999static const struct seq_operations proc_pid_smaps_op = {
a6198797
MM
1000 .start = m_start,
1001 .next = m_next,
1002 .stop = m_stop,
871305bb 1003 .show = show_smap
a6198797
MM
1004};
1005
b7643757 1006static int pid_smaps_open(struct inode *inode, struct file *file)
a6198797
MM
1007{
1008 return do_maps_open(inode, file, &proc_pid_smaps_op);
1009}
1010
258f669e 1011static int smaps_rollup_open(struct inode *inode, struct file *file)
493b0e9d 1012{
258f669e 1013 int ret;
493b0e9d 1014 struct proc_maps_private *priv;
258f669e
VB
1015
1016 priv = kzalloc(sizeof(*priv), GFP_KERNEL_ACCOUNT);
1017 if (!priv)
493b0e9d 1018 return -ENOMEM;
258f669e
VB
1019
1020 ret = single_open(file, show_smaps_rollup, priv);
1021 if (ret)
1022 goto out_free;
1023
1024 priv->inode = inode;
1025 priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
1026 if (IS_ERR(priv->mm)) {
1027 ret = PTR_ERR(priv->mm);
1028
1029 single_release(inode, file);
1030 goto out_free;
493b0e9d 1031 }
258f669e 1032
493b0e9d 1033 return 0;
258f669e
VB
1034
1035out_free:
1036 kfree(priv);
1037 return ret;
1038}
1039
1040static int smaps_rollup_release(struct inode *inode, struct file *file)
1041{
1042 struct seq_file *seq = file->private_data;
1043 struct proc_maps_private *priv = seq->private;
1044
1045 if (priv->mm)
1046 mmdrop(priv->mm);
1047
1048 kfree(priv);
1049 return single_release(inode, file);
493b0e9d
DC
1050}
1051
b7643757
SP
1052const struct file_operations proc_pid_smaps_operations = {
1053 .open = pid_smaps_open,
1054 .read = seq_read,
1055 .llseek = seq_lseek,
29a40ace 1056 .release = proc_map_release,
b7643757
SP
1057};
1058
493b0e9d 1059const struct file_operations proc_pid_smaps_rollup_operations = {
258f669e 1060 .open = smaps_rollup_open,
493b0e9d
DC
1061 .read = seq_read,
1062 .llseek = seq_lseek,
258f669e 1063 .release = smaps_rollup_release,
493b0e9d
DC
1064};
1065
040fa020
PE
1066enum clear_refs_types {
1067 CLEAR_REFS_ALL = 1,
1068 CLEAR_REFS_ANON,
1069 CLEAR_REFS_MAPPED,
0f8975ec 1070 CLEAR_REFS_SOFT_DIRTY,
695f0559 1071 CLEAR_REFS_MM_HIWATER_RSS,
040fa020
PE
1072 CLEAR_REFS_LAST,
1073};
1074
af9de7eb 1075struct clear_refs_private {
0f8975ec 1076 enum clear_refs_types type;
af9de7eb
PE
1077};
1078
7d5b3bfa 1079#ifdef CONFIG_MEM_SOFT_DIRTY
9348b73c 1080
9348b73c
LT
1081static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
1082{
1083 struct page *page;
1084
1085 if (!pte_write(pte))
1086 return false;
1087 if (!is_cow_mapping(vma->vm_flags))
1088 return false;
a458b76a 1089 if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)))
9348b73c
LT
1090 return false;
1091 page = vm_normal_page(vma, addr, pte);
1092 if (!page)
1093 return false;
1094 return page_maybe_dma_pinned(page);
1095}
1096
0f8975ec
PE
1097static inline void clear_soft_dirty(struct vm_area_struct *vma,
1098 unsigned long addr, pte_t *pte)
1099{
0f8975ec
PE
1100 /*
1101 * The soft-dirty tracker uses #PF-s to catch writes
1102 * to pages, so write-protect the pte as well. See the
1ad1335d 1103 * Documentation/admin-guide/mm/soft-dirty.rst for full description
0f8975ec
PE
1104 * of how soft-dirty works.
1105 */
c33c7948 1106 pte_t ptent = ptep_get(pte);
179ef71c
CG
1107
1108 if (pte_present(ptent)) {
04a86453
AK
1109 pte_t old_pte;
1110
9348b73c
LT
1111 if (pte_is_pinned(vma, addr, ptent))
1112 return;
04a86453
AK
1113 old_pte = ptep_modify_prot_start(vma, addr, pte);
1114 ptent = pte_wrprotect(old_pte);
a7b76174 1115 ptent = pte_clear_soft_dirty(ptent);
04a86453 1116 ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
179ef71c
CG
1117 } else if (is_swap_pte(ptent)) {
1118 ptent = pte_swp_clear_soft_dirty(ptent);
326c2597 1119 set_pte_at(vma->vm_mm, addr, pte, ptent);
179ef71c 1120 }
0f8975ec 1121}
5d3875a0
LD
1122#else
1123static inline void clear_soft_dirty(struct vm_area_struct *vma,
1124 unsigned long addr, pte_t *pte)
1125{
1126}
1127#endif
0f8975ec 1128
5d3875a0 1129#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
7d5b3bfa
KS
1130static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
1131 unsigned long addr, pmd_t *pmdp)
1132{
a3cf988f 1133 pmd_t old, pmd = *pmdp;
5b7abeae 1134
ab6e3d09
NH
1135 if (pmd_present(pmd)) {
1136 /* See comment in change_huge_pmd() */
a3cf988f
KS
1137 old = pmdp_invalidate(vma, addr, pmdp);
1138 if (pmd_dirty(old))
ab6e3d09 1139 pmd = pmd_mkdirty(pmd);
a3cf988f 1140 if (pmd_young(old))
ab6e3d09
NH
1141 pmd = pmd_mkyoung(pmd);
1142
1143 pmd = pmd_wrprotect(pmd);
1144 pmd = pmd_clear_soft_dirty(pmd);
1145
1146 set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
1147 } else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
1148 pmd = pmd_swp_clear_soft_dirty(pmd);
1149 set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
1150 }
7d5b3bfa 1151}
7d5b3bfa 1152#else
7d5b3bfa
KS
1153static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
1154 unsigned long addr, pmd_t *pmdp)
1155{
1156}
1157#endif
1158
a6198797 1159static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
2165009b 1160 unsigned long end, struct mm_walk *walk)
a6198797 1161{
af9de7eb 1162 struct clear_refs_private *cp = walk->private;
5c64f52a 1163 struct vm_area_struct *vma = walk->vma;
a6198797
MM
1164 pte_t *pte, ptent;
1165 spinlock_t *ptl;
03aa577f 1166 struct folio *folio;
a6198797 1167
b6ec57f4
KS
1168 ptl = pmd_trans_huge_lock(pmd, vma);
1169 if (ptl) {
7d5b3bfa
KS
1170 if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
1171 clear_soft_dirty_pmd(vma, addr, pmd);
1172 goto out;
1173 }
1174
84c3fc4e
ZY
1175 if (!pmd_present(*pmd))
1176 goto out;
1177
03aa577f 1178 folio = pmd_folio(*pmd);
7d5b3bfa
KS
1179
1180 /* Clear accessed and referenced bits. */
1181 pmdp_test_and_clear_young(vma, addr, pmd);
03aa577f
MWO
1182 folio_test_clear_young(folio);
1183 folio_clear_referenced(folio);
7d5b3bfa
KS
1184out:
1185 spin_unlock(ptl);
1186 return 0;
1187 }
1188
a6198797 1189 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
7780d040
HD
1190 if (!pte) {
1191 walk->action = ACTION_AGAIN;
1192 return 0;
1193 }
a6198797 1194 for (; addr != end; pte++, addr += PAGE_SIZE) {
c33c7948 1195 ptent = ptep_get(pte);
a6198797 1196
0f8975ec
PE
1197 if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
1198 clear_soft_dirty(vma, addr, pte);
1199 continue;
1200 }
1201
179ef71c
CG
1202 if (!pte_present(ptent))
1203 continue;
1204
03aa577f
MWO
1205 folio = vm_normal_folio(vma, addr, ptent);
1206 if (!folio)
a6198797
MM
1207 continue;
1208
1209 /* Clear accessed and referenced bits. */
1210 ptep_test_and_clear_young(vma, addr, pte);
03aa577f
MWO
1211 folio_test_clear_young(folio);
1212 folio_clear_referenced(folio);
a6198797
MM
1213 }
1214 pte_unmap_unlock(pte - 1, ptl);
1215 cond_resched();
1216 return 0;
1217}
1218
5c64f52a
NH
1219static int clear_refs_test_walk(unsigned long start, unsigned long end,
1220 struct mm_walk *walk)
1221{
1222 struct clear_refs_private *cp = walk->private;
1223 struct vm_area_struct *vma = walk->vma;
1224
48684a65
NH
1225 if (vma->vm_flags & VM_PFNMAP)
1226 return 1;
1227
5c64f52a
NH
1228 /*
1229 * Writing 1 to /proc/pid/clear_refs affects all pages.
1230 * Writing 2 to /proc/pid/clear_refs only affects anonymous pages.
1231 * Writing 3 to /proc/pid/clear_refs only affects file mapped pages.
1232 * Writing 4 to /proc/pid/clear_refs affects all pages.
1233 */
1234 if (cp->type == CLEAR_REFS_ANON && vma->vm_file)
1235 return 1;
1236 if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file)
1237 return 1;
1238 return 0;
1239}
1240
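The accepted values are enumerated in the comment above and parsed by clear_refs_write() below. A minimal user-space caller that clears the referenced state for all pages of the current process (illustrative, not part of this file):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* "1" == CLEAR_REFS_ALL: clear accessed/referenced bits everywhere. */
	int fd = open("/proc/self/clear_refs", O_WRONLY);

	if (fd < 0)
		return 1;
	if (write(fd, "1", 1) != 1) {
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}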
7b86ac33
CH
1241static const struct mm_walk_ops clear_refs_walk_ops = {
1242 .pmd_entry = clear_refs_pte_range,
1243 .test_walk = clear_refs_test_walk,
49b06385 1244 .walk_lock = PGWALK_WRLOCK,
7b86ac33
CH
1245};
1246
f248dcb3
MM
1247static ssize_t clear_refs_write(struct file *file, const char __user *buf,
1248 size_t count, loff_t *ppos)
b813e931 1249{
f248dcb3 1250 struct task_struct *task;
860a2e7f 1251 char buffer[PROC_NUMBUF] = {};
f248dcb3 1252 struct mm_struct *mm;
b813e931 1253 struct vm_area_struct *vma;
040fa020
PE
1254 enum clear_refs_types type;
1255 int itype;
0a8cb8e3 1256 int rv;
b813e931 1257
f248dcb3
MM
1258 if (count > sizeof(buffer) - 1)
1259 count = sizeof(buffer) - 1;
1260 if (copy_from_user(buffer, buf, count))
1261 return -EFAULT;
040fa020 1262 rv = kstrtoint(strstrip(buffer), 10, &itype);
0a8cb8e3
AD
1263 if (rv < 0)
1264 return rv;
040fa020
PE
1265 type = (enum clear_refs_types)itype;
1266 if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
f248dcb3 1267 return -EINVAL;
541c237c 1268
496ad9aa 1269 task = get_proc_task(file_inode(file));
f248dcb3
MM
1270 if (!task)
1271 return -ESRCH;
1272 mm = get_task_mm(task);
1273 if (mm) {
250cb40f 1274 VMA_ITERATOR(vmi, mm, 0);
ac46d4f3 1275 struct mmu_notifier_range range;
af9de7eb 1276 struct clear_refs_private cp = {
0f8975ec 1277 .type = type,
af9de7eb 1278 };
695f0559 1279
29a951df
LT
1280 if (mmap_write_lock_killable(mm)) {
1281 count = -EINTR;
1282 goto out_mm;
1283 }
695f0559
PC
1284 if (type == CLEAR_REFS_MM_HIWATER_RSS) {
1285 /*
1286 * Writing 5 to /proc/pid/clear_refs resets the peak
1287 * resident set size to this mm's current rss value.
1288 */
695f0559 1289 reset_mm_hiwater_rss(mm);
29a951df 1290 goto out_unlock;
695f0559
PC
1291 }
1292
64e45507 1293 if (type == CLEAR_REFS_SOFT_DIRTY) {
250cb40f 1294 for_each_vma(vmi, vma) {
64e45507
PF
1295 if (!(vma->vm_flags & VM_SOFTDIRTY))
1296 continue;
1c71222e 1297 vm_flags_clear(vma, VM_SOFTDIRTY);
29a951df 1298 vma_set_page_prot(vma);
64e45507 1299 }
ac46d4f3 1300
912efa17 1301 inc_tlb_flush_pending(mm);
7269f999 1302 mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
7d4a8be0 1303 0, mm, 0, -1UL);
ac46d4f3 1304 mmu_notifier_invalidate_range_start(&range);
64e45507 1305 }
c4c84f06 1306 walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp);
912efa17 1307 if (type == CLEAR_REFS_SOFT_DIRTY) {
ac46d4f3 1308 mmu_notifier_invalidate_range_end(&range);
912efa17
WD
1309 flush_tlb_mm(mm);
1310 dec_tlb_flush_pending(mm);
1311 }
29a951df
LT
1312out_unlock:
1313 mmap_write_unlock(mm);
695f0559 1314out_mm:
f248dcb3
MM
1315 mmput(mm);
1316 }
1317 put_task_struct(task);
fb92a4b0
VL
1318
1319 return count;
b813e931
DR
1320}
1321
f248dcb3
MM
1322const struct file_operations proc_clear_refs_operations = {
1323 .write = clear_refs_write,
6038f373 1324 .llseek = noop_llseek,
f248dcb3
MM
1325};
1326
092b50ba
NH
1327typedef struct {
1328 u64 pme;
1329} pagemap_entry_t;
1330
85863e47 1331struct pagemapread {
8c829622 1332 int pos, len; /* units: PM_ENTRY_BYTES, not bytes */
092b50ba 1333 pagemap_entry_t *buffer;
1c90308e 1334 bool show_pfn;
85863e47
MM
1335};
1336
5aaabe83
NH
1337#define PAGEMAP_WALK_SIZE (PMD_SIZE)
1338#define PAGEMAP_WALK_MASK (PMD_MASK)
1339
deb94544
KK
1340#define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
1341#define PM_PFRAME_BITS 55
1342#define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
1343#define PM_SOFT_DIRTY BIT_ULL(55)
77bb499b 1344#define PM_MMAP_EXCLUSIVE BIT_ULL(56)
fb8e37f3 1345#define PM_UFFD_WP BIT_ULL(57)
deb94544
KK
1346#define PM_FILE BIT_ULL(61)
1347#define PM_SWAP BIT_ULL(62)
1348#define PM_PRESENT BIT_ULL(63)
1349
85863e47
MM
1350#define PM_END_OF_BUFFER 1
1351
deb94544 1352static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
092b50ba 1353{
deb94544 1354 return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
092b50ba
NH
1355}
1356
cabbb6d5 1357static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm)
85863e47 1358{
092b50ba 1359 pm->buffer[pm->pos++] = *pme;
d82ef020 1360 if (pm->pos >= pm->len)
aae8679b 1361 return PM_END_OF_BUFFER;
85863e47
MM
1362 return 0;
1363}
1364
1365static int pagemap_pte_hole(unsigned long start, unsigned long end,
b7a16c7a 1366 __always_unused int depth, struct mm_walk *walk)
85863e47 1367{
2165009b 1368 struct pagemapread *pm = walk->private;
68b5a652 1369 unsigned long addr = start;
85863e47 1370 int err = 0;
092b50ba 1371
68b5a652
PF
1372 while (addr < end) {
1373 struct vm_area_struct *vma = find_vma(walk->mm, addr);
deb94544 1374 pagemap_entry_t pme = make_pme(0, 0);
87e6d49a
PF
1375 /* End of address space hole, which we mark as non-present. */
1376 unsigned long hole_end;
68b5a652 1377
87e6d49a
PF
1378 if (vma)
1379 hole_end = min(end, vma->vm_start);
1380 else
1381 hole_end = end;
1382
1383 for (; addr < hole_end; addr += PAGE_SIZE) {
cabbb6d5 1384 err = add_to_pagemap(&pme, pm);
87e6d49a
PF
1385 if (err)
1386 goto out;
68b5a652
PF
1387 }
1388
87e6d49a
PF
1389 if (!vma)
1390 break;
1391
1392 /* Addresses in the VMA. */
1393 if (vma->vm_flags & VM_SOFTDIRTY)
deb94544 1394 pme = make_pme(0, PM_SOFT_DIRTY);
87e6d49a 1395 for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
cabbb6d5 1396 err = add_to_pagemap(&pme, pm);
68b5a652
PF
1397 if (err)
1398 goto out;
1399 }
85863e47 1400 }
68b5a652 1401out:
85863e47
MM
1402 return err;
1403}
1404
deb94544 1405static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
052fb0d6 1406 struct vm_area_struct *vma, unsigned long addr, pte_t pte)
85863e47 1407{
deb94544 1408 u64 frame = 0, flags = 0;
052fb0d6 1409 struct page *page = NULL;
24d7275c 1410 bool migration = false;
85863e47 1411
052fb0d6 1412 if (pte_present(pte)) {
1c90308e
KK
1413 if (pm->show_pfn)
1414 frame = pte_pfn(pte);
deb94544 1415 flags |= PM_PRESENT;
25b2995a 1416 page = vm_normal_page(vma, addr, pte);
e9cdd6e7 1417 if (pte_soft_dirty(pte))
deb94544 1418 flags |= PM_SOFT_DIRTY;
fb8e37f3
PX
1419 if (pte_uffd_wp(pte))
1420 flags |= PM_UFFD_WP;
052fb0d6 1421 } else if (is_swap_pte(pte)) {
179ef71c
CG
1422 swp_entry_t entry;
1423 if (pte_swp_soft_dirty(pte))
deb94544 1424 flags |= PM_SOFT_DIRTY;
fb8e37f3
PX
1425 if (pte_swp_uffd_wp(pte))
1426 flags |= PM_UFFD_WP;
179ef71c 1427 entry = pte_to_swp_entry(pte);
0d206b5d
PX
1428 if (pm->show_pfn) {
1429 pgoff_t offset;
1430 /*
1431 * For PFN swap entries, keep the offset field
1432 * as the bare PFN, to stay compatible with old smaps.
1433 */
1434 if (is_pfn_swap_entry(entry))
1435 offset = swp_offset_pfn(entry);
1436 else
1437 offset = swp_offset(entry);
ab6ecf24 1438 frame = swp_type(entry) |
0d206b5d
PX
1439 (offset << MAX_SWAPFILES_SHIFT);
1440 }
deb94544 1441 flags |= PM_SWAP;
24d7275c 1442 migration = is_migration_entry(entry);
af5cdaf8
AP
1443 if (is_pfn_swap_entry(entry))
1444 page = pfn_swap_entry_to_page(entry);
8e165e73
PX
1445 if (pte_marker_entry_uffd_wp(entry))
1446 flags |= PM_UFFD_WP;
052fb0d6
KK
1447 }
1448
1449 if (page && !PageAnon(page))
1450 flags |= PM_FILE;
24d7275c 1451 if (page && !migration && page_mapcount(page) == 1)
77bb499b 1452 flags |= PM_MMAP_EXCLUSIVE;
deb94544
KK
1453 if (vma->vm_flags & VM_SOFTDIRTY)
1454 flags |= PM_SOFT_DIRTY;
052fb0d6 1455
deb94544 1456 return make_pme(frame, flags);
bcf8039e
DH
1457}
1458
356515e7 1459static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
2165009b 1460 struct mm_walk *walk)
85863e47 1461{
f995ece2 1462 struct vm_area_struct *vma = walk->vma;
2165009b 1463 struct pagemapread *pm = walk->private;
bf929152 1464 spinlock_t *ptl;
05fbf357 1465 pte_t *pte, *orig_pte;
85863e47 1466 int err = 0;
356515e7 1467#ifdef CONFIG_TRANSPARENT_HUGEPAGE
24d7275c
YS
1468 bool migration = false;
1469
b6ec57f4
KS
1470 ptl = pmd_trans_huge_lock(pmdp, vma);
1471 if (ptl) {
356515e7
KK
1472 u64 flags = 0, frame = 0;
1473 pmd_t pmd = *pmdp;
84c3fc4e 1474 struct page *page = NULL;
0f8975ec 1475
b83d7e43 1476 if (vma->vm_flags & VM_SOFTDIRTY)
deb94544 1477 flags |= PM_SOFT_DIRTY;
d9104d1c 1478
356515e7 1479 if (pmd_present(pmd)) {
84c3fc4e 1480 page = pmd_page(pmd);
77bb499b 1481
356515e7 1482 flags |= PM_PRESENT;
b83d7e43
HY
1483 if (pmd_soft_dirty(pmd))
1484 flags |= PM_SOFT_DIRTY;
fb8e37f3
PX
1485 if (pmd_uffd_wp(pmd))
1486 flags |= PM_UFFD_WP;
1c90308e
KK
1487 if (pm->show_pfn)
1488 frame = pmd_pfn(pmd) +
1489 ((addr & ~PMD_MASK) >> PAGE_SHIFT);
356515e7 1490 }
84c3fc4e
ZY
1491#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
1492 else if (is_swap_pmd(pmd)) {
1493 swp_entry_t entry = pmd_to_swp_entry(pmd);
ab6ecf24 1494 unsigned long offset;
84c3fc4e 1495
ab6ecf24 1496 if (pm->show_pfn) {
0d206b5d
PX
1497 if (is_pfn_swap_entry(entry))
1498 offset = swp_offset_pfn(entry);
1499 else
1500 offset = swp_offset(entry);
1501 offset = offset +
ab6ecf24
HY
1502 ((addr & ~PMD_MASK) >> PAGE_SHIFT);
1503 frame = swp_type(entry) |
1504 (offset << MAX_SWAPFILES_SHIFT);
1505 }
84c3fc4e 1506 flags |= PM_SWAP;
b83d7e43
HY
1507 if (pmd_swp_soft_dirty(pmd))
1508 flags |= PM_SOFT_DIRTY;
fb8e37f3
PX
1509 if (pmd_swp_uffd_wp(pmd))
1510 flags |= PM_UFFD_WP;
84c3fc4e 1511 VM_BUG_ON(!is_pmd_migration_entry(pmd));
24d7275c 1512 migration = is_migration_entry(entry);
af5cdaf8 1513 page = pfn_swap_entry_to_page(entry);
84c3fc4e
ZY
1514 }
1515#endif
1516
24d7275c 1517 if (page && !migration && page_mapcount(page) == 1)
84c3fc4e 1518 flags |= PM_MMAP_EXCLUSIVE;
356515e7 1519
025c5b24 1520 for (; addr != end; addr += PAGE_SIZE) {
356515e7 1521 pagemap_entry_t pme = make_pme(frame, flags);
025c5b24 1522
cabbb6d5 1523 err = add_to_pagemap(&pme, pm);
025c5b24
NH
1524 if (err)
1525 break;
ab6ecf24
HY
1526 if (pm->show_pfn) {
1527 if (flags & PM_PRESENT)
1528 frame++;
1529 else if (flags & PM_SWAP)
1530 frame += (1 << MAX_SWAPFILES_SHIFT);
1531 }
5aaabe83 1532 }
bf929152 1533 spin_unlock(ptl);
025c5b24 1534 return err;
5aaabe83 1535 }
356515e7 1536#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
81d0fa62 1537
f995ece2
NH
1538 /*
1539 * We can assume that @vma always points to a valid VMA and @end never
1540 * goes beyond vma->vm_end.
1541 */
356515e7 1542 orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
7780d040
HD
1543 if (!pte) {
1544 walk->action = ACTION_AGAIN;
1545 return err;
1546 }
f995ece2
NH
1547 for (; addr < end; pte++, addr += PAGE_SIZE) {
1548 pagemap_entry_t pme;
05fbf357 1549
c33c7948 1550 pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte));
cabbb6d5 1551 err = add_to_pagemap(&pme, pm);
05fbf357 1552 if (err)
81d0fa62 1553 break;
85863e47 1554 }
f995ece2 1555 pte_unmap_unlock(orig_pte, ptl);
85863e47
MM
1556
1557 cond_resched();
1558
1559 return err;
1560}
1561
1a5cb814 1562#ifdef CONFIG_HUGETLB_PAGE
116354d1 1563/* This function walks within one hugetlb entry in the single call */
356515e7 1564static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
116354d1
NH
1565 unsigned long addr, unsigned long end,
1566 struct mm_walk *walk)
5dc37642 1567{
5dc37642 1568 struct pagemapread *pm = walk->private;
f995ece2 1569 struct vm_area_struct *vma = walk->vma;
356515e7 1570 u64 flags = 0, frame = 0;
5dc37642 1571 int err = 0;
356515e7 1572 pte_t pte;
5dc37642 1573
f995ece2 1574 if (vma->vm_flags & VM_SOFTDIRTY)
deb94544 1575 flags |= PM_SOFT_DIRTY;
d9104d1c 1576
356515e7
KK
1577 pte = huge_ptep_get(ptep);
1578 if (pte_present(pte)) {
1579 struct page *page = pte_page(pte);
1580
1581 if (!PageAnon(page))
1582 flags |= PM_FILE;
1583
77bb499b
KK
1584 if (page_mapcount(page) == 1)
1585 flags |= PM_MMAP_EXCLUSIVE;
1586
8e165e73
PX
1587 if (huge_pte_uffd_wp(pte))
1588 flags |= PM_UFFD_WP;
1589
356515e7 1590 flags |= PM_PRESENT;
1c90308e
KK
1591 if (pm->show_pfn)
1592 frame = pte_pfn(pte) +
1593 ((addr & ~hmask) >> PAGE_SHIFT);
8e165e73
PX
1594 } else if (pte_swp_uffd_wp_any(pte)) {
1595 flags |= PM_UFFD_WP;
356515e7
KK
1596 }
1597
5dc37642 1598 for (; addr != end; addr += PAGE_SIZE) {
356515e7
KK
1599 pagemap_entry_t pme = make_pme(frame, flags);
1600
cabbb6d5 1601 err = add_to_pagemap(&pme, pm);
5dc37642
NH
1602 if (err)
1603 return err;
1c90308e 1604 if (pm->show_pfn && (flags & PM_PRESENT))
356515e7 1605 frame++;
5dc37642
NH
1606 }
1607
1608 cond_resched();
1609
1610 return err;
1611}
7b86ac33
CH
1612#else
1613#define pagemap_hugetlb_range NULL
1a5cb814 1614#endif /* HUGETLB_PAGE */
5dc37642 1615
7b86ac33
CH
1616static const struct mm_walk_ops pagemap_ops = {
1617 .pmd_entry = pagemap_pmd_range,
1618 .pte_hole = pagemap_pte_hole,
1619 .hugetlb_entry = pagemap_hugetlb_range,
49b06385 1620 .walk_lock = PGWALK_RDLOCK,
7b86ac33
CH
1621};
1622
85863e47
MM
1623/*
1624 * /proc/pid/pagemap - an array mapping virtual pages to pfns
1625 *
f16278c6
HR
1626 * For each page in the address space, this file contains one 64-bit entry
1627 * consisting of the following:
1628 *
052fb0d6 1629 * Bits 0-54 page frame number (PFN) if present
f16278c6 1630 * Bits 0-4 swap type if swapped
052fb0d6 1631 * Bits 5-54 swap offset if swapped
1ad1335d 1632 * Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst)
77bb499b 1633 * Bit 56 page exclusively mapped
dd21bfa4
YZ
1634 * Bit 57 pte is uffd-wp write-protected
1635 * Bits 58-60 zero
052fb0d6 1636 * Bit 61 page is file-page or shared-anon
f16278c6
HR
1637 * Bit 62 page swapped
1638 * Bit 63 page present
1639 *
1640 * If the page is not present but in swap, then the PFN contains an
1641 * encoding of the swap file number and the page's offset into the
1642 * swap. Unmapped pages return a null PFN. This allows determining
85863e47
MM
1643 * precisely which pages are mapped (or in swap) and comparing mapped
1644 * pages between processes.
1645 *
1646 * Efficient users of this interface will use /proc/pid/maps to
1647 * determine which areas of memory are actually mapped and llseek to
1648 * skip over unmapped regions.
1649 */
1650static ssize_t pagemap_read(struct file *file, char __user *buf,
1651 size_t count, loff_t *ppos)
1652{
a06db751 1653 struct mm_struct *mm = file->private_data;
85863e47 1654 struct pagemapread pm;
5d7e0d2b
AM
1655 unsigned long src;
1656 unsigned long svpfn;
1657 unsigned long start_vaddr;
1658 unsigned long end_vaddr;
a06db751 1659 int ret = 0, copied = 0;
85863e47 1660
388f7934 1661 if (!mm || !mmget_not_zero(mm))
85863e47
MM
1662 goto out;
1663
85863e47
MM
1664 ret = -EINVAL;
1665 /* file position must be aligned */
aae8679b 1666 if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
a06db751 1667 goto out_mm;
85863e47
MM
1668
1669 ret = 0;
08161786 1670 if (!count)
a06db751 1671 goto out_mm;
08161786 1672
1c90308e
KK
1673 /* do not disclose physical addresses: attack vector */
1674 pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
1675
8c829622 1676 pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
6da2ec56 1677 pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL);
5d7e0d2b 1678 ret = -ENOMEM;
d82ef020 1679 if (!pm.buffer)
a06db751 1680 goto out_mm;
85863e47 1681
5d7e0d2b
AM
1682 src = *ppos;
1683 svpfn = src / PM_ENTRY_BYTES;
a06db751 1684 end_vaddr = mm->task_size;
5d7e0d2b
AM
1685
1686 /* watch out for wraparound */
40d6366e 1687 start_vaddr = end_vaddr;
428e106a 1688 if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) {
7bab8dfb
YX
1689 unsigned long end;
1690
428e106a
KS
1691 ret = mmap_read_lock_killable(mm);
1692 if (ret)
1693 goto out_free;
1694 start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT);
1695 mmap_read_unlock(mm);
7bab8dfb
YX
1696
1697 end = start_vaddr + ((count / PM_ENTRY_BYTES) << PAGE_SHIFT);
1698 if (end >= start_vaddr && end < mm->task_size)
1699 end_vaddr = end;
428e106a 1700 }
40d6366e
MC
1701
1702 /* Ensure the address is inside the task */
1703 if (start_vaddr > mm->task_size)
5d7e0d2b
AM
1704 start_vaddr = end_vaddr;
1705
d82ef020
KH
1706 ret = 0;
1707 while (count && (start_vaddr < end_vaddr)) {
1708 int len;
1709 unsigned long end;
1710
1711 pm.pos = 0;
ea251c1d 1712 end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
d82ef020
KH
1713 /* overflow ? */
1714 if (end < start_vaddr || end > end_vaddr)
1715 end = end_vaddr;
d8ed45c5 1716 ret = mmap_read_lock_killable(mm);
ad80b932
KK
1717 if (ret)
1718 goto out_free;
7b86ac33 1719 ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm);
d8ed45c5 1720 mmap_read_unlock(mm);
d82ef020
KH
1721 start_vaddr = end;
1722
1723 len = min(count, PM_ENTRY_BYTES * pm.pos);
309361e0 1724 if (copy_to_user(buf, pm.buffer, len)) {
d82ef020 1725 ret = -EFAULT;
a06db751 1726 goto out_free;
d82ef020
KH
1727 }
1728 copied += len;
1729 buf += len;
1730 count -= len;
85863e47 1731 }
d82ef020
KH
1732 *ppos += copied;
1733 if (!ret || ret == PM_END_OF_BUFFER)
1734 ret = copied;
1735
98bc93e5
KM
1736out_free:
1737 kfree(pm.buffer);
a06db751
KK
1738out_mm:
1739 mmput(mm);
85863e47
MM
1740out:
1741 return ret;
1742}
1743
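The bit layout documented above is what user space reads back from this file. A minimal user-space decoder for a single entry of the calling process, assuming 4 KiB pages (illustrative, not part of this file; without CAP_SYS_ADMIN the PFN field reads as zero, as enforced by pm.show_pfn above):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	static char probe[4096];
	uint64_t ent;
	int fd;

	probe[0] = 1;	/* fault the page in so the "present" bit is set */
	fd = open("/proc/self/pagemap", O_RDONLY);
	if (fd < 0)
		return 1;
	/* one 64-bit entry per virtual page */
	if (pread(fd, &ent, sizeof(ent),
		  ((uintptr_t)probe / 4096) * 8) != sizeof(ent)) {
		close(fd);
		return 1;
	}
	printf("present=%llu swap=%llu file/shared-anon=%llu soft-dirty=%llu pfn=0x%llx\n",
	       (unsigned long long)(ent >> 63 & 1),
	       (unsigned long long)(ent >> 62 & 1),
	       (unsigned long long)(ent >> 61 & 1),
	       (unsigned long long)(ent >> 55 & 1),
	       (unsigned long long)(ent & ((1ULL << 55) - 1)));
	close(fd);
	return 0;
}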
541c237c
PE
1744static int pagemap_open(struct inode *inode, struct file *file)
1745{
a06db751
KK
1746 struct mm_struct *mm;
1747
a06db751
KK
1748 mm = proc_mem_open(inode, PTRACE_MODE_READ);
1749 if (IS_ERR(mm))
1750 return PTR_ERR(mm);
1751 file->private_data = mm;
1752 return 0;
1753}
1754
1755static int pagemap_release(struct inode *inode, struct file *file)
1756{
1757 struct mm_struct *mm = file->private_data;
1758
1759 if (mm)
1760 mmdrop(mm);
541c237c
PE
1761 return 0;
1762}
1763
52526ca7
MUA
1764#define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \
1765 PAGE_IS_FILE | PAGE_IS_PRESENT | \
1766 PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \
e6a9a2cb 1767 PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY)
52526ca7
MUA
1768#define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC)
1769
1770struct pagemap_scan_private {
1771 struct pm_scan_arg arg;
1772 unsigned long masks_of_interest, cur_vma_category;
1773 struct page_region *vec_buf;
1774 unsigned long vec_buf_len, vec_buf_index, found_pages;
1775 struct page_region __user *vec_out;
1776};
1777
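/*
 * Classify one PTE into PAGE_IS_* category bits for PAGEMAP_SCAN:
 * present vs. swapped, written (not userfaultfd write-protected),
 * file-backed, zero-PFN and soft-dirty.  The PAGE_IS_FILE check is
 * only done when the caller asked for it, as it needs a page lookup.
 */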
1778static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
1779 struct vm_area_struct *vma,
1780 unsigned long addr, pte_t pte)
1781{
1782 unsigned long categories = 0;
1783
1784 if (pte_present(pte)) {
1785 struct page *page;
1786
1787 categories |= PAGE_IS_PRESENT;
1788 if (!pte_uffd_wp(pte))
1789 categories |= PAGE_IS_WRITTEN;
1790
1791 if (p->masks_of_interest & PAGE_IS_FILE) {
1792 page = vm_normal_page(vma, addr, pte);
1793 if (page && !PageAnon(page))
1794 categories |= PAGE_IS_FILE;
1795 }
1796
1797 if (is_zero_pfn(pte_pfn(pte)))
1798 categories |= PAGE_IS_PFNZERO;
e6a9a2cb
AV
1799 if (pte_soft_dirty(pte))
1800 categories |= PAGE_IS_SOFT_DIRTY;
52526ca7
MUA
1801 } else if (is_swap_pte(pte)) {
1802 swp_entry_t swp;
1803
1804 categories |= PAGE_IS_SWAPPED;
1805 if (!pte_swp_uffd_wp_any(pte))
1806 categories |= PAGE_IS_WRITTEN;
1807
1808 if (p->masks_of_interest & PAGE_IS_FILE) {
1809 swp = pte_to_swp_entry(pte);
1810 if (is_pfn_swap_entry(swp) &&
71014224 1811 !folio_test_anon(pfn_swap_entry_folio(swp)))
52526ca7
MUA
1812 categories |= PAGE_IS_FILE;
1813 }
e6a9a2cb
AV
1814 if (pte_swp_soft_dirty(pte))
1815 categories |= PAGE_IS_SOFT_DIRTY;
52526ca7
MUA
1816 }
1817
1818 return categories;
1819}
1820
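/*
 * Write-protect one PTE for userfaultfd-wp: mark a present PTE uffd-wp,
 * carry the bit on a swap PTE, or install a PTE_MARKER_UFFD_WP marker
 * for a none PTE so that a later write still faults.
 */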
1821static void make_uffd_wp_pte(struct vm_area_struct *vma,
1822 unsigned long addr, pte_t *pte)
1823{
1824 pte_t ptent = ptep_get(pte);
1825
1826 if (pte_present(ptent)) {
1827 pte_t old_pte;
1828
1829 old_pte = ptep_modify_prot_start(vma, addr, pte);
1830 ptent = pte_mkuffd_wp(ptent);
1831 ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
1832 } else if (is_swap_pte(ptent)) {
1833 ptent = pte_swp_mkuffd_wp(ptent);
1834 set_pte_at(vma->vm_mm, addr, pte, ptent);
1835 } else {
1836 set_pte_at(vma->vm_mm, addr, pte,
1837 make_pte_marker(PTE_MARKER_UFFD_WP));
1838 }
1839}
1840
1841#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1842static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
1843 struct vm_area_struct *vma,
1844 unsigned long addr, pmd_t pmd)
1845{
1846 unsigned long categories = PAGE_IS_HUGE;
1847
1848 if (pmd_present(pmd)) {
1849 struct page *page;
1850
1851 categories |= PAGE_IS_PRESENT;
1852 if (!pmd_uffd_wp(pmd))
1853 categories |= PAGE_IS_WRITTEN;
1854
1855 if (p->masks_of_interest & PAGE_IS_FILE) {
1856 page = vm_normal_page_pmd(vma, addr, pmd);
1857 if (page && !PageAnon(page))
1858 categories |= PAGE_IS_FILE;
1859 }
1860
1861 if (is_zero_pfn(pmd_pfn(pmd)))
1862 categories |= PAGE_IS_PFNZERO;
e6a9a2cb
AV
1863 if (pmd_soft_dirty(pmd))
1864 categories |= PAGE_IS_SOFT_DIRTY;
52526ca7
MUA
1865 } else if (is_swap_pmd(pmd)) {
1866 swp_entry_t swp;
1867
1868 categories |= PAGE_IS_SWAPPED;
1869 if (!pmd_swp_uffd_wp(pmd))
1870 categories |= PAGE_IS_WRITTEN;
e6a9a2cb
AV
1871 if (pmd_swp_soft_dirty(pmd))
1872 categories |= PAGE_IS_SOFT_DIRTY;
52526ca7
MUA
1873
1874 if (p->masks_of_interest & PAGE_IS_FILE) {
1875 swp = pmd_to_swp_entry(pmd);
1876 if (is_pfn_swap_entry(swp) &&
71014224 1877 !folio_test_anon(pfn_swap_entry_folio(swp)))
52526ca7
MUA
1878 categories |= PAGE_IS_FILE;
1879 }
1880 }
1881
1882 return categories;
1883}
1884
1885static void make_uffd_wp_pmd(struct vm_area_struct *vma,
1886 unsigned long addr, pmd_t *pmdp)
1887{
1888 pmd_t old, pmd = *pmdp;
1889
1890 if (pmd_present(pmd)) {
1891 old = pmdp_invalidate_ad(vma, addr, pmdp);
1892 pmd = pmd_mkuffd_wp(old);
1893 set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
1894 } else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
1895 pmd = pmd_swp_mkuffd_wp(pmd);
1896 set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
1897 }
1898}
1899#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1900
1901#ifdef CONFIG_HUGETLB_PAGE
1902static unsigned long pagemap_hugetlb_category(pte_t pte)
1903{
1904 unsigned long categories = PAGE_IS_HUGE;
1905
1906 /*
1907	 * According to pagemap_hugetlb_range(), a file-backed HugeTLB
1908	 * page cannot be swapped out, so PAGE_IS_FILE is not checked
1909	 * for swapped pages.
1910 */
1911 if (pte_present(pte)) {
1912 categories |= PAGE_IS_PRESENT;
1913 if (!huge_pte_uffd_wp(pte))
1914 categories |= PAGE_IS_WRITTEN;
1915 if (!PageAnon(pte_page(pte)))
1916 categories |= PAGE_IS_FILE;
1917 if (is_zero_pfn(pte_pfn(pte)))
1918 categories |= PAGE_IS_PFNZERO;
e6a9a2cb
AV
1919 if (pte_soft_dirty(pte))
1920 categories |= PAGE_IS_SOFT_DIRTY;
52526ca7
MUA
1921 } else if (is_swap_pte(pte)) {
1922 categories |= PAGE_IS_SWAPPED;
1923 if (!pte_swp_uffd_wp_any(pte))
1924 categories |= PAGE_IS_WRITTEN;
e6a9a2cb
AV
1925 if (pte_swp_soft_dirty(pte))
1926 categories |= PAGE_IS_SOFT_DIRTY;
52526ca7
MUA
1927 }
1928
1929 return categories;
1930}
1931
1932static void make_uffd_wp_huge_pte(struct vm_area_struct *vma,
1933 unsigned long addr, pte_t *ptep,
1934 pte_t ptent)
1935{
1936 unsigned long psize;
1937
1938 if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent))
1939 return;
1940
1941 psize = huge_page_size(hstate_vma(vma));
1942
1943 if (is_hugetlb_entry_migration(ptent))
1944 set_huge_pte_at(vma->vm_mm, addr, ptep,
1945 pte_swp_mkuffd_wp(ptent), psize);
1946 else if (!huge_pte_none(ptent))
1947 huge_ptep_modify_prot_commit(vma, addr, ptep, ptent,
1948 huge_pte_mkuffd_wp(ptent));
1949 else
1950 set_huge_pte_at(vma->vm_mm, addr, ptep,
1951 make_pte_marker(PTE_MARKER_UFFD_WP), psize);
1952}
1953#endif /* CONFIG_HUGETLB_PAGE */
1954
1955#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
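/*
 * Undo the most recent pagemap_scan_output() for [addr, end): trim (or
 * drop) the last vec_buf entry and give the pages back to found_pages.
 * Used when a THP/HugeTLB mapping cannot be handled as a whole.
 */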
1956static void pagemap_scan_backout_range(struct pagemap_scan_private *p,
1957 unsigned long addr, unsigned long end)
1958{
1959 struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
1960
1961 if (cur_buf->start != addr)
1962 cur_buf->end = addr;
1963 else
1964 cur_buf->start = cur_buf->end = 0;
1965
1966 p->found_pages -= (end - addr) / PAGE_SIZE;
1967}
1968#endif
1969
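/*
 * A page is reported only if, after XORing with category_inverted, all
 * bits of category_mask are set and (when category_anyof_mask is
 * non-empty) at least one of its bits is set.
 */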
1970static bool pagemap_scan_is_interesting_page(unsigned long categories,
1971 const struct pagemap_scan_private *p)
1972{
1973 categories ^= p->arg.category_inverted;
1974 if ((categories & p->arg.category_mask) != p->arg.category_mask)
1975 return false;
1976 if (p->arg.category_anyof_mask && !(categories & p->arg.category_anyof_mask))
1977 return false;
1978
1979 return true;
1980}
1981
1982static bool pagemap_scan_is_interesting_vma(unsigned long categories,
1983 const struct pagemap_scan_private *p)
1984{
1985 unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED;
1986
1987 categories ^= p->arg.category_inverted;
1988 if ((categories & required) != required)
1989 return false;
1990
1991 return true;
1992}
1993
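/*
 * Per-vma filter deciding whether this vma is walked at all.  When the
 * vma cannot be async write-protected, the scan either fails (-EPERM,
 * PM_SCAN_CHECK_WPASYNC), skips the vma (PM_SCAN_WP_MATCHING), or simply
 * scans it without PAGE_IS_WPALLOWED.  PFN mappings are always skipped.
 */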
1994static int pagemap_scan_test_walk(unsigned long start, unsigned long end,
1995 struct mm_walk *walk)
1996{
1997 struct pagemap_scan_private *p = walk->private;
1998 struct vm_area_struct *vma = walk->vma;
1999 unsigned long vma_category = 0;
0dff1b40
PX
2000 bool wp_allowed = userfaultfd_wp_async(vma) &&
2001 userfaultfd_wp_use_markers(vma);
52526ca7 2002
0dff1b40
PX
2003 if (!wp_allowed) {
2004		/* User requested an explicit failure when wp-async is unsupported */
2005 if (p->arg.flags & PM_SCAN_CHECK_WPASYNC)
2006 return -EPERM;
2007 /*
2008		 * The user requires wr-protect but allows unsupported vmas
2009		 * to be silently skipped.
2010 */
2011 if (p->arg.flags & PM_SCAN_WP_MATCHING)
2012 return 1;
2013 /*
2014		 * Otherwise the request doesn't involve wr-protects at all;
2015		 * fall through to the remaining checks and allow the vma walk.
2016 */
2017 }
52526ca7
MUA
2018
2019 if (vma->vm_flags & VM_PFNMAP)
2020 return 1;
2021
0dff1b40
PX
2022 if (wp_allowed)
2023 vma_category |= PAGE_IS_WPALLOWED;
2024
e6a9a2cb
AV
2025 if (vma->vm_flags & VM_SOFTDIRTY)
2026 vma_category |= PAGE_IS_SOFT_DIRTY;
2027
52526ca7
MUA
2028 if (!pagemap_scan_is_interesting_vma(vma_category, p))
2029 return 1;
2030
2031 p->cur_vma_category = vma_category;
2032
2033 return 0;
2034}
2035
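/*
 * Try to add [addr, end) with the given categories to the bounce buffer,
 * merging with the previous range when it is adjacent and has identical
 * categories.  Returns false when the buffer is full.
 */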
2036static bool pagemap_scan_push_range(unsigned long categories,
2037 struct pagemap_scan_private *p,
2038 unsigned long addr, unsigned long end)
2039{
2040 struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
2041
2042 /*
2043	 * When no output buffer is provided at all, the sentinel values
2044	 * won't match here: `cur_buf->end` can only be non-zero when the
2045	 * current entry is non-empty.
2046 */
2047 if (addr == cur_buf->end && categories == cur_buf->categories) {
2048 cur_buf->end = end;
2049 return true;
2050 }
2051
2052 if (cur_buf->end) {
2053 if (p->vec_buf_index >= p->vec_buf_len - 1)
2054 return false;
2055
2056 cur_buf = &p->vec_buf[++p->vec_buf_index];
2057 }
2058
2059 cur_buf->start = addr;
2060 cur_buf->end = end;
2061 cur_buf->categories = categories;
2062
2063 return true;
2064}
2065
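/*
 * Record a matching range, clamping it so that no more than
 * p->arg.max_pages pages are reported in total.  Returns -ENOSPC (and
 * sets walk_end) when either the page quota or the output buffer is
 * exhausted, which makes the walk stop early.
 */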
2066static int pagemap_scan_output(unsigned long categories,
2067 struct pagemap_scan_private *p,
2068 unsigned long addr, unsigned long *end)
2069{
2070 unsigned long n_pages, total_pages;
2071 int ret = 0;
2072
2073 if (!p->vec_buf)
2074 return 0;
2075
2076 categories &= p->arg.return_mask;
2077
2078 n_pages = (*end - addr) / PAGE_SIZE;
2079 if (check_add_overflow(p->found_pages, n_pages, &total_pages) ||
2080 total_pages > p->arg.max_pages) {
2081 size_t n_too_much = total_pages - p->arg.max_pages;
2082 *end -= n_too_much * PAGE_SIZE;
2083 n_pages -= n_too_much;
2084 ret = -ENOSPC;
2085 }
2086
2087 if (!pagemap_scan_push_range(categories, p, addr, *end)) {
2088 *end = addr;
2089 n_pages = 0;
2090 ret = -ENOSPC;
2091 }
2092
2093 p->found_pages += n_pages;
2094 if (ret)
2095 p->arg.walk_end = *end;
2096
2097 return ret;
2098}
2099
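/*
 * Handle a PMD-mapped THP in one go.  Returns -ENOENT when no THP is
 * mapped here (or THP is not configured) so that the caller falls back
 * to scanning individual PTEs; a partial write-protect request splits
 * the huge PMD and also falls back.
 */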
2100static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start,
2101 unsigned long end, struct mm_walk *walk)
2102{
2103#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2104 struct pagemap_scan_private *p = walk->private;
2105 struct vm_area_struct *vma = walk->vma;
2106 unsigned long categories;
2107 spinlock_t *ptl;
2108 int ret = 0;
2109
2110 ptl = pmd_trans_huge_lock(pmd, vma);
2111 if (!ptl)
2112 return -ENOENT;
2113
2114 categories = p->cur_vma_category |
2115 pagemap_thp_category(p, vma, start, *pmd);
2116
2117 if (!pagemap_scan_is_interesting_page(categories, p))
2118 goto out_unlock;
2119
2120 ret = pagemap_scan_output(categories, p, start, &end);
2121 if (start == end)
2122 goto out_unlock;
2123
2124 if (~p->arg.flags & PM_SCAN_WP_MATCHING)
2125 goto out_unlock;
2126 if (~categories & PAGE_IS_WRITTEN)
2127 goto out_unlock;
2128
2129 /*
2130 * Break huge page into small pages if the WP operation
2131 * needs to be performed on a portion of the huge page.
2132 */
2133 if (end != start + HPAGE_SIZE) {
2134 spin_unlock(ptl);
2135 split_huge_pmd(vma, pmd, start);
2136 pagemap_scan_backout_range(p, start, end);
2137 /* Report as if there was no THP */
2138 return -ENOENT;
2139 }
2140
2141 make_uffd_wp_pmd(vma, start, pmd);
2142 flush_tlb_range(vma, start, end);
2143out_unlock:
2144 spin_unlock(ptl);
2145 return ret;
2146#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
2147 return -ENOENT;
2148#endif
2149}
2150
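/*
 * Scan (and optionally write-protect) one PMD's worth of PTEs.  Two fast
 * paths are taken when possible: exclusive write-protection without any
 * output, and the plain "which pages were written" query; otherwise the
 * generic per-PTE categorisation is used.  TLB flushing is batched over
 * the range that was actually write-protected.
 */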
2151static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
2152 unsigned long end, struct mm_walk *walk)
2153{
2154 struct pagemap_scan_private *p = walk->private;
2155 struct vm_area_struct *vma = walk->vma;
2156 unsigned long addr, flush_end = 0;
2157 pte_t *pte, *start_pte;
2158 spinlock_t *ptl;
2159 int ret;
2160
2161 arch_enter_lazy_mmu_mode();
2162
2163 ret = pagemap_scan_thp_entry(pmd, start, end, walk);
2164 if (ret != -ENOENT) {
2165 arch_leave_lazy_mmu_mode();
2166 return ret;
2167 }
2168
2169 ret = 0;
2170 start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
2171 if (!pte) {
2172 arch_leave_lazy_mmu_mode();
2173 walk->action = ACTION_AGAIN;
2174 return 0;
2175 }
2176
4980e837 2177 if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) {
12f6b01a
MUA
2178 /* Fast path for performing exclusive WP */
2179 for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
2180 if (pte_uffd_wp(ptep_get(pte)))
2181 continue;
2182 make_uffd_wp_pte(vma, addr, pte);
2183 if (!flush_end)
2184 start = addr;
2185 flush_end = addr + PAGE_SIZE;
2186 }
2187 goto flush_and_return;
2188 }
2189
2190 if (!p->arg.category_anyof_mask && !p->arg.category_inverted &&
2191 p->arg.category_mask == PAGE_IS_WRITTEN &&
2192 p->arg.return_mask == PAGE_IS_WRITTEN) {
2193 for (addr = start; addr < end; pte++, addr += PAGE_SIZE) {
2194 unsigned long next = addr + PAGE_SIZE;
2195
2196 if (pte_uffd_wp(ptep_get(pte)))
2197 continue;
2198 ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN,
2199 p, addr, &next);
2200 if (next == addr)
2201 break;
2202 if (~p->arg.flags & PM_SCAN_WP_MATCHING)
2203 continue;
2204 make_uffd_wp_pte(vma, addr, pte);
2205 if (!flush_end)
2206 start = addr;
2207 flush_end = next;
2208 }
2209 goto flush_and_return;
2210 }
2211
52526ca7
MUA
2212 for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
2213 unsigned long categories = p->cur_vma_category |
2214 pagemap_page_category(p, vma, addr, ptep_get(pte));
2215 unsigned long next = addr + PAGE_SIZE;
2216
2217 if (!pagemap_scan_is_interesting_page(categories, p))
2218 continue;
2219
2220 ret = pagemap_scan_output(categories, p, addr, &next);
2221 if (next == addr)
2222 break;
2223
2224 if (~p->arg.flags & PM_SCAN_WP_MATCHING)
2225 continue;
2226 if (~categories & PAGE_IS_WRITTEN)
2227 continue;
2228
2229 make_uffd_wp_pte(vma, addr, pte);
2230 if (!flush_end)
2231 start = addr;
2232 flush_end = next;
2233 }
2234
12f6b01a 2235flush_and_return:
52526ca7
MUA
2236 if (flush_end)
2237 flush_tlb_range(vma, start, addr);
2238
2239 pte_unmap_unlock(start_pte, ptl);
2240 arch_leave_lazy_mmu_mode();
2241
2242 cond_resched();
2243 return ret;
2244}
2245
2246#ifdef CONFIG_HUGETLB_PAGE
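/*
 * HugeTLB mappings are reported and write-protected only as whole huge
 * pages; a request covering just part of a huge page backs out the
 * recorded output and reports walk_end at the start of that page.
 */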
2247static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
2248 unsigned long start, unsigned long end,
2249 struct mm_walk *walk)
2250{
2251 struct pagemap_scan_private *p = walk->private;
2252 struct vm_area_struct *vma = walk->vma;
2253 unsigned long categories;
2254 spinlock_t *ptl;
2255 int ret = 0;
2256 pte_t pte;
2257
2258 if (~p->arg.flags & PM_SCAN_WP_MATCHING) {
2259 /* Go the short route when not write-protecting pages. */
2260
2261 pte = huge_ptep_get(ptep);
2262 categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
2263
2264 if (!pagemap_scan_is_interesting_page(categories, p))
2265 return 0;
2266
2267 return pagemap_scan_output(categories, p, start, &end);
2268 }
2269
2270 i_mmap_lock_write(vma->vm_file->f_mapping);
2271 ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep);
2272
2273 pte = huge_ptep_get(ptep);
2274 categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
2275
2276 if (!pagemap_scan_is_interesting_page(categories, p))
2277 goto out_unlock;
2278
2279 ret = pagemap_scan_output(categories, p, start, &end);
2280 if (start == end)
2281 goto out_unlock;
2282
2283 if (~categories & PAGE_IS_WRITTEN)
2284 goto out_unlock;
2285
2286 if (end != start + HPAGE_SIZE) {
2287 /* Partial HugeTLB page WP isn't possible. */
2288 pagemap_scan_backout_range(p, start, end);
2289 p->arg.walk_end = start;
2290 ret = 0;
2291 goto out_unlock;
2292 }
2293
2294 make_uffd_wp_huge_pte(vma, start, ptep, pte);
2295 flush_hugetlb_tlb_range(vma, start, end);
2296
2297out_unlock:
2298 spin_unlock(ptl);
2299 i_mmap_unlock_write(vma->vm_file->f_mapping);
2300
2301 return ret;
2302}
2303#else
2304#define pagemap_scan_hugetlb_entry NULL
2305#endif
2306
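/*
 * Holes (no page tables) inside an interesting vma are still reported
 * with the vma-level categories, and are write-protected via
 * uffd_wp_range() when requested so that future writes fault.
 */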
2307static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end,
2308 int depth, struct mm_walk *walk)
2309{
2310 struct pagemap_scan_private *p = walk->private;
2311 struct vm_area_struct *vma = walk->vma;
2312 int ret, err;
2313
2314 if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p))
2315 return 0;
2316
2317 ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end);
2318 if (addr == end)
2319 return ret;
2320
2321 if (~p->arg.flags & PM_SCAN_WP_MATCHING)
2322 return ret;
2323
2324 err = uffd_wp_range(vma, addr, end - addr, true);
2325 if (err < 0)
2326 ret = err;
2327
2328 return ret;
2329}
2330
2331static const struct mm_walk_ops pagemap_scan_ops = {
2332 .test_walk = pagemap_scan_test_walk,
2333 .pmd_entry = pagemap_scan_pmd_entry,
2334 .pte_hole = pagemap_scan_pte_hole,
2335 .hugetlb_entry = pagemap_scan_hugetlb_entry,
2336};
2337
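/*
 * Copy in and validate the PAGEMAP_SCAN argument: reject unknown flags
 * and categories, untag and range-check the user pointers, and fill in
 * defaults (page-aligned end, unlimited max_pages when 0).
 */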
2338static int pagemap_scan_get_args(struct pm_scan_arg *arg,
2339 unsigned long uarg)
2340{
2341 if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg)))
2342 return -EFAULT;
2343
2344 if (arg->size != sizeof(struct pm_scan_arg))
2345 return -EINVAL;
2346
2347 /* Validate requested features */
2348 if (arg->flags & ~PM_SCAN_FLAGS)
2349 return -EINVAL;
2350 if ((arg->category_inverted | arg->category_mask |
2351 arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES)
2352 return -EINVAL;
2353
2354 arg->start = untagged_addr((unsigned long)arg->start);
2355 arg->end = untagged_addr((unsigned long)arg->end);
2356 arg->vec = untagged_addr((unsigned long)arg->vec);
2357
2358 /* Validate memory pointers */
2359 if (!IS_ALIGNED(arg->start, PAGE_SIZE))
2360 return -EINVAL;
2361 if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start))
2362 return -EFAULT;
2363 if (!arg->vec && arg->vec_len)
2364 return -EINVAL;
2365 if (arg->vec && !access_ok((void __user *)(long)arg->vec,
2366 arg->vec_len * sizeof(struct page_region)))
2367 return -EFAULT;
2368
2369 /* Fixup default values */
2370 arg->end = ALIGN(arg->end, PAGE_SIZE);
2371 arg->walk_end = 0;
2372 if (!arg->max_pages)
2373 arg->max_pages = ULONG_MAX;
2374
2375 return 0;
2376}
2377
2378static int pagemap_scan_writeback_args(struct pm_scan_arg *arg,
2379 unsigned long uargl)
2380{
2381 struct pm_scan_arg __user *uarg = (void __user *)uargl;
2382
2383 if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end)))
2384 return -EFAULT;
2385
2386 return 0;
2387}
2388
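/*
 * Ranges found during a walk are staged in a small kernel bounce buffer
 * (at most PAGEMAP_WALK_SIZE >> PAGE_SHIFT entries, capped by the user's
 * vec_len) and copied out by pagemap_scan_flush_buffer().
 */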
2389static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p)
2390{
2391 if (!p->arg.vec_len)
2392 return 0;
2393
2394 p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT,
2395 p->arg.vec_len);
2396 p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf),
2397 GFP_KERNEL);
2398 if (!p->vec_buf)
2399 return -ENOMEM;
2400
2401 p->vec_buf->start = p->vec_buf->end = 0;
2402 p->vec_out = (struct page_region __user *)(long)p->arg.vec;
2403
2404 return 0;
2405}
2406
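/*
 * Copy the completed ranges from the bounce buffer to the user's vector.
 * Returns the number of ranges copied (consuming that much of vec_len),
 * 0 if there is nothing to flush, or -EFAULT.
 */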
2407static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p)
2408{
2409 const struct page_region *buf = p->vec_buf;
2410 long n = p->vec_buf_index;
2411
2412 if (!p->vec_buf)
2413 return 0;
2414
2415 if (buf[n].end != buf[n].start)
2416 n++;
2417
2418 if (!n)
2419 return 0;
2420
2421 if (copy_to_user(p->vec_out, buf, n * sizeof(*buf)))
2422 return -EFAULT;
2423
2424 p->arg.vec_len -= n;
2425 p->vec_out += n;
2426
2427 p->vec_buf_index = 0;
2428 p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len);
2429 p->vec_buf->start = p->vec_buf->end = 0;
2430
2431 return n;
2432}
2433
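/*
 * Top-level PAGEMAP_SCAN handler: repeatedly walk from the last stop
 * point under mmap_read_lock, flushing the bounce buffer to userspace
 * whenever it fills up (-ENOSPC).  When write-protection is requested,
 * each walk is wrapped in an MMU notifier invalidation.  Returns the
 * number of ranges written to the user's vector, or a negative error.
 *
 * Illustrative userspace call (a minimal sketch, not part of this file;
 * pagemap_fd, start_vaddr and end_vaddr are placeholders):
 *
 *	struct page_region regions[32];
 *	struct pm_scan_arg arg = {
 *		.size = sizeof(arg),
 *		.start = start_vaddr, .end = end_vaddr,
 *		.vec = (uintptr_t)regions, .vec_len = 32,
 *		.category_mask = PAGE_IS_WRITTEN,
 *		.return_mask = PAGE_IS_WRITTEN,
 *	};
 *	int n_ranges = ioctl(pagemap_fd, PAGEMAP_SCAN, &arg);
 */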
2434static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
2435{
52526ca7
MUA
2436 struct pagemap_scan_private p = {0};
2437 unsigned long walk_start;
2438 size_t n_ranges_out = 0;
2439 int ret;
2440
2441 ret = pagemap_scan_get_args(&p.arg, uarg);
2442 if (ret)
2443 return ret;
2444
2445 p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask |
2446 p.arg.return_mask;
2447 ret = pagemap_scan_init_bounce_buffer(&p);
2448 if (ret)
2449 return ret;
2450
52526ca7
MUA
2451 for (walk_start = p.arg.start; walk_start < p.arg.end;
2452 walk_start = p.arg.walk_end) {
4cccb622 2453 struct mmu_notifier_range range;
52526ca7
MUA
2454 long n_out;
2455
2456 if (fatal_signal_pending(current)) {
2457 ret = -EINTR;
2458 break;
2459 }
2460
2461 ret = mmap_read_lock_killable(mm);
2462 if (ret)
2463 break;
4cccb622
MUA
2464
2465		/* A protection change for the range is about to happen. */
2466 if (p.arg.flags & PM_SCAN_WP_MATCHING) {
2467 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
2468 mm, walk_start, p.arg.end);
2469 mmu_notifier_invalidate_range_start(&range);
2470 }
2471
52526ca7
MUA
2472 ret = walk_page_range(mm, walk_start, p.arg.end,
2473 &pagemap_scan_ops, &p);
4cccb622
MUA
2474
2475 if (p.arg.flags & PM_SCAN_WP_MATCHING)
2476 mmu_notifier_invalidate_range_end(&range);
2477
52526ca7
MUA
2478 mmap_read_unlock(mm);
2479
2480 n_out = pagemap_scan_flush_buffer(&p);
2481 if (n_out < 0)
2482 ret = n_out;
2483 else
2484 n_ranges_out += n_out;
2485
2486 if (ret != -ENOSPC)
2487 break;
2488
2489 if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages)
2490 break;
2491 }
2492
2493 /* ENOSPC signifies early stop (buffer full) from the walk. */
2494 if (!ret || ret == -ENOSPC)
2495 ret = n_ranges_out;
2496
2497 /* The walk_end isn't set when ret is zero */
2498 if (!p.arg.walk_end)
2499 p.arg.walk_end = p.arg.end;
2500 if (pagemap_scan_writeback_args(&p.arg, uarg))
2501 ret = -EFAULT;
2502
52526ca7
MUA
2503 kfree(p.vec_buf);
2504 return ret;
2505}
2506
2507static long do_pagemap_cmd(struct file *file, unsigned int cmd,
2508 unsigned long arg)
2509{
2510 struct mm_struct *mm = file->private_data;
2511
2512 switch (cmd) {
2513 case PAGEMAP_SCAN:
2514 return do_pagemap_scan(mm, arg);
2515
2516 default:
2517 return -EINVAL;
2518 }
2519}
2520
85863e47
MM
2521const struct file_operations proc_pagemap_operations = {
2522 .llseek = mem_lseek, /* borrow this */
2523 .read = pagemap_read,
541c237c 2524 .open = pagemap_open,
a06db751 2525 .release = pagemap_release,
52526ca7
MUA
2526 .unlocked_ioctl = do_pagemap_cmd,
2527 .compat_ioctl = do_pagemap_cmd,
85863e47 2528};
1e883281 2529#endif /* CONFIG_PROC_PAGE_MONITOR */
85863e47 2530
6e21c8f1 2531#ifdef CONFIG_NUMA
6e21c8f1 2532
f69ff943 2533struct numa_maps {
f69ff943
SW
2534 unsigned long pages;
2535 unsigned long anon;
2536 unsigned long active;
2537 unsigned long writeback;
2538 unsigned long mapcount_max;
2539 unsigned long dirty;
2540 unsigned long swapcache;
2541 unsigned long node[MAX_NUMNODES];
2542};
2543
5b52fc89
SW
2544struct numa_maps_private {
2545 struct proc_maps_private proc_maps;
2546 struct numa_maps md;
2547};
2548
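/*
 * Accumulate NUMA statistics for nr_pages pages of @page's folio into
 * @md: dirty/swapcache/active/writeback/anon counts, the maximum
 * mapcount seen, and a per-node page count.
 */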
eb4866d0
DH
2549static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
2550 unsigned long nr_pages)
f69ff943 2551{
f1dc623f 2552 struct folio *folio = page_folio(page);
f69ff943
SW
2553 int count = page_mapcount(page);
2554
eb4866d0 2555 md->pages += nr_pages;
f1dc623f 2556 if (pte_dirty || folio_test_dirty(folio))
eb4866d0 2557 md->dirty += nr_pages;
f69ff943 2558
f1dc623f 2559 if (folio_test_swapcache(folio))
eb4866d0 2560 md->swapcache += nr_pages;
f69ff943 2561
f1dc623f 2562 if (folio_test_active(folio) || folio_test_unevictable(folio))
eb4866d0 2563 md->active += nr_pages;
f69ff943 2564
f1dc623f 2565 if (folio_test_writeback(folio))
eb4866d0 2566 md->writeback += nr_pages;
f69ff943 2567
f1dc623f 2568 if (folio_test_anon(folio))
eb4866d0 2569 md->anon += nr_pages;
f69ff943
SW
2570
2571 if (count > md->mapcount_max)
2572 md->mapcount_max = count;
2573
f1dc623f 2574 md->node[folio_nid(folio)] += nr_pages;
f69ff943
SW
2575}
2576
3200a8aa
DH
2577static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
2578 unsigned long addr)
2579{
2580 struct page *page;
2581 int nid;
2582
2583 if (!pte_present(pte))
2584 return NULL;
2585
2586 page = vm_normal_page(vma, addr, pte);
3218f871 2587 if (!page || is_zone_device_page(page))
3200a8aa
DH
2588 return NULL;
2589
2590 if (PageReserved(page))
2591 return NULL;
2592
2593 nid = page_to_nid(page);
4ff1b2c2 2594 if (!node_isset(nid, node_states[N_MEMORY]))
3200a8aa
DH
2595 return NULL;
2596
2597 return page;
2598}
2599
28093f9f
GS
2600#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2601static struct page *can_gather_numa_stats_pmd(pmd_t pmd,
2602 struct vm_area_struct *vma,
2603 unsigned long addr)
2604{
2605 struct page *page;
2606 int nid;
2607
2608 if (!pmd_present(pmd))
2609 return NULL;
2610
2611 page = vm_normal_page_pmd(vma, addr, pmd);
2612 if (!page)
2613 return NULL;
2614
2615 if (PageReserved(page))
2616 return NULL;
2617
2618 nid = page_to_nid(page);
2619 if (!node_isset(nid, node_states[N_MEMORY]))
2620 return NULL;
2621
2622 return page;
2623}
2624#endif
2625
f69ff943
SW
2626static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
2627 unsigned long end, struct mm_walk *walk)
2628{
d85f4d6d
NH
2629 struct numa_maps *md = walk->private;
2630 struct vm_area_struct *vma = walk->vma;
f69ff943
SW
2631 spinlock_t *ptl;
2632 pte_t *orig_pte;
2633 pte_t *pte;
2634
28093f9f 2635#ifdef CONFIG_TRANSPARENT_HUGEPAGE
b6ec57f4
KS
2636 ptl = pmd_trans_huge_lock(pmd, vma);
2637 if (ptl) {
025c5b24
NH
2638 struct page *page;
2639
28093f9f 2640 page = can_gather_numa_stats_pmd(*pmd, vma, addr);
025c5b24 2641 if (page)
28093f9f 2642 gather_stats(page, md, pmd_dirty(*pmd),
025c5b24 2643 HPAGE_PMD_SIZE/PAGE_SIZE);
bf929152 2644 spin_unlock(ptl);
025c5b24 2645 return 0;
32ef4384 2646 }
28093f9f 2647#endif
f69ff943 2648 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
7780d040
HD
2649 if (!pte) {
2650 walk->action = ACTION_AGAIN;
2651 return 0;
2652 }
f69ff943 2653 do {
c33c7948
RR
2654 pte_t ptent = ptep_get(pte);
2655 struct page *page = can_gather_numa_stats(ptent, vma, addr);
f69ff943
SW
2656 if (!page)
2657 continue;
c33c7948 2658 gather_stats(page, md, pte_dirty(ptent), 1);
f69ff943
SW
2659
2660 } while (pte++, addr += PAGE_SIZE, addr != end);
2661 pte_unmap_unlock(orig_pte, ptl);
a66c0410 2662 cond_resched();
f69ff943
SW
2663 return 0;
2664}
2665#ifdef CONFIG_HUGETLB_PAGE
632fd60f 2666static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
f69ff943
SW
2667 unsigned long addr, unsigned long end, struct mm_walk *walk)
2668{
5c2ff95e 2669 pte_t huge_pte = huge_ptep_get(pte);
f69ff943
SW
2670 struct numa_maps *md;
2671 struct page *page;
2672
5c2ff95e 2673 if (!pte_present(huge_pte))
f69ff943
SW
2674 return 0;
2675
5c2ff95e 2676 page = pte_page(huge_pte);
f69ff943
SW
2677
2678 md = walk->private;
5c2ff95e 2679 gather_stats(page, md, pte_dirty(huge_pte), 1);
f69ff943
SW
2680 return 0;
2681}
2682
2683#else
632fd60f 2684static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
f69ff943
SW
2685 unsigned long addr, unsigned long end, struct mm_walk *walk)
2686{
2687 return 0;
2688}
2689#endif
2690
7b86ac33
CH
2691static const struct mm_walk_ops show_numa_ops = {
2692 .hugetlb_entry = gather_hugetlb_stats,
2693 .pmd_entry = gather_pte_stats,
49b06385 2694 .walk_lock = PGWALK_RDLOCK,
7b86ac33
CH
2695};
2696
f69ff943
SW
2697/*
2698 * Display pages allocated per node and memory policy via /proc.
2699 */
871305bb 2700static int show_numa_map(struct seq_file *m, void *v)
f69ff943 2701{
5b52fc89
SW
2702 struct numa_maps_private *numa_priv = m->private;
2703 struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
f69ff943 2704 struct vm_area_struct *vma = v;
5b52fc89 2705 struct numa_maps *md = &numa_priv->md;
f69ff943
SW
2706 struct file *file = vma->vm_file;
2707 struct mm_struct *mm = vma->vm_mm;
948927ee 2708 char buffer[64];
ddc1a5cb
HD
2709 struct mempolicy *pol;
2710 pgoff_t ilx;
948927ee 2711 int nid;
f69ff943
SW
2712
2713 if (!mm)
2714 return 0;
2715
5b52fc89
SW
2716 /* Ensure we start with an empty set of numa_maps statistics. */
2717 memset(md, 0, sizeof(*md));
f69ff943 2718
ddc1a5cb 2719 pol = __get_vma_policy(vma, vma->vm_start, &ilx);
498f2371
ON
2720 if (pol) {
2721 mpol_to_str(buffer, sizeof(buffer), pol);
2722 mpol_cond_put(pol);
2723 } else {
2724 mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy);
2725 }
f69ff943
SW
2726
2727 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2728
2729 if (file) {
17c2b4ee 2730 seq_puts(m, " file=");
08582d67 2731 seq_path(m, file_user_path(file), "\n\t= ");
11250fd1 2732 } else if (vma_is_initial_heap(vma)) {
17c2b4ee 2733 seq_puts(m, " heap");
11250fd1 2734 } else if (vma_is_initial_stack(vma)) {
65376df5 2735 seq_puts(m, " stack");
f69ff943
SW
2736 }
2737
fc360bd9 2738 if (is_vm_hugetlb_page(vma))
17c2b4ee 2739 seq_puts(m, " huge");
fc360bd9 2740
c1e8d7c6 2741 /* mmap_lock is held by m_start */
7b86ac33 2742 walk_page_vma(vma, &show_numa_ops, md);
f69ff943
SW
2743
2744 if (!md->pages)
2745 goto out;
2746
2747 if (md->anon)
2748 seq_printf(m, " anon=%lu", md->anon);
2749
2750 if (md->dirty)
2751 seq_printf(m, " dirty=%lu", md->dirty);
2752
2753 if (md->pages != md->anon && md->pages != md->dirty)
2754 seq_printf(m, " mapped=%lu", md->pages);
2755
2756 if (md->mapcount_max > 1)
2757 seq_printf(m, " mapmax=%lu", md->mapcount_max);
2758
2759 if (md->swapcache)
2760 seq_printf(m, " swapcache=%lu", md->swapcache);
2761
2762 if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2763 seq_printf(m, " active=%lu", md->active);
2764
2765 if (md->writeback)
2766 seq_printf(m, " writeback=%lu", md->writeback);
2767
948927ee
DR
2768 for_each_node_state(nid, N_MEMORY)
2769 if (md->node[nid])
2770 seq_printf(m, " N%d=%lu", nid, md->node[nid]);
198d1597
RA
2771
2772 seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10);
f69ff943
SW
2773out:
2774 seq_putc(m, '\n');
f69ff943
SW
2775 return 0;
2776}
5b52fc89 2777
03a44825 2778static const struct seq_operations proc_pid_numa_maps_op = {
b7643757
SP
2779 .start = m_start,
2780 .next = m_next,
2781 .stop = m_stop,
871305bb 2782 .show = show_numa_map,
6e21c8f1 2783};
662795de 2784
b7643757
SP
2785static int pid_numa_maps_open(struct inode *inode, struct file *file)
2786{
871305bb
VB
2787 return proc_maps_open(inode, file, &proc_pid_numa_maps_op,
2788 sizeof(struct numa_maps_private));
b7643757
SP
2789}
2790
2791const struct file_operations proc_pid_numa_maps_operations = {
2792 .open = pid_numa_maps_open,
2793 .read = seq_read,
2794 .llseek = seq_lseek,
29a40ace 2795 .release = proc_map_release,
b7643757
SP
2796};
2797
f69ff943 2798#endif /* CONFIG_NUMA */