fs/proc/task_mmu.c: remove redundant page validation of pte_page
b2441318 1// SPDX-License-Identifier: GPL-2.0
a520110e 2#include <linux/pagewalk.h>
615d6e87 3#include <linux/vmacache.h>
17fca131 4#include <linux/mm_inline.h>
1da177e4 5#include <linux/hugetlb.h>
22e057c5 6#include <linux/huge_mm.h>
7#include <linux/mount.h>
8#include <linux/seq_file.h>
e070ad49 9#include <linux/highmem.h>
5096add8 10#include <linux/ptrace.h>
5a0e3ad6 11#include <linux/slab.h>
12#include <linux/pagemap.h>
13#include <linux/mempolicy.h>
22e057c5 14#include <linux/rmap.h>
85863e47 15#include <linux/swap.h>
6e84f315 16#include <linux/sched/mm.h>
85863e47 17#include <linux/swapops.h>
0f8975ec 18#include <linux/mmu_notifier.h>
33c3fc71 19#include <linux/page_idle.h>
6a15a370 20#include <linux/shmem_fs.h>
b3a81d08 21#include <linux/uaccess.h>
27cca866 22#include <linux/pkeys.h>
e070ad49 23
1da177e4 24#include <asm/elf.h>
b3a81d08 25#include <asm/tlb.h>
e070ad49 26#include <asm/tlbflush.h>
27#include "internal.h"
28
29#define SEQ_PUT_DEC(str, val) \
30 seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8)
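/*
 * Note (illustrative): the counters handed to SEQ_PUT_DEC() in task_mem()
 * below are page counts, so "(val) << (PAGE_SHIFT - 10)" converts pages to
 * kB; e.g. with 4K pages a count of 3 pages prints as 12 kB.
 */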
df5f8314 31void task_mem(struct seq_file *m, struct mm_struct *mm)
1da177e4 32{
af5b0f6a 33 unsigned long text, lib, swap, anon, file, shmem;
34 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
35
36 anon = get_mm_counter(mm, MM_ANONPAGES);
37 file = get_mm_counter(mm, MM_FILEPAGES);
38 shmem = get_mm_counter(mm, MM_SHMEMPAGES);
39
40 /*
41 * Note: to minimize their overhead, mm maintains hiwater_vm and
42 * hiwater_rss only when about to *lower* total_vm or rss. Any
43 * collector of these hiwater stats must therefore get total_vm
44 * and rss too, which will usually be the higher. Barriers? not
45 * worth the effort, such snapshots can always be inconsistent.
46 */
47 hiwater_vm = total_vm = mm->total_vm;
48 if (hiwater_vm < mm->hiwater_vm)
49 hiwater_vm = mm->hiwater_vm;
8cee852e 50 hiwater_rss = total_rss = anon + file + shmem;
51 if (hiwater_rss < mm->hiwater_rss)
52 hiwater_rss = mm->hiwater_rss;
1da177e4 53
54 /* split executable areas between text and lib */
55 text = PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK);
56 text = min(text, mm->exec_vm << PAGE_SHIFT);
57 lib = (mm->exec_vm << PAGE_SHIFT) - text;
58
b084d435 59 swap = get_mm_counter(mm, MM_SWAPENTS);
60 SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
61 SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
62 SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
70f8a3ca 63 SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm));
64 SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss);
65 SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss);
66 SEQ_PUT_DEC(" kB\nRssAnon:\t", anon);
67 SEQ_PUT_DEC(" kB\nRssFile:\t", file);
68 SEQ_PUT_DEC(" kB\nRssShmem:\t", shmem);
69 SEQ_PUT_DEC(" kB\nVmData:\t", mm->data_vm);
70 SEQ_PUT_DEC(" kB\nVmStk:\t", mm->stack_vm);
71 seq_put_decimal_ull_width(m,
72 " kB\nVmExe:\t", text >> 10, 8);
73 seq_put_decimal_ull_width(m,
74 " kB\nVmLib:\t", lib >> 10, 8);
75 seq_put_decimal_ull_width(m,
76 " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8);
77 SEQ_PUT_DEC(" kB\nVmSwap:\t", swap);
78 seq_puts(m, " kB\n");
5d317b2b 79 hugetlb_report_usage(m, mm);
1da177e4 80}
d1be35cb 81#undef SEQ_PUT_DEC
82
83unsigned long task_vsize(struct mm_struct *mm)
84{
85 return PAGE_SIZE * mm->total_vm;
86}
87
88unsigned long task_statm(struct mm_struct *mm,
89 unsigned long *shared, unsigned long *text,
90 unsigned long *data, unsigned long *resident)
1da177e4 91{
92 *shared = get_mm_counter(mm, MM_FILEPAGES) +
93 get_mm_counter(mm, MM_SHMEMPAGES);
94 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
95 >> PAGE_SHIFT;
84638335 96 *data = mm->data_vm + mm->stack_vm;
d559db08 97 *resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
98 return mm->total_vm;
99}
100
101#ifdef CONFIG_NUMA
102/*
498f2371 103 * Save get_task_policy() for show_numa_map().
104 */
105static void hold_task_mempolicy(struct proc_maps_private *priv)
106{
107 struct task_struct *task = priv->task;
108
109 task_lock(task);
498f2371 110 priv->task_mempolicy = get_task_policy(task);
111 mpol_get(priv->task_mempolicy);
112 task_unlock(task);
113}
114static void release_task_mempolicy(struct proc_maps_private *priv)
115{
116 mpol_put(priv->task_mempolicy);
117}
118#else
119static void hold_task_mempolicy(struct proc_maps_private *priv)
120{
121}
122static void release_task_mempolicy(struct proc_maps_private *priv)
123{
124}
125#endif
126
0c255321 127static void *m_start(struct seq_file *m, loff_t *ppos)
e070ad49 128{
a6198797 129 struct proc_maps_private *priv = m->private;
4781f2c3 130 unsigned long last_addr = *ppos;
a6198797 131 struct mm_struct *mm;
0c255321 132 struct vm_area_struct *vma;
a6198797 133
c2e88d22 134 /* See m_next(). Zero at the start or after lseek. */
135 if (last_addr == -1UL)
136 return NULL;
137
2c03376d 138 priv->task = get_proc_task(priv->inode);
a6198797 139 if (!priv->task)
ec6fd8a4 140 return ERR_PTR(-ESRCH);
a6198797 141
29a40ace 142 mm = priv->mm;
143 if (!mm || !mmget_not_zero(mm)) {
144 put_task_struct(priv->task);
145 priv->task = NULL;
29a40ace 146 return NULL;
d07ded61 147 }
a6198797 148
d8ed45c5 149 if (mmap_read_lock_killable(mm)) {
8a713e7d 150 mmput(mm);
151 put_task_struct(priv->task);
152 priv->task = NULL;
153 return ERR_PTR(-EINTR);
154 }
155
9e781440 156 hold_task_mempolicy(priv);
0c255321 157 priv->tail_vma = get_gate_vma(mm);
a6198797 158
159 vma = find_vma(mm, last_addr);
160 if (vma)
a6198797 161 return vma;
59b4bf12 162
c2e88d22 163 return priv->tail_vma;
164}
165
4781f2c3 166static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
167{
168 struct proc_maps_private *priv = m->private;
169 struct vm_area_struct *next, *vma = v;
170
171 if (vma == priv->tail_vma)
172 next = NULL;
173 else if (vma->vm_next)
174 next = vma->vm_next;
175 else
176 next = priv->tail_vma;
a6198797 177
4781f2c3 178 *ppos = next ? next->vm_start : -1UL;
c2e88d22 179
59b4bf12 180 return next;
181}
182
183static void m_stop(struct seq_file *m, void *v)
184{
185 struct proc_maps_private *priv = m->private;
d07ded61 186 struct mm_struct *mm = priv->mm;
a6198797 187
188 if (!priv->task)
189 return;
190
191 release_task_mempolicy(priv);
d8ed45c5 192 mmap_read_unlock(mm);
193 mmput(mm);
194 put_task_struct(priv->task);
195 priv->task = NULL;
196}
197
198static int proc_maps_open(struct inode *inode, struct file *file,
199 const struct seq_operations *ops, int psize)
200{
201 struct proc_maps_private *priv = __seq_open_private(file, ops, psize);
202
203 if (!priv)
204 return -ENOMEM;
205
2c03376d 206 priv->inode = inode;
207 priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
208 if (IS_ERR(priv->mm)) {
209 int err = PTR_ERR(priv->mm);
210
211 seq_release_private(inode, file);
212 return err;
213 }
214
215 return 0;
216}
217
218static int proc_map_release(struct inode *inode, struct file *file)
219{
220 struct seq_file *seq = file->private_data;
221 struct proc_maps_private *priv = seq->private;
222
223 if (priv->mm)
224 mmdrop(priv->mm);
225
226 return seq_release_private(inode, file);
227}
228
a6198797 229static int do_maps_open(struct inode *inode, struct file *file,
03a44825 230 const struct seq_operations *ops)
a6198797 231{
232 return proc_maps_open(inode, file, ops,
233 sizeof(struct proc_maps_private));
a6198797 234}
e070ad49 235
236/*
237 * Indicate if the VMA is a stack for the given task; for
238 * /proc/PID/maps that is the stack of the main task.
239 */
1240ea0d 240static int is_stack(struct vm_area_struct *vma)
58cb6548 241{
242 /*
243 * We make no effort to guess what a given thread considers to be
244 * its "stack". It's not even well-defined for programs written
 245 * in languages like Go.
246 */
247 return vma->vm_start <= vma->vm_mm->start_stack &&
248 vma->vm_end >= vma->vm_mm->start_stack;
249}
250
251static void show_vma_header_prefix(struct seq_file *m,
252 unsigned long start, unsigned long end,
253 vm_flags_t flags, unsigned long long pgoff,
254 dev_t dev, unsigned long ino)
255{
256 seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
257 seq_put_hex_ll(m, NULL, start, 8);
258 seq_put_hex_ll(m, "-", end, 8);
259 seq_putc(m, ' ');
260 seq_putc(m, flags & VM_READ ? 'r' : '-');
261 seq_putc(m, flags & VM_WRITE ? 'w' : '-');
262 seq_putc(m, flags & VM_EXEC ? 'x' : '-');
263 seq_putc(m, flags & VM_MAYSHARE ? 's' : 'p');
264 seq_put_hex_ll(m, " ", pgoff, 8);
265 seq_put_hex_ll(m, " ", MAJOR(dev), 2);
266 seq_put_hex_ll(m, ":", MINOR(dev), 2);
267 seq_put_decimal_ull(m, " ", ino);
268 seq_putc(m, ' ');
269}
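/*
 * Illustrative example (made-up values): the prefix emitted above looks
 * like "00400000-0040b000 r-xp 00000000 08:01 1835043 ", i.e. start-end,
 * permissions, pgoff, major:minor of the backing device and the inode
 * number, followed by padding before the name printed by the caller.
 */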
270
b7643757 271static void
871305bb 272show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
1da177e4 273{
274 struct mm_struct *mm = vma->vm_mm;
275 struct file *file = vma->vm_file;
ca16d140 276 vm_flags_t flags = vma->vm_flags;
1da177e4 277 unsigned long ino = 0;
6260a4b0 278 unsigned long long pgoff = 0;
a09a79f6 279 unsigned long start, end;
1da177e4 280 dev_t dev = 0;
b7643757 281 const char *name = NULL;
282
283 if (file) {
496ad9aa 284 struct inode *inode = file_inode(vma->vm_file);
285 dev = inode->i_sb->s_dev;
286 ino = inode->i_ino;
6260a4b0 287 pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
288 }
289
d7824370 290 start = vma->vm_start;
a09a79f6 291 end = vma->vm_end;
493b0e9d 292 show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
293
294 /*
295 * Print the dentry name for named mappings, and a
296 * special [heap] marker for the heap:
297 */
e070ad49 298 if (file) {
652586df 299 seq_pad(m, ' ');
2726d566 300 seq_file_path(m, file, "\n");
301 goto done;
302 }
303
304 if (vma->vm_ops && vma->vm_ops->name) {
305 name = vma->vm_ops->name(vma);
306 if (name)
307 goto done;
308 }
309
310 name = arch_vma_name(vma);
311 if (!name) {
5c26f6ac 312 struct anon_vma_name *anon_name;
9a10064f 313
314 if (!mm) {
315 name = "[vdso]";
316 goto done;
317 }
318
319 if (vma->vm_start <= mm->brk &&
320 vma->vm_end >= mm->start_brk) {
321 name = "[heap]";
322 goto done;
323 }
324
9a10064f 325 if (is_stack(vma)) {
65376df5 326 name = "[stack]";
327 goto done;
328 }
329
5c26f6ac 330 anon_name = anon_vma_name(vma);
331 if (anon_name) {
332 seq_pad(m, ' ');
5c26f6ac 333 seq_printf(m, "[anon:%s]", anon_name->name);
9a10064f 334 }
335 }
336
337done:
338 if (name) {
652586df 339 seq_pad(m, ' ');
b7643757 340 seq_puts(m, name);
341 }
342 seq_putc(m, '\n');
343}
344
871305bb 345static int show_map(struct seq_file *m, void *v)
7c88db0c 346{
871305bb 347 show_map_vma(m, v);
348 return 0;
349}
350
03a44825 351static const struct seq_operations proc_pid_maps_op = {
352 .start = m_start,
353 .next = m_next,
354 .stop = m_stop,
871305bb 355 .show = show_map
356};
357
b7643757 358static int pid_maps_open(struct inode *inode, struct file *file)
359{
360 return do_maps_open(inode, file, &proc_pid_maps_op);
361}
362
363const struct file_operations proc_pid_maps_operations = {
364 .open = pid_maps_open,
365 .read = seq_read,
366 .llseek = seq_lseek,
29a40ace 367 .release = proc_map_release,
368};
369
370/*
371 * Proportional Set Size (PSS): my share of RSS.
372 *
373 * PSS of a process is the count of pages it has in memory, where each
374 * page is divided by the number of processes sharing it. So if a
375 * process has 1000 pages all to itself, and 1000 shared with one other
376 * process, its PSS will be 1500.
377 *
378 * To keep (accumulated) division errors low, we adopt a 64-bit
379 * fixed-point pss counter, so (pss >>
380 * PSS_SHIFT) is the real byte count.
381 *
382 * A shift of 12 before division means (assuming 4K page size):
383 * - 1M 3-user-pages add up to 8KB errors;
384 * - supports mapcount up to 2^24, or 16M;
385 * - supports PSS up to 2^52 bytes, or 4PB.
386 */
387#define PSS_SHIFT 12
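/*
 * Worked example (illustrative, 4K pages): a page with mapcount 3
 * contributes (4096 << PSS_SHIFT) / 3 = 5592405 to pss; after the final
 * ">> PSS_SHIFT" that reports 1365 bytes, roughly a third of the page,
 * so the per-page rounding error stays well below one byte.
 */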
388
1e883281 389#ifdef CONFIG_PROC_PAGE_MONITOR
214e471f 390struct mem_size_stats {
391 unsigned long resident;
392 unsigned long shared_clean;
393 unsigned long shared_dirty;
394 unsigned long private_clean;
395 unsigned long private_dirty;
396 unsigned long referenced;
b40d4f84 397 unsigned long anonymous;
cf8496ea 398 unsigned long lazyfree;
4031a219 399 unsigned long anonymous_thp;
65c45377 400 unsigned long shmem_thp;
60fbf0ab 401 unsigned long file_thp;
214e471f 402 unsigned long swap;
403 unsigned long shared_hugetlb;
404 unsigned long private_hugetlb;
a6198797 405 u64 pss;
406 u64 pss_anon;
407 u64 pss_file;
408 u64 pss_shmem;
493b0e9d 409 u64 pss_locked;
8334b962 410 u64 swap_pss;
411};
412
413static void smaps_page_accumulate(struct mem_size_stats *mss,
414 struct page *page, unsigned long size, unsigned long pss,
415 bool dirty, bool locked, bool private)
416{
417 mss->pss += pss;
418
419 if (PageAnon(page))
420 mss->pss_anon += pss;
421 else if (PageSwapBacked(page))
422 mss->pss_shmem += pss;
423 else
424 mss->pss_file += pss;
425
426 if (locked)
427 mss->pss_locked += pss;
428
429 if (dirty || PageDirty(page)) {
430 if (private)
431 mss->private_dirty += size;
432 else
433 mss->shared_dirty += size;
434 } else {
435 if (private)
436 mss->private_clean += size;
437 else
438 mss->shared_clean += size;
439 }
440}
441
c164e038 442static void smaps_account(struct mem_size_stats *mss, struct page *page,
443 bool compound, bool young, bool dirty, bool locked,
444 bool migration)
c164e038 445{
d8c6546b 446 int i, nr = compound ? compound_nr(page) : 1;
afd9883f 447 unsigned long size = nr * PAGE_SIZE;
c164e038 448
449 /*
450 * First accumulate quantities that depend only on |size| and the type
451 * of the compound page.
452 */
cf8496ea 453 if (PageAnon(page)) {
c164e038 454 mss->anonymous += size;
455 if (!PageSwapBacked(page) && !dirty && !PageDirty(page))
456 mss->lazyfree += size;
457 }
458
459 mss->resident += size;
460 /* Accumulate the size in pages that have been accessed. */
33c3fc71 461 if (young || page_is_young(page) || PageReferenced(page))
c164e038 462 mss->referenced += size;
c164e038 463
afd9883f 464 /*
465 * Then accumulate quantities that may depend on sharing, or that may
466 * differ page-by-page.
467 *
468 * page_count(page) == 1 guarantees the page is mapped exactly once.
469 * If any subpage of the compound page is mapped via PTE, it would
470 * elevate page_count().
471 *
472 * The page_mapcount() is called to get a snapshot of the mapcount.
473 * Without holding the page lock this snapshot can be slightly wrong as
474 * we cannot always read the mapcount atomically. It is not safe to
475 * call page_mapcount() even with PTL held if the page is not mapped,
476 * especially for migration entries. Treat regular migration entries
477 * as mapcount == 1.
afd9883f 478 */
24d7275c 479 if ((page_count(page) == 1) || migration) {
480 smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty,
481 locked, true);
482 return;
483 }
484 for (i = 0; i < nr; i++, page++) {
485 int mapcount = page_mapcount(page);
486 unsigned long pss = PAGE_SIZE << PSS_SHIFT;
487 if (mapcount >= 2)
488 pss /= mapcount;
489 smaps_page_accumulate(mss, page, PAGE_SIZE, pss, dirty, locked,
490 mapcount < 2);
491 }
492}
ae11c4d9 493
c261e7d9 494#ifdef CONFIG_SHMEM
c261e7d9 495static int smaps_pte_hole(unsigned long addr, unsigned long end,
b7a16c7a 496 __always_unused int depth, struct mm_walk *walk)
497{
498 struct mem_size_stats *mss = walk->private;
10c848c8 499 struct vm_area_struct *vma = walk->vma;
c261e7d9 500
501 mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping,
502 linear_page_index(vma, addr),
503 linear_page_index(vma, end));
504
505 return 0;
506}
507#else
508#define smaps_pte_hole NULL
509#endif /* CONFIG_SHMEM */
c261e7d9 510
511static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk)
512{
513#ifdef CONFIG_SHMEM
514 if (walk->ops->pte_hole) {
515 /* depth is not used */
516 smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk);
517 }
518#endif
519}
520
521static void smaps_pte_entry(pte_t *pte, unsigned long addr,
522 struct mm_walk *walk)
523{
524 struct mem_size_stats *mss = walk->private;
14eb6fdd 525 struct vm_area_struct *vma = walk->vma;
27dd768e 526 bool locked = !!(vma->vm_flags & VM_LOCKED);
b1d4d9e0 527 struct page *page = NULL;
24d7275c 528 bool migration = false;
ae11c4d9 529
530 if (pte_present(*pte)) {
531 page = vm_normal_page(vma, addr, *pte);
532 } else if (is_swap_pte(*pte)) {
533 swp_entry_t swpent = pte_to_swp_entry(*pte);
ae11c4d9 534
535 if (!non_swap_entry(swpent)) {
536 int mapcount;
537
c164e038 538 mss->swap += PAGE_SIZE;
539 mapcount = swp_swapcount(swpent);
540 if (mapcount >= 2) {
541 u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
542
543 do_div(pss_delta, mapcount);
544 mss->swap_pss += pss_delta;
545 } else {
546 mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
547 }
548 } else if (is_pfn_swap_entry(swpent)) {
549 if (is_migration_entry(swpent))
550 migration = true;
af5cdaf8 551 page = pfn_swap_entry_to_page(swpent);
24d7275c 552 }
553 } else {
554 smaps_pte_hole_lookup(addr, walk);
48131e03 555 return;
b1d4d9e0 556 }
ae11c4d9 557
558 if (!page)
559 return;
afd9883f 560
561 smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte),
562 locked, migration);
563}
564
565#ifdef CONFIG_TRANSPARENT_HUGEPAGE
566static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
567 struct mm_walk *walk)
568{
569 struct mem_size_stats *mss = walk->private;
14eb6fdd 570 struct vm_area_struct *vma = walk->vma;
27dd768e 571 bool locked = !!(vma->vm_flags & VM_LOCKED);
c94b6923 572 struct page *page = NULL;
24d7275c 573 bool migration = false;
574
575 if (pmd_present(*pmd)) {
576 /* FOLL_DUMP will return -EFAULT on huge zero page */
577 page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
578 } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
579 swp_entry_t entry = pmd_to_swp_entry(*pmd);
c164e038 580
581 if (is_migration_entry(entry)) {
582 migration = true;
af5cdaf8 583 page = pfn_swap_entry_to_page(entry);
24d7275c 584 }
c94b6923 585 }
586 if (IS_ERR_OR_NULL(page))
587 return;
588 if (PageAnon(page))
589 mss->anonymous_thp += HPAGE_PMD_SIZE;
590 else if (PageSwapBacked(page))
591 mss->shmem_thp += HPAGE_PMD_SIZE;
592 else if (is_zone_device_page(page))
593 /* pass */;
65c45377 594 else
60fbf0ab 595 mss->file_thp += HPAGE_PMD_SIZE;
596
597 smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
598 locked, migration);
599}
600#else
601static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
602 struct mm_walk *walk)
603{
604}
605#endif
606
b3ae5acb 607static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
2165009b 608 struct mm_walk *walk)
e070ad49 609{
14eb6fdd 610 struct vm_area_struct *vma = walk->vma;
ae11c4d9 611 pte_t *pte;
705e87c0 612 spinlock_t *ptl;
e070ad49 613
614 ptl = pmd_trans_huge_lock(pmd, vma);
615 if (ptl) {
c94b6923 616 smaps_pmd_entry(pmd, addr, walk);
bf929152 617 spin_unlock(ptl);
14038302 618 goto out;
22e057c5 619 }
620
621 if (pmd_trans_unstable(pmd))
14038302 622 goto out;
22e057c5 623 /*
c1e8d7c6 624 * The mmap_lock held all the way back in m_start() is what
625 * keeps khugepaged out of here and from collapsing things
626 * in here.
627 */
705e87c0 628 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
ae11c4d9 629 for (; addr != end; pte++, addr += PAGE_SIZE)
c164e038 630 smaps_pte_entry(pte, addr, walk);
705e87c0 631 pte_unmap_unlock(pte - 1, ptl);
14038302 632out:
705e87c0 633 cond_resched();
b3ae5acb 634 return 0;
635}
636
637static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
638{
639 /*
640 * Don't forget to update Documentation/ on changes.
641 */
642 static const char mnemonics[BITS_PER_LONG][2] = {
643 /*
 644 * In case we meet a flag we don't know about.
645 */
646 [0 ... (BITS_PER_LONG-1)] = "??",
647
648 [ilog2(VM_READ)] = "rd",
649 [ilog2(VM_WRITE)] = "wr",
650 [ilog2(VM_EXEC)] = "ex",
651 [ilog2(VM_SHARED)] = "sh",
652 [ilog2(VM_MAYREAD)] = "mr",
653 [ilog2(VM_MAYWRITE)] = "mw",
654 [ilog2(VM_MAYEXEC)] = "me",
655 [ilog2(VM_MAYSHARE)] = "ms",
656 [ilog2(VM_GROWSDOWN)] = "gd",
657 [ilog2(VM_PFNMAP)] = "pf",
658 [ilog2(VM_LOCKED)] = "lo",
659 [ilog2(VM_IO)] = "io",
660 [ilog2(VM_SEQ_READ)] = "sr",
661 [ilog2(VM_RAND_READ)] = "rr",
662 [ilog2(VM_DONTCOPY)] = "dc",
663 [ilog2(VM_DONTEXPAND)] = "de",
664 [ilog2(VM_ACCOUNT)] = "ac",
665 [ilog2(VM_NORESERVE)] = "nr",
666 [ilog2(VM_HUGETLB)] = "ht",
b6fb293f 667 [ilog2(VM_SYNC)] = "sf",
834f82e2 668 [ilog2(VM_ARCH_1)] = "ar",
d2cd9ede 669 [ilog2(VM_WIPEONFORK)] = "wf",
834f82e2 670 [ilog2(VM_DONTDUMP)] = "dd",
671#ifdef CONFIG_ARM64_BTI
672 [ilog2(VM_ARM64_BTI)] = "bt",
673#endif
674#ifdef CONFIG_MEM_SOFT_DIRTY
675 [ilog2(VM_SOFTDIRTY)] = "sd",
676#endif
677 [ilog2(VM_MIXEDMAP)] = "mm",
678 [ilog2(VM_HUGEPAGE)] = "hg",
679 [ilog2(VM_NOHUGEPAGE)] = "nh",
680 [ilog2(VM_MERGEABLE)] = "mg",
681 [ilog2(VM_UFFD_MISSING)]= "um",
682 [ilog2(VM_UFFD_WP)] = "uw",
683#ifdef CONFIG_ARM64_MTE
684 [ilog2(VM_MTE)] = "mt",
685 [ilog2(VM_MTE_ALLOWED)] = "",
686#endif
5212213a 687#ifdef CONFIG_ARCH_HAS_PKEYS
688 /* These come out via ProtectionKey: */
689 [ilog2(VM_PKEY_BIT0)] = "",
690 [ilog2(VM_PKEY_BIT1)] = "",
691 [ilog2(VM_PKEY_BIT2)] = "",
692 [ilog2(VM_PKEY_BIT3)] = "",
693#if VM_PKEY_BIT4
694 [ilog2(VM_PKEY_BIT4)] = "",
c1192f84 695#endif
5212213a 696#endif /* CONFIG_ARCH_HAS_PKEYS */
697#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
698 [ilog2(VM_UFFD_MINOR)] = "ui",
699#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
700 };
701 size_t i;
702
703 seq_puts(m, "VmFlags: ");
704 for (i = 0; i < BITS_PER_LONG; i++) {
705 if (!mnemonics[i][0])
706 continue;
834f82e2 707 if (vma->vm_flags & (1UL << i)) {
708 seq_putc(m, mnemonics[i][0]);
709 seq_putc(m, mnemonics[i][1]);
710 seq_putc(m, ' ');
711 }
712 }
713 seq_putc(m, '\n');
714}
715
716#ifdef CONFIG_HUGETLB_PAGE
717static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
718 unsigned long addr, unsigned long end,
719 struct mm_walk *walk)
720{
721 struct mem_size_stats *mss = walk->private;
722 struct vm_area_struct *vma = walk->vma;
723 struct page *page = NULL;
724
725 if (pte_present(*pte)) {
726 page = vm_normal_page(vma, addr, *pte);
727 } else if (is_swap_pte(*pte)) {
728 swp_entry_t swpent = pte_to_swp_entry(*pte);
729
730 if (is_pfn_swap_entry(swpent))
731 page = pfn_swap_entry_to_page(swpent);
732 }
733 if (page) {
734 int mapcount = page_mapcount(page);
735
736 if (mapcount >= 2)
737 mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
738 else
739 mss->private_hugetlb += huge_page_size(hstate_vma(vma));
740 }
741 return 0;
742}
743#else
744#define smaps_hugetlb_range NULL
745#endif /* HUGETLB_PAGE */
746
747static const struct mm_walk_ops smaps_walk_ops = {
748 .pmd_entry = smaps_pte_range,
749 .hugetlb_entry = smaps_hugetlb_range,
750};
751
752static const struct mm_walk_ops smaps_shmem_walk_ops = {
753 .pmd_entry = smaps_pte_range,
754 .hugetlb_entry = smaps_hugetlb_range,
755 .pte_hole = smaps_pte_hole,
756};
757
758/*
759 * Gather mem stats from @vma with the indicated beginning
760 * address @start, and keep them in @mss.
761 *
762 * Use vm_start of @vma as the beginning address if @start is 0.
763 */
8e68d689 764static void smap_gather_stats(struct vm_area_struct *vma,
03b4b114 765 struct mem_size_stats *mss, unsigned long start)
e070ad49 766{
767 const struct mm_walk_ops *ops = &smaps_walk_ops;
768
769 /* Invalid start */
770 if (start >= vma->vm_end)
771 return;
772
773#ifdef CONFIG_SHMEM
774 if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
775 /*
776 * For shared or readonly shmem mappings we know that all
777 * swapped out pages belong to the shmem object, and we can
778 * obtain the swap value much more efficiently. For private
779 * writable mappings, we might have COW pages that are
780 * not affected by the parent swapped out pages of the shmem
781 * object, so we have to distinguish them during the page walk.
782 * Unless we know that the shmem object (or the part mapped by
783 * our VMA) has no swapped out pages at all.
784 */
785 unsigned long shmem_swapped = shmem_swap_usage(vma);
786
787 if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
788 !(vma->vm_flags & VM_WRITE))) {
fa76da46 789 mss->swap += shmem_swapped;
6a15a370 790 } else {
03b4b114 791 ops = &smaps_shmem_walk_ops;
6a15a370 792 }
793 }
794#endif
c1e8d7c6 795 /* mmap_lock is held in m_start */
796 if (!start)
797 walk_page_vma(vma, ops, mss);
798 else
799 walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss);
800}
801
802#define SEQ_PUT_DEC(str, val) \
803 seq_put_decimal_ull_width(m, str, (val) >> 10, 8)
804
805/* Show the contents common for smaps and smaps_rollup */
806static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
807 bool rollup_mode)
808{
809 SEQ_PUT_DEC("Rss: ", mss->resident);
810 SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT);
811 if (rollup_mode) {
812 /*
813 * These are meaningful only for smaps_rollup, otherwise two of
814 * them are zero, and the other one is the same as Pss.
815 */
816 SEQ_PUT_DEC(" kB\nPss_Anon: ",
817 mss->pss_anon >> PSS_SHIFT);
818 SEQ_PUT_DEC(" kB\nPss_File: ",
819 mss->pss_file >> PSS_SHIFT);
820 SEQ_PUT_DEC(" kB\nPss_Shmem: ",
821 mss->pss_shmem >> PSS_SHIFT);
822 }
823 SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean);
824 SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty);
825 SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean);
826 SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty);
827 SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced);
828 SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous);
829 SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree);
830 SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp);
831 SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
471e78cc 832 SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp);
833 SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb);
834 seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ",
835 mss->private_hugetlb >> 10, 7);
836 SEQ_PUT_DEC(" kB\nSwap: ", mss->swap);
837 SEQ_PUT_DEC(" kB\nSwapPss: ",
838 mss->swap_pss >> PSS_SHIFT);
839 SEQ_PUT_DEC(" kB\nLocked: ",
840 mss->pss_locked >> PSS_SHIFT);
841 seq_puts(m, " kB\n");
842}
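/*
 * Illustrative output fragment (values made up, shared pages assumed to
 * be mapped by exactly two processes):
 *
 *   Rss:                1024 kB
 *   Pss:                 768 kB
 *   Shared_Clean:        512 kB
 *   Private_Dirty:       512 kB
 *
 * smaps prints one such block per VMA; smaps_rollup prints a single
 * block accumulated over all VMAs of the process.
 */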
843
844static int show_smap(struct seq_file *m, void *v)
845{
8e68d689 846 struct vm_area_struct *vma = v;
847 struct mem_size_stats mss;
848
849 memset(&mss, 0, sizeof(mss));
850
03b4b114 851 smap_gather_stats(vma, &mss, 0);
852
853 show_map_vma(m, vma);
854
855 SEQ_PUT_DEC("Size: ", vma->vm_end - vma->vm_start);
856 SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma));
857 SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma));
858 seq_puts(m, " kB\n");
859
ee2ad71b 860 __show_smap(m, &mss, false);
258f669e 861
471e78cc 862 seq_printf(m, "THPeligible: %d\n",
e6be37b2 863 transparent_hugepage_active(vma));
7635d9cb 864
865 if (arch_pkeys_enabled())
866 seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
867 show_smap_vma_flags(m, vma);
868
869 return 0;
870}
871
872static int show_smaps_rollup(struct seq_file *m, void *v)
873{
874 struct proc_maps_private *priv = m->private;
875 struct mem_size_stats mss;
876 struct mm_struct *mm;
877 struct vm_area_struct *vma;
878 unsigned long last_vma_end = 0;
8e68d689 879 int ret = 0;
8e68d689 880
881 priv->task = get_proc_task(priv->inode);
882 if (!priv->task)
883 return -ESRCH;
493b0e9d 884
885 mm = priv->mm;
886 if (!mm || !mmget_not_zero(mm)) {
887 ret = -ESRCH;
888 goto out_put_task;
493b0e9d 889 }
4752c369 890
258f669e 891 memset(&mss, 0, sizeof(mss));
493b0e9d 892
d8ed45c5 893 ret = mmap_read_lock_killable(mm);
894 if (ret)
895 goto out_put_mm;
896
258f669e 897 hold_task_mempolicy(priv);
f1547959 898
ff9f47f6 899 for (vma = priv->mm->mmap; vma;) {
03b4b114 900 smap_gather_stats(vma, &mss, 0);
258f669e 901 last_vma_end = vma->vm_end;
902
903 /*
904 * Release mmap_lock temporarily if someone wants to
905 * access it for write request.
906 */
907 if (mmap_lock_is_contended(mm)) {
908 mmap_read_unlock(mm);
909 ret = mmap_read_lock_killable(mm);
910 if (ret) {
911 release_task_mempolicy(priv);
912 goto out_put_mm;
913 }
914
915 /*
916 * After dropping the lock, there are four cases to
917 * consider. See the following example for explanation.
918 *
919 * +------+------+-----------+
920 * | VMA1 | VMA2 | VMA3 |
921 * +------+------+-----------+
922 * | | | |
923 * 4k 8k 16k 400k
924 *
925 * Suppose we drop the lock after reading VMA2 due to
926 * contention, then we get:
927 *
928 * last_vma_end = 16k
929 *
930 * 1) VMA2 is freed, but VMA3 exists:
931 *
932 * find_vma(mm, 16k - 1) will return VMA3.
933 * In this case, just continue from VMA3.
934 *
935 * 2) VMA2 still exists:
936 *
937 * find_vma(mm, 16k - 1) will return VMA2.
938 * Iterate the loop like the original one.
939 *
940 * 3) No more VMAs can be found:
941 *
942 * find_vma(mm, 16k - 1) will return NULL.
943 * No more things to do, just break.
944 *
945 * 4) (last_vma_end - 1) is the middle of a vma (VMA'):
946 *
947 * find_vma(mm, 16k - 1) will return VMA' whose range
948 * contains last_vma_end.
949 * Iterate VMA' from last_vma_end.
950 */
951 vma = find_vma(mm, last_vma_end - 1);
952 /* Case 3 above */
953 if (!vma)
954 break;
955
956 /* Case 1 above */
957 if (vma->vm_start >= last_vma_end)
958 continue;
959
960 /* Case 4 above */
961 if (vma->vm_end > last_vma_end)
962 smap_gather_stats(vma, &mss, last_vma_end);
963 }
964 /* Case 2 above */
965 vma = vma->vm_next;
493b0e9d 966 }
967
968 show_vma_header_prefix(m, priv->mm->mmap->vm_start,
969 last_vma_end, 0, 0, 0, 0);
970 seq_pad(m, ' ');
971 seq_puts(m, "[rollup]\n");
972
ee2ad71b 973 __show_smap(m, &mss, true);
974
975 release_task_mempolicy(priv);
d8ed45c5 976 mmap_read_unlock(mm);
258f669e 977
978out_put_mm:
979 mmput(mm);
980out_put_task:
981 put_task_struct(priv->task);
982 priv->task = NULL;
983
493b0e9d 984 return ret;
e070ad49 985}
d1be35cb 986#undef SEQ_PUT_DEC
e070ad49 987
03a44825 988static const struct seq_operations proc_pid_smaps_op = {
989 .start = m_start,
990 .next = m_next,
991 .stop = m_stop,
871305bb 992 .show = show_smap
993};
994
b7643757 995static int pid_smaps_open(struct inode *inode, struct file *file)
996{
997 return do_maps_open(inode, file, &proc_pid_smaps_op);
998}
999
258f669e 1000static int smaps_rollup_open(struct inode *inode, struct file *file)
493b0e9d 1001{
258f669e 1002 int ret;
493b0e9d 1003 struct proc_maps_private *priv;
1004
1005 priv = kzalloc(sizeof(*priv), GFP_KERNEL_ACCOUNT);
1006 if (!priv)
493b0e9d 1007 return -ENOMEM;
1008
1009 ret = single_open(file, show_smaps_rollup, priv);
1010 if (ret)
1011 goto out_free;
1012
1013 priv->inode = inode;
1014 priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
1015 if (IS_ERR(priv->mm)) {
1016 ret = PTR_ERR(priv->mm);
1017
1018 single_release(inode, file);
1019 goto out_free;
493b0e9d 1020 }
258f669e 1021
493b0e9d 1022 return 0;
1023
1024out_free:
1025 kfree(priv);
1026 return ret;
1027}
1028
1029static int smaps_rollup_release(struct inode *inode, struct file *file)
1030{
1031 struct seq_file *seq = file->private_data;
1032 struct proc_maps_private *priv = seq->private;
1033
1034 if (priv->mm)
1035 mmdrop(priv->mm);
1036
1037 kfree(priv);
1038 return single_release(inode, file);
1039}
1040
1041const struct file_operations proc_pid_smaps_operations = {
1042 .open = pid_smaps_open,
1043 .read = seq_read,
1044 .llseek = seq_lseek,
29a40ace 1045 .release = proc_map_release,
1046};
1047
493b0e9d 1048const struct file_operations proc_pid_smaps_rollup_operations = {
258f669e 1049 .open = smaps_rollup_open,
1050 .read = seq_read,
1051 .llseek = seq_lseek,
258f669e 1052 .release = smaps_rollup_release,
1053};
1054
1055enum clear_refs_types {
1056 CLEAR_REFS_ALL = 1,
1057 CLEAR_REFS_ANON,
1058 CLEAR_REFS_MAPPED,
0f8975ec 1059 CLEAR_REFS_SOFT_DIRTY,
695f0559 1060 CLEAR_REFS_MM_HIWATER_RSS,
1061 CLEAR_REFS_LAST,
1062};
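/*
 * Illustrative usage from userspace: "echo 4 > /proc/<pid>/clear_refs"
 * selects CLEAR_REFS_SOFT_DIRTY and clears the soft-dirty bits, so a
 * later read of /proc/<pid>/pagemap shows which pages were written
 * since; "echo 1" clears the referenced/accessed bits for all pages.
 */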
1063
af9de7eb 1064struct clear_refs_private {
0f8975ec 1065 enum clear_refs_types type;
1066};
1067
7d5b3bfa 1068#ifdef CONFIG_MEM_SOFT_DIRTY
9348b73c 1069
1070static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
1071{
1072 struct page *page;
1073
1074 if (!pte_write(pte))
1075 return false;
1076 if (!is_cow_mapping(vma->vm_flags))
1077 return false;
a458b76a 1078 if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)))
1079 return false;
1080 page = vm_normal_page(vma, addr, pte);
1081 if (!page)
1082 return false;
1083 return page_maybe_dma_pinned(page);
1084}
1085
1086static inline void clear_soft_dirty(struct vm_area_struct *vma,
1087 unsigned long addr, pte_t *pte)
1088{
1089 /*
1090 * The soft-dirty tracker uses #PF-s to catch writes
1091 * to pages, so write-protect the pte as well. See the
1ad1335d 1092 * Documentation/admin-guide/mm/soft-dirty.rst for full description
1093 * of how soft-dirty works.
1094 */
1095 pte_t ptent = *pte;
1096
1097 if (pte_present(ptent)) {
1098 pte_t old_pte;
1099
1100 if (pte_is_pinned(vma, addr, ptent))
1101 return;
1102 old_pte = ptep_modify_prot_start(vma, addr, pte);
1103 ptent = pte_wrprotect(old_pte);
a7b76174 1104 ptent = pte_clear_soft_dirty(ptent);
04a86453 1105 ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
1106 } else if (is_swap_pte(ptent)) {
1107 ptent = pte_swp_clear_soft_dirty(ptent);
326c2597 1108 set_pte_at(vma->vm_mm, addr, pte, ptent);
179ef71c 1109 }
0f8975ec 1110}
1111#else
1112static inline void clear_soft_dirty(struct vm_area_struct *vma,
1113 unsigned long addr, pte_t *pte)
1114{
1115}
1116#endif
0f8975ec 1117
5d3875a0 1118#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
1119static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
1120 unsigned long addr, pmd_t *pmdp)
1121{
a3cf988f 1122 pmd_t old, pmd = *pmdp;
5b7abeae 1123
1124 if (pmd_present(pmd)) {
1125 /* See comment in change_huge_pmd() */
a3cf988f
KS
1126 old = pmdp_invalidate(vma, addr, pmdp);
1127 if (pmd_dirty(old))
ab6e3d09 1128 pmd = pmd_mkdirty(pmd);
a3cf988f 1129 if (pmd_young(old))
1130 pmd = pmd_mkyoung(pmd);
1131
1132 pmd = pmd_wrprotect(pmd);
1133 pmd = pmd_clear_soft_dirty(pmd);
1134
1135 set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
1136 } else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
1137 pmd = pmd_swp_clear_soft_dirty(pmd);
1138 set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
1139 }
7d5b3bfa 1140}
7d5b3bfa 1141#else
1142static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
1143 unsigned long addr, pmd_t *pmdp)
1144{
1145}
1146#endif
1147
a6198797 1148static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
2165009b 1149 unsigned long end, struct mm_walk *walk)
a6198797 1150{
af9de7eb 1151 struct clear_refs_private *cp = walk->private;
5c64f52a 1152 struct vm_area_struct *vma = walk->vma;
1153 pte_t *pte, ptent;
1154 spinlock_t *ptl;
1155 struct page *page;
1156
1157 ptl = pmd_trans_huge_lock(pmd, vma);
1158 if (ptl) {
1159 if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
1160 clear_soft_dirty_pmd(vma, addr, pmd);
1161 goto out;
1162 }
1163
1164 if (!pmd_present(*pmd))
1165 goto out;
1166
1167 page = pmd_page(*pmd);
1168
1169 /* Clear accessed and referenced bits. */
1170 pmdp_test_and_clear_young(vma, addr, pmd);
33c3fc71 1171 test_and_clear_page_young(page);
1172 ClearPageReferenced(page);
1173out:
1174 spin_unlock(ptl);
1175 return 0;
1176 }
1177
1178 if (pmd_trans_unstable(pmd))
1179 return 0;
03319327 1180
1181 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
1182 for (; addr != end; pte++, addr += PAGE_SIZE) {
1183 ptent = *pte;
a6198797 1184
1185 if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
1186 clear_soft_dirty(vma, addr, pte);
1187 continue;
1188 }
1189
1190 if (!pte_present(ptent))
1191 continue;
1192
1193 page = vm_normal_page(vma, addr, ptent);
1194 if (!page)
1195 continue;
1196
1197 /* Clear accessed and referenced bits. */
1198 ptep_test_and_clear_young(vma, addr, pte);
33c3fc71 1199 test_and_clear_page_young(page);
1200 ClearPageReferenced(page);
1201 }
1202 pte_unmap_unlock(pte - 1, ptl);
1203 cond_resched();
1204 return 0;
1205}
1206
1207static int clear_refs_test_walk(unsigned long start, unsigned long end,
1208 struct mm_walk *walk)
1209{
1210 struct clear_refs_private *cp = walk->private;
1211 struct vm_area_struct *vma = walk->vma;
1212
1213 if (vma->vm_flags & VM_PFNMAP)
1214 return 1;
1215
1216 /*
1217 * Writing 1 to /proc/pid/clear_refs affects all pages.
1218 * Writing 2 to /proc/pid/clear_refs only affects anonymous pages.
1219 * Writing 3 to /proc/pid/clear_refs only affects file mapped pages.
1220 * Writing 4 to /proc/pid/clear_refs affects all pages.
1221 */
1222 if (cp->type == CLEAR_REFS_ANON && vma->vm_file)
1223 return 1;
1224 if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file)
1225 return 1;
1226 return 0;
1227}
1228
1229static const struct mm_walk_ops clear_refs_walk_ops = {
1230 .pmd_entry = clear_refs_pte_range,
1231 .test_walk = clear_refs_test_walk,
1232};
1233
1234static ssize_t clear_refs_write(struct file *file, const char __user *buf,
1235 size_t count, loff_t *ppos)
b813e931 1236{
f248dcb3 1237 struct task_struct *task;
fb92a4b0 1238 char buffer[PROC_NUMBUF];
f248dcb3 1239 struct mm_struct *mm;
b813e931 1240 struct vm_area_struct *vma;
1241 enum clear_refs_types type;
1242 int itype;
0a8cb8e3 1243 int rv;
b813e931 1244
1245 memset(buffer, 0, sizeof(buffer));
1246 if (count > sizeof(buffer) - 1)
1247 count = sizeof(buffer) - 1;
1248 if (copy_from_user(buffer, buf, count))
1249 return -EFAULT;
040fa020 1250 rv = kstrtoint(strstrip(buffer), 10, &itype);
1251 if (rv < 0)
1252 return rv;
1253 type = (enum clear_refs_types)itype;
1254 if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
f248dcb3 1255 return -EINVAL;
541c237c 1256
496ad9aa 1257 task = get_proc_task(file_inode(file));
1258 if (!task)
1259 return -ESRCH;
1260 mm = get_task_mm(task);
1261 if (mm) {
ac46d4f3 1262 struct mmu_notifier_range range;
af9de7eb 1263 struct clear_refs_private cp = {
0f8975ec 1264 .type = type,
af9de7eb 1265 };
695f0559 1266
1267 if (mmap_write_lock_killable(mm)) {
1268 count = -EINTR;
1269 goto out_mm;
1270 }
1271 if (type == CLEAR_REFS_MM_HIWATER_RSS) {
1272 /*
1273 * Writing 5 to /proc/pid/clear_refs resets the peak
1274 * resident set size to this mm's current rss value.
1275 */
695f0559 1276 reset_mm_hiwater_rss(mm);
29a951df 1277 goto out_unlock;
1278 }
1279
1280 if (type == CLEAR_REFS_SOFT_DIRTY) {
1281 for (vma = mm->mmap; vma; vma = vma->vm_next) {
1282 if (!(vma->vm_flags & VM_SOFTDIRTY))
1283 continue;
1284 vma->vm_flags &= ~VM_SOFTDIRTY;
1285 vma_set_page_prot(vma);
64e45507 1286 }
ac46d4f3 1287
912efa17 1288 inc_tlb_flush_pending(mm);
1289 mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
1290 0, NULL, mm, 0, -1UL);
ac46d4f3 1291 mmu_notifier_invalidate_range_start(&range);
64e45507 1292 }
1293 walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops,
1294 &cp);
912efa17 1295 if (type == CLEAR_REFS_SOFT_DIRTY) {
ac46d4f3 1296 mmu_notifier_invalidate_range_end(&range);
1297 flush_tlb_mm(mm);
1298 dec_tlb_flush_pending(mm);
1299 }
1300out_unlock:
1301 mmap_write_unlock(mm);
695f0559 1302out_mm:
1303 mmput(mm);
1304 }
1305 put_task_struct(task);
1306
1307 return count;
1308}
1309
1310const struct file_operations proc_clear_refs_operations = {
1311 .write = clear_refs_write,
6038f373 1312 .llseek = noop_llseek,
1313};
1314
1315typedef struct {
1316 u64 pme;
1317} pagemap_entry_t;
1318
85863e47 1319struct pagemapread {
8c829622 1320 int pos, len; /* units: PM_ENTRY_BYTES, not bytes */
092b50ba 1321 pagemap_entry_t *buffer;
1c90308e 1322 bool show_pfn;
1323};
1324
1325#define PAGEMAP_WALK_SIZE (PMD_SIZE)
1326#define PAGEMAP_WALK_MASK (PMD_MASK)
1327
1328#define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
1329#define PM_PFRAME_BITS 55
1330#define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
1331#define PM_SOFT_DIRTY BIT_ULL(55)
77bb499b 1332#define PM_MMAP_EXCLUSIVE BIT_ULL(56)
fb8e37f3 1333#define PM_UFFD_WP BIT_ULL(57)
1334#define PM_FILE BIT_ULL(61)
1335#define PM_SWAP BIT_ULL(62)
1336#define PM_PRESENT BIT_ULL(63)
1337
1338#define PM_END_OF_BUFFER 1
1339
deb94544 1340static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
092b50ba 1341{
deb94544 1342 return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
1343}
1344
1345static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
1346 struct pagemapread *pm)
1347{
092b50ba 1348 pm->buffer[pm->pos++] = *pme;
d82ef020 1349 if (pm->pos >= pm->len)
aae8679b 1350 return PM_END_OF_BUFFER;
1351 return 0;
1352}
1353
1354static int pagemap_pte_hole(unsigned long start, unsigned long end,
b7a16c7a 1355 __always_unused int depth, struct mm_walk *walk)
85863e47 1356{
2165009b 1357 struct pagemapread *pm = walk->private;
68b5a652 1358 unsigned long addr = start;
85863e47 1359 int err = 0;
092b50ba 1360
1361 while (addr < end) {
1362 struct vm_area_struct *vma = find_vma(walk->mm, addr);
deb94544 1363 pagemap_entry_t pme = make_pme(0, 0);
1364 /* End of address space hole, which we mark as non-present. */
1365 unsigned long hole_end;
68b5a652 1366
1367 if (vma)
1368 hole_end = min(end, vma->vm_start);
1369 else
1370 hole_end = end;
1371
1372 for (; addr < hole_end; addr += PAGE_SIZE) {
1373 err = add_to_pagemap(addr, &pme, pm);
1374 if (err)
1375 goto out;
1376 }
1377
1378 if (!vma)
1379 break;
1380
1381 /* Addresses in the VMA. */
1382 if (vma->vm_flags & VM_SOFTDIRTY)
deb94544 1383 pme = make_pme(0, PM_SOFT_DIRTY);
87e6d49a 1384 for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
1385 err = add_to_pagemap(addr, &pme, pm);
1386 if (err)
1387 goto out;
1388 }
85863e47 1389 }
68b5a652 1390out:
1391 return err;
1392}
1393
deb94544 1394static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
052fb0d6 1395 struct vm_area_struct *vma, unsigned long addr, pte_t pte)
85863e47 1396{
deb94544 1397 u64 frame = 0, flags = 0;
052fb0d6 1398 struct page *page = NULL;
24d7275c 1399 bool migration = false;
85863e47 1400
052fb0d6 1401 if (pte_present(pte)) {
1402 if (pm->show_pfn)
1403 frame = pte_pfn(pte);
deb94544 1404 flags |= PM_PRESENT;
25b2995a 1405 page = vm_normal_page(vma, addr, pte);
e9cdd6e7 1406 if (pte_soft_dirty(pte))
deb94544 1407 flags |= PM_SOFT_DIRTY;
1408 if (pte_uffd_wp(pte))
1409 flags |= PM_UFFD_WP;
052fb0d6 1410 } else if (is_swap_pte(pte)) {
1411 swp_entry_t entry;
1412 if (pte_swp_soft_dirty(pte))
deb94544 1413 flags |= PM_SOFT_DIRTY;
1414 if (pte_swp_uffd_wp(pte))
1415 flags |= PM_UFFD_WP;
179ef71c 1416 entry = pte_to_swp_entry(pte);
1417 if (pm->show_pfn)
1418 frame = swp_type(entry) |
1419 (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
deb94544 1420 flags |= PM_SWAP;
24d7275c 1421 migration = is_migration_entry(entry);
1422 if (is_pfn_swap_entry(entry))
1423 page = pfn_swap_entry_to_page(entry);
1424 }
1425
1426 if (page && !PageAnon(page))
1427 flags |= PM_FILE;
24d7275c 1428 if (page && !migration && page_mapcount(page) == 1)
77bb499b 1429 flags |= PM_MMAP_EXCLUSIVE;
1430 if (vma->vm_flags & VM_SOFTDIRTY)
1431 flags |= PM_SOFT_DIRTY;
052fb0d6 1432
deb94544 1433 return make_pme(frame, flags);
1434}
1435
356515e7 1436static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
2165009b 1437 struct mm_walk *walk)
85863e47 1438{
f995ece2 1439 struct vm_area_struct *vma = walk->vma;
2165009b 1440 struct pagemapread *pm = walk->private;
bf929152 1441 spinlock_t *ptl;
05fbf357 1442 pte_t *pte, *orig_pte;
85863e47 1443 int err = 0;
356515e7 1444#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1445 bool migration = false;
1446
1447 ptl = pmd_trans_huge_lock(pmdp, vma);
1448 if (ptl) {
1449 u64 flags = 0, frame = 0;
1450 pmd_t pmd = *pmdp;
84c3fc4e 1451 struct page *page = NULL;
0f8975ec 1452
b83d7e43 1453 if (vma->vm_flags & VM_SOFTDIRTY)
deb94544 1454 flags |= PM_SOFT_DIRTY;
d9104d1c 1455
356515e7 1456 if (pmd_present(pmd)) {
84c3fc4e 1457 page = pmd_page(pmd);
77bb499b 1458
356515e7 1459 flags |= PM_PRESENT;
1460 if (pmd_soft_dirty(pmd))
1461 flags |= PM_SOFT_DIRTY;
1462 if (pmd_uffd_wp(pmd))
1463 flags |= PM_UFFD_WP;
1464 if (pm->show_pfn)
1465 frame = pmd_pfn(pmd) +
1466 ((addr & ~PMD_MASK) >> PAGE_SHIFT);
356515e7 1467 }
1468#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
1469 else if (is_swap_pmd(pmd)) {
1470 swp_entry_t entry = pmd_to_swp_entry(pmd);
ab6ecf24 1471 unsigned long offset;
84c3fc4e 1472
1473 if (pm->show_pfn) {
1474 offset = swp_offset(entry) +
1475 ((addr & ~PMD_MASK) >> PAGE_SHIFT);
1476 frame = swp_type(entry) |
1477 (offset << MAX_SWAPFILES_SHIFT);
1478 }
84c3fc4e 1479 flags |= PM_SWAP;
1480 if (pmd_swp_soft_dirty(pmd))
1481 flags |= PM_SOFT_DIRTY;
1482 if (pmd_swp_uffd_wp(pmd))
1483 flags |= PM_UFFD_WP;
84c3fc4e 1484 VM_BUG_ON(!is_pmd_migration_entry(pmd));
24d7275c 1485 migration = is_migration_entry(entry);
af5cdaf8 1486 page = pfn_swap_entry_to_page(entry);
1487 }
1488#endif
1489
24d7275c 1490 if (page && !migration && page_mapcount(page) == 1)
84c3fc4e 1491 flags |= PM_MMAP_EXCLUSIVE;
356515e7 1492
025c5b24 1493 for (; addr != end; addr += PAGE_SIZE) {
356515e7 1494 pagemap_entry_t pme = make_pme(frame, flags);
025c5b24 1495
092b50ba 1496 err = add_to_pagemap(addr, &pme, pm);
1497 if (err)
1498 break;
1499 if (pm->show_pfn) {
1500 if (flags & PM_PRESENT)
1501 frame++;
1502 else if (flags & PM_SWAP)
1503 frame += (1 << MAX_SWAPFILES_SHIFT);
1504 }
5aaabe83 1505 }
bf929152 1506 spin_unlock(ptl);
025c5b24 1507 return err;
1508 }
1509
356515e7 1510 if (pmd_trans_unstable(pmdp))
45f83cef 1511 return 0;
356515e7 1512#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
81d0fa62 1513
1514 /*
 1515 * We can assume that @vma always points to a valid VMA and @end never
1516 * goes beyond vma->vm_end.
1517 */
356515e7 1518 orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
1519 for (; addr < end; pte++, addr += PAGE_SIZE) {
1520 pagemap_entry_t pme;
05fbf357 1521
deb94544 1522 pme = pte_to_pagemap_entry(pm, vma, addr, *pte);
f995ece2 1523 err = add_to_pagemap(addr, &pme, pm);
05fbf357 1524 if (err)
81d0fa62 1525 break;
85863e47 1526 }
f995ece2 1527 pte_unmap_unlock(orig_pte, ptl);
1528
1529 cond_resched();
1530
1531 return err;
1532}
1533
1a5cb814 1534#ifdef CONFIG_HUGETLB_PAGE
116354d1 1535/* This function walks within one hugetlb entry in the single call */
356515e7 1536static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
1537 unsigned long addr, unsigned long end,
1538 struct mm_walk *walk)
5dc37642 1539{
5dc37642 1540 struct pagemapread *pm = walk->private;
f995ece2 1541 struct vm_area_struct *vma = walk->vma;
356515e7 1542 u64 flags = 0, frame = 0;
5dc37642 1543 int err = 0;
356515e7 1544 pte_t pte;
5dc37642 1545
f995ece2 1546 if (vma->vm_flags & VM_SOFTDIRTY)
deb94544 1547 flags |= PM_SOFT_DIRTY;
d9104d1c 1548
1549 pte = huge_ptep_get(ptep);
1550 if (pte_present(pte)) {
1551 struct page *page = pte_page(pte);
1552
1553 if (!PageAnon(page))
1554 flags |= PM_FILE;
1555
1556 if (page_mapcount(page) == 1)
1557 flags |= PM_MMAP_EXCLUSIVE;
1558
356515e7 1559 flags |= PM_PRESENT;
1560 if (pm->show_pfn)
1561 frame = pte_pfn(pte) +
1562 ((addr & ~hmask) >> PAGE_SHIFT);
1563 }
1564
5dc37642 1565 for (; addr != end; addr += PAGE_SIZE) {
1566 pagemap_entry_t pme = make_pme(frame, flags);
1567
092b50ba 1568 err = add_to_pagemap(addr, &pme, pm);
1569 if (err)
1570 return err;
1c90308e 1571 if (pm->show_pfn && (flags & PM_PRESENT))
356515e7 1572 frame++;
1573 }
1574
1575 cond_resched();
1576
1577 return err;
1578}
1579#else
1580#define pagemap_hugetlb_range NULL
1a5cb814 1581#endif /* HUGETLB_PAGE */
5dc37642 1582
1583static const struct mm_walk_ops pagemap_ops = {
1584 .pmd_entry = pagemap_pmd_range,
1585 .pte_hole = pagemap_pte_hole,
1586 .hugetlb_entry = pagemap_hugetlb_range,
1587};
1588
1589/*
1590 * /proc/pid/pagemap - an array mapping virtual pages to pfns
1591 *
1592 * For each page in the address space, this file contains one 64-bit entry
1593 * consisting of the following:
1594 *
052fb0d6 1595 * Bits 0-54 page frame number (PFN) if present
f16278c6 1596 * Bits 0-4 swap type if swapped
052fb0d6 1597 * Bits 5-54 swap offset if swapped
1ad1335d 1598 * Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst)
77bb499b 1599 * Bit 56 page exclusively mapped
1600 * Bit 57 pte is uffd-wp write-protected
1601 * Bits 58-60 zero
052fb0d6 1602 * Bit 61 page is file-page or shared-anon
1603 * Bit 62 page swapped
1604 * Bit 63 page present
1605 *
1606 * If the page is not present but in swap, then the PFN contains an
1607 * encoding of the swap file number and the page's offset into the
1608 * swap. Unmapped pages return a null PFN. This allows determining
1609 * precisely which pages are mapped (or in swap) and comparing mapped
1610 * pages between processes.
1611 *
1612 * Efficient users of this interface will use /proc/pid/maps to
1613 * determine which areas of memory are actually mapped and llseek to
1614 * skip over unmapped regions.
1615 */
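/*
 * Illustrative example: to inspect virtual address vaddr, userspace
 * seeks to (vaddr / PAGE_SIZE) * 8 in this file and reads one 64-bit
 * entry; an entry of 0x8100000000001234 decodes as
 * PM_PRESENT | PM_MMAP_EXCLUSIVE with page frame number 0x1234.
 */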
1616static ssize_t pagemap_read(struct file *file, char __user *buf,
1617 size_t count, loff_t *ppos)
1618{
a06db751 1619 struct mm_struct *mm = file->private_data;
85863e47 1620 struct pagemapread pm;
1621 unsigned long src;
1622 unsigned long svpfn;
1623 unsigned long start_vaddr;
1624 unsigned long end_vaddr;
a06db751 1625 int ret = 0, copied = 0;
85863e47 1626
388f7934 1627 if (!mm || !mmget_not_zero(mm))
1628 goto out;
1629
1630 ret = -EINVAL;
1631 /* file position must be aligned */
aae8679b 1632 if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
a06db751 1633 goto out_mm;
1634
1635 ret = 0;
08161786 1636 if (!count)
a06db751 1637 goto out_mm;
08161786 1638
1639 /* do not disclose physical addresses: attack vector */
1640 pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
1641
8c829622 1642 pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
6da2ec56 1643 pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL);
5d7e0d2b 1644 ret = -ENOMEM;
d82ef020 1645 if (!pm.buffer)
a06db751 1646 goto out_mm;
85863e47 1647
1648 src = *ppos;
1649 svpfn = src / PM_ENTRY_BYTES;
a06db751 1650 end_vaddr = mm->task_size;
1651
1652 /* watch out for wraparound */
1653 start_vaddr = end_vaddr;
1654 if (svpfn <= (ULONG_MAX >> PAGE_SHIFT))
1655 start_vaddr = untagged_addr(svpfn << PAGE_SHIFT);
1656
1657 /* Ensure the address is inside the task */
1658 if (start_vaddr > mm->task_size)
1659 start_vaddr = end_vaddr;
1660
1661 /*
1662 * The odds are that this will stop walking way
1663 * before end_vaddr, because the length of the
1664 * user buffer is tracked in "pm", and the walk
1665 * will stop when we hit the end of the buffer.
1666 */
1667 ret = 0;
1668 while (count && (start_vaddr < end_vaddr)) {
1669 int len;
1670 unsigned long end;
1671
1672 pm.pos = 0;
ea251c1d 1673 end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
1674 /* overflow ? */
1675 if (end < start_vaddr || end > end_vaddr)
1676 end = end_vaddr;
d8ed45c5 1677 ret = mmap_read_lock_killable(mm);
1678 if (ret)
1679 goto out_free;
7b86ac33 1680 ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm);
d8ed45c5 1681 mmap_read_unlock(mm);
1682 start_vaddr = end;
1683
1684 len = min(count, PM_ENTRY_BYTES * pm.pos);
309361e0 1685 if (copy_to_user(buf, pm.buffer, len)) {
d82ef020 1686 ret = -EFAULT;
a06db751 1687 goto out_free;
1688 }
1689 copied += len;
1690 buf += len;
1691 count -= len;
85863e47 1692 }
1693 *ppos += copied;
1694 if (!ret || ret == PM_END_OF_BUFFER)
1695 ret = copied;
1696
1697out_free:
1698 kfree(pm.buffer);
1699out_mm:
1700 mmput(mm);
1701out:
1702 return ret;
1703}
1704
1705static int pagemap_open(struct inode *inode, struct file *file)
1706{
1707 struct mm_struct *mm;
1708
1709 mm = proc_mem_open(inode, PTRACE_MODE_READ);
1710 if (IS_ERR(mm))
1711 return PTR_ERR(mm);
1712 file->private_data = mm;
1713 return 0;
1714}
1715
1716static int pagemap_release(struct inode *inode, struct file *file)
1717{
1718 struct mm_struct *mm = file->private_data;
1719
1720 if (mm)
1721 mmdrop(mm);
1722 return 0;
1723}
1724
1725const struct file_operations proc_pagemap_operations = {
1726 .llseek = mem_lseek, /* borrow this */
1727 .read = pagemap_read,
541c237c 1728 .open = pagemap_open,
a06db751 1729 .release = pagemap_release,
85863e47 1730};
1e883281 1731#endif /* CONFIG_PROC_PAGE_MONITOR */
85863e47 1732
6e21c8f1 1733#ifdef CONFIG_NUMA
6e21c8f1 1734
f69ff943 1735struct numa_maps {
1736 unsigned long pages;
1737 unsigned long anon;
1738 unsigned long active;
1739 unsigned long writeback;
1740 unsigned long mapcount_max;
1741 unsigned long dirty;
1742 unsigned long swapcache;
1743 unsigned long node[MAX_NUMNODES];
1744};
1745
1746struct numa_maps_private {
1747 struct proc_maps_private proc_maps;
1748 struct numa_maps md;
1749};
1750
1751static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
1752 unsigned long nr_pages)
1753{
1754 int count = page_mapcount(page);
1755
eb4866d0 1756 md->pages += nr_pages;
f69ff943 1757 if (pte_dirty || PageDirty(page))
eb4866d0 1758 md->dirty += nr_pages;
1759
1760 if (PageSwapCache(page))
eb4866d0 1761 md->swapcache += nr_pages;
1762
1763 if (PageActive(page) || PageUnevictable(page))
eb4866d0 1764 md->active += nr_pages;
1765
1766 if (PageWriteback(page))
eb4866d0 1767 md->writeback += nr_pages;
1768
1769 if (PageAnon(page))
eb4866d0 1770 md->anon += nr_pages;
1771
1772 if (count > md->mapcount_max)
1773 md->mapcount_max = count;
1774
eb4866d0 1775 md->node[page_to_nid(page)] += nr_pages;
1776}
1777
1778static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
1779 unsigned long addr)
1780{
1781 struct page *page;
1782 int nid;
1783
1784 if (!pte_present(pte))
1785 return NULL;
1786
1787 page = vm_normal_page(vma, addr, pte);
1788 if (!page)
1789 return NULL;
1790
1791 if (PageReserved(page))
1792 return NULL;
1793
1794 nid = page_to_nid(page);
4ff1b2c2 1795 if (!node_isset(nid, node_states[N_MEMORY]))
1796 return NULL;
1797
1798 return page;
1799}
1800
1801#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1802static struct page *can_gather_numa_stats_pmd(pmd_t pmd,
1803 struct vm_area_struct *vma,
1804 unsigned long addr)
1805{
1806 struct page *page;
1807 int nid;
1808
1809 if (!pmd_present(pmd))
1810 return NULL;
1811
1812 page = vm_normal_page_pmd(vma, addr, pmd);
1813 if (!page)
1814 return NULL;
1815
1816 if (PageReserved(page))
1817 return NULL;
1818
1819 nid = page_to_nid(page);
1820 if (!node_isset(nid, node_states[N_MEMORY]))
1821 return NULL;
1822
1823 return page;
1824}
1825#endif
1826
1827static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
1828 unsigned long end, struct mm_walk *walk)
1829{
1830 struct numa_maps *md = walk->private;
1831 struct vm_area_struct *vma = walk->vma;
1832 spinlock_t *ptl;
1833 pte_t *orig_pte;
1834 pte_t *pte;
1835
28093f9f 1836#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1837 ptl = pmd_trans_huge_lock(pmd, vma);
1838 if (ptl) {
1839 struct page *page;
1840
28093f9f 1841 page = can_gather_numa_stats_pmd(*pmd, vma, addr);
025c5b24 1842 if (page)
28093f9f 1843 gather_stats(page, md, pmd_dirty(*pmd),
025c5b24 1844 HPAGE_PMD_SIZE/PAGE_SIZE);
bf929152 1845 spin_unlock(ptl);
025c5b24 1846 return 0;
1847 }
1848
1849 if (pmd_trans_unstable(pmd))
1850 return 0;
28093f9f 1851#endif
1852 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
1853 do {
d85f4d6d 1854 struct page *page = can_gather_numa_stats(*pte, vma, addr);
1855 if (!page)
1856 continue;
eb4866d0 1857 gather_stats(page, md, pte_dirty(*pte), 1);
1858
1859 } while (pte++, addr += PAGE_SIZE, addr != end);
1860 pte_unmap_unlock(orig_pte, ptl);
a66c0410 1861 cond_resched();
1862 return 0;
1863}
1864#ifdef CONFIG_HUGETLB_PAGE
632fd60f 1865static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
1866 unsigned long addr, unsigned long end, struct mm_walk *walk)
1867{
5c2ff95e 1868 pte_t huge_pte = huge_ptep_get(pte);
1869 struct numa_maps *md;
1870 struct page *page;
1871
5c2ff95e 1872 if (!pte_present(huge_pte))
1873 return 0;
1874
5c2ff95e 1875 page = pte_page(huge_pte);
1876
1877 md = walk->private;
5c2ff95e 1878 gather_stats(page, md, pte_dirty(huge_pte), 1);
1879 return 0;
1880}
1881
1882#else
632fd60f 1883static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
1884 unsigned long addr, unsigned long end, struct mm_walk *walk)
1885{
1886 return 0;
1887}
1888#endif
1889
1890static const struct mm_walk_ops show_numa_ops = {
1891 .hugetlb_entry = gather_hugetlb_stats,
1892 .pmd_entry = gather_pte_stats,
1893};
1894
1895/*
1896 * Display pages allocated per node and memory policy via /proc.
1897 */
871305bb 1898static int show_numa_map(struct seq_file *m, void *v)
f69ff943 1899{
1900 struct numa_maps_private *numa_priv = m->private;
1901 struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
f69ff943 1902 struct vm_area_struct *vma = v;
5b52fc89 1903 struct numa_maps *md = &numa_priv->md;
1904 struct file *file = vma->vm_file;
1905 struct mm_struct *mm = vma->vm_mm;
f69ff943 1906 struct mempolicy *pol;
1907 char buffer[64];
1908 int nid;
1909
1910 if (!mm)
1911 return 0;
1912
1913 /* Ensure we start with an empty set of numa_maps statistics. */
1914 memset(md, 0, sizeof(*md));
f69ff943 1915
1916 pol = __get_vma_policy(vma, vma->vm_start);
1917 if (pol) {
1918 mpol_to_str(buffer, sizeof(buffer), pol);
1919 mpol_cond_put(pol);
1920 } else {
1921 mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy);
1922 }
1923
1924 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1925
1926 if (file) {
17c2b4ee 1927 seq_puts(m, " file=");
2726d566 1928 seq_file_path(m, file, "\n\t= ");
f69ff943 1929 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
17c2b4ee 1930 seq_puts(m, " heap");
1240ea0d 1931 } else if (is_stack(vma)) {
65376df5 1932 seq_puts(m, " stack");
1933 }
1934
fc360bd9 1935 if (is_vm_hugetlb_page(vma))
17c2b4ee 1936 seq_puts(m, " huge");
fc360bd9 1937
c1e8d7c6 1938 /* mmap_lock is held by m_start */
7b86ac33 1939 walk_page_vma(vma, &show_numa_ops, md);
1940
1941 if (!md->pages)
1942 goto out;
1943
1944 if (md->anon)
1945 seq_printf(m, " anon=%lu", md->anon);
1946
1947 if (md->dirty)
1948 seq_printf(m, " dirty=%lu", md->dirty);
1949
1950 if (md->pages != md->anon && md->pages != md->dirty)
1951 seq_printf(m, " mapped=%lu", md->pages);
1952
1953 if (md->mapcount_max > 1)
1954 seq_printf(m, " mapmax=%lu", md->mapcount_max);
1955
1956 if (md->swapcache)
1957 seq_printf(m, " swapcache=%lu", md->swapcache);
1958
1959 if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1960 seq_printf(m, " active=%lu", md->active);
1961
1962 if (md->writeback)
1963 seq_printf(m, " writeback=%lu", md->writeback);
1964
1965 for_each_node_state(nid, N_MEMORY)
1966 if (md->node[nid])
1967 seq_printf(m, " N%d=%lu", nid, md->node[nid]);
1968
1969 seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10);
1970out:
1971 seq_putc(m, '\n');
1972 return 0;
1973}
5b52fc89 1974
03a44825 1975static const struct seq_operations proc_pid_numa_maps_op = {
1976 .start = m_start,
1977 .next = m_next,
1978 .stop = m_stop,
871305bb 1979 .show = show_numa_map,
6e21c8f1 1980};
662795de 1981
1982static int pid_numa_maps_open(struct inode *inode, struct file *file)
1983{
1984 return proc_maps_open(inode, file, &proc_pid_numa_maps_op,
1985 sizeof(struct numa_maps_private));
1986}
1987
1988const struct file_operations proc_pid_numa_maps_operations = {
1989 .open = pid_numa_maps_open,
1990 .read = seq_read,
1991 .llseek = seq_lseek,
29a40ace 1992 .release = proc_map_release,
1993};
1994
f69ff943 1995#endif /* CONFIG_NUMA */