mm: remove vmacache
[linux-2.6-block.git] / mm / vmstat.c
457c8996 1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * linux/mm/vmstat.c
4 *
5 * Manages VM statistics
6 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
7 *
8 * zoned VM statistics
9 * Copyright (C) 2006 Silicon Graphics, Inc.,
10 * Christoph Lameter <christoph@lameter.com>
7cc36bbd 11 * Copyright (C) 2008-2014 Christoph Lameter
f6ac2354 12 */
8f32f7e5 13#include <linux/fs.h>
f6ac2354 14#include <linux/mm.h>
4e950f6f 15#include <linux/err.h>
2244b95a 16#include <linux/module.h>
5a0e3ad6 17#include <linux/slab.h>
df9ecaba 18#include <linux/cpu.h>
7cc36bbd 19#include <linux/cpumask.h>
c748e134 20#include <linux/vmstat.h>
21#include <linux/proc_fs.h>
22#include <linux/seq_file.h>
23#include <linux/debugfs.h>
e8edc6e0 24#include <linux/sched.h>
f1a5ab12 25#include <linux/math64.h>
79da826a 26#include <linux/writeback.h>
36deb0be 27#include <linux/compaction.h>
6e543d57 28#include <linux/mm_inline.h>
29#include <linux/page_ext.h>
30#include <linux/page_owner.h>
31
32#include "internal.h"
f6ac2354 33
34#ifdef CONFIG_NUMA
35int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
36
37/* zero numa counters within a zone */
38static void zero_zone_numa_counters(struct zone *zone)
39{
40 int item, cpu;
41
42 for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) {
43 atomic_long_set(&zone->vm_numa_event[item], 0);
44 for_each_online_cpu(cpu) {
45 per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item]
4518085e 46 = 0;
f19298b9 47 }
48 }
49}
50
51/* zero numa counters of all the populated zones */
52static void zero_zones_numa_counters(void)
53{
54 struct zone *zone;
55
56 for_each_populated_zone(zone)
57 zero_zone_numa_counters(zone);
58}
59
60/* zero global numa counters */
61static void zero_global_numa_counters(void)
62{
63 int item;
64
65 for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
66 atomic_long_set(&vm_numa_event[item], 0);
67}
68
69static void invalid_numa_statistics(void)
70{
71 zero_zones_numa_counters();
72 zero_global_numa_counters();
73}
74
75static DEFINE_MUTEX(vm_numa_stat_lock);
76
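/* Handler for the vm.numa_stat sysctl: flips the vm_numa_stat_key static branch and clears all NUMA counters when collection is turned off. */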
77int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
32927393 78 void *buffer, size_t *length, loff_t *ppos)
79{
80 int ret, oldval;
81
82 mutex_lock(&vm_numa_stat_lock);
83 if (write)
84 oldval = sysctl_vm_numa_stat;
85 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
86 if (ret || !write)
87 goto out;
88
89 if (oldval == sysctl_vm_numa_stat)
90 goto out;
91 else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
92 static_branch_enable(&vm_numa_stat_key);
93 pr_info("enable numa statistics\n");
94 } else {
95 static_branch_disable(&vm_numa_stat_key);
96 invalid_numa_statistics();
97 pr_info("disable numa statistics, and clear numa counters\n");
98 }
99
100out:
101 mutex_unlock(&vm_numa_stat_lock);
102 return ret;
103}
104#endif
105
106#ifdef CONFIG_VM_EVENT_COUNTERS
107DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
108EXPORT_PER_CPU_SYMBOL(vm_event_states);
109
31f961a8 110static void sum_vm_events(unsigned long *ret)
f8891e5e 111{
9eccf2a8 112 int cpu;
113 int i;
114
115 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
116
31f961a8 117 for_each_online_cpu(cpu) {
118 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
119
120 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
121 ret[i] += this->event[i];
122 }
123}
124
125/*
126 * Accumulate the vm event counters across all CPUs.
127 * The result is unavoidably approximate - it can change
128 * during and after execution of this function.
129*/
130void all_vm_events(unsigned long *ret)
131{
7625eccd 132 cpus_read_lock();
31f961a8 133 sum_vm_events(ret);
7625eccd 134 cpus_read_unlock();
f8891e5e 135}
32dd66fc 136EXPORT_SYMBOL_GPL(all_vm_events);
f8891e5e 137
138/*
139 * Fold the foreign cpu events into our own.
140 *
141 * This is adding to the events on one processor
142 * but keeps the global counts constant.
143 */
144void vm_events_fold_cpu(int cpu)
145{
146 struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
147 int i;
148
149 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
150 count_vm_events(i, fold_state->event[i]);
151 fold_state->event[i] = 0;
152 }
153}
154
155#endif /* CONFIG_VM_EVENT_COUNTERS */
156
157/*
158 * Manage combined zone based / global counters
159 *
160 * vm_stat contains the global counters
161 */
162atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
163atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
f19298b9 164atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
165EXPORT_SYMBOL(vm_zone_stat);
166EXPORT_SYMBOL(vm_node_stat);
2244b95a 167
168#ifdef CONFIG_NUMA
169static void fold_vm_zone_numa_events(struct zone *zone)
170{
171 unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
172 int cpu;
173 enum numa_stat_item item;
174
175 for_each_online_cpu(cpu) {
176 struct per_cpu_zonestat *pzstats;
177
178 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
179 for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
180 zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
181 }
182
183 for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
184 zone_numa_event_add(zone_numa_events[item], zone, item);
185}
186
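/* Fold the pending per-CPU NUMA event counts of every populated zone into the zone totals. */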
187void fold_vm_numa_events(void)
188{
189 struct zone *zone;
190
191 for_each_populated_zone(zone)
192 fold_vm_zone_numa_events(zone);
193}
194#endif
195
196#ifdef CONFIG_SMP
197
b44129b3 198int calculate_pressure_threshold(struct zone *zone)
199{
200 int threshold;
201 int watermark_distance;
202
203 /*
204 * As vmstats are not up to date, there is drift between the estimated
205 * and real values. For high thresholds and a high number of CPUs, it
206 * is possible for the min watermark to be breached while the estimated
207 * value looks fine. The pressure threshold is a reduced value such
208 * that even the maximum amount of drift will not accidentally breach
209 * the min watermark
210 */
211 watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
212 threshold = max(1, (int)(watermark_distance / num_online_cpus()));
213
214 /*
215 * Maximum threshold is 125
216 */
217 threshold = min(125, threshold);
218
219 return threshold;
220}
221
b44129b3 222int calculate_normal_threshold(struct zone *zone)
223{
224 int threshold;
225 int mem; /* memory in 128 MB units */
226
227 /*
228 * The threshold scales with the number of processors and the amount
229 * of memory per zone. More memory means that we can defer updates for
230 * longer, more processors could lead to more contention.
231 * fls() is used to have a cheap way of logarithmic scaling.
232 *
233 * Some sample thresholds:
234 *
ea15ba17 235 * Threshold Processors (fls) Zonesize fls(mem)+1
236 * ------------------------------------------------------------------
237 * 8 1 1 0.9-1 GB 4
238 * 16 2 2 0.9-1 GB 4
239 * 20 2 2 1-2 GB 5
240 * 24 2 2 2-4 GB 6
241 * 28 2 2 4-8 GB 7
242 * 32 2 2 8-16 GB 8
243 * 4 2 2 <128M 1
244 * 30 4 3 2-4 GB 5
245 * 48 4 3 8-16 GB 8
246 * 32 8 4 1-2 GB 4
247 * 32 8 4 0.9-1GB 4
248 * 10 16 5 <128M 1
249 * 40 16 5 900M 4
250 * 70 64 7 2-4 GB 5
251 * 84 64 7 4-8 GB 6
252 * 108 512 9 4-8 GB 6
253 * 125 1024 10 8-16 GB 8
254 * 125 1024 10 16-32 GB 9
255 */
256
9705bea5 257 mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT);
258
259 threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
260
261 /*
262 * Maximum threshold is 125
263 */
264 threshold = min(125, threshold);
265
266 return threshold;
267}
268
269/*
df9ecaba 270 * Refresh the thresholds for each zone.
2244b95a 271 */
a6cccdc3 272void refresh_zone_stat_thresholds(void)
2244b95a 273{
75ef7184 274 struct pglist_data *pgdat;
275 struct zone *zone;
276 int cpu;
277 int threshold;
278
279 /* Zero current pgdat thresholds */
280 for_each_online_pgdat(pgdat) {
281 for_each_online_cpu(cpu) {
282 per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
283 }
284 }
285
ee99c71c 286 for_each_populated_zone(zone) {
75ef7184 287 struct pglist_data *pgdat = zone->zone_pgdat;
288 unsigned long max_drift, tolerate_drift;
289
b44129b3 290 threshold = calculate_normal_threshold(zone);
df9ecaba 291
292 for_each_online_cpu(cpu) {
293 int pgdat_threshold;
294
28f836b6 295 per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
99dcc3e5 296 = threshold;
1d90ca89 297
298 /* Base nodestat threshold on the largest populated zone. */
299 pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
300 per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
301 = max(threshold, pgdat_threshold);
302 }
303
304 /*
305 * Only set percpu_drift_mark if there is a danger that
306 * NR_FREE_PAGES reports the low watermark is ok when in fact
307 * the min watermark could be breached by an allocation
308 */
309 tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
310 max_drift = num_online_cpus() * threshold;
311 if (max_drift > tolerate_drift)
312 zone->percpu_drift_mark = high_wmark_pages(zone) +
313 max_drift;
df9ecaba 314 }
315}
316
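/* Apply the threshold computed by @calculate_pressure to every zone of @pgdat that has a percpu_drift_mark set. */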
317void set_pgdat_percpu_threshold(pg_data_t *pgdat,
318 int (*calculate_pressure)(struct zone *))
319{
320 struct zone *zone;
321 int cpu;
322 int threshold;
323 int i;
324
325 for (i = 0; i < pgdat->nr_zones; i++) {
326 zone = &pgdat->node_zones[i];
327 if (!zone->percpu_drift_mark)
328 continue;
329
b44129b3 330 threshold = (*calculate_pressure)(zone);
1d90ca89 331 for_each_online_cpu(cpu)
28f836b6 332 per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
333 = threshold;
334 }
335}
336
2244b95a 337/*
338 * For use when we know that interrupts are disabled,
339 * or when we know that preemption is disabled and that
340 * particular counter cannot be updated from interrupt context.
341 */
342void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
6cdb18ad 343 long delta)
2244b95a 344{
28f836b6 345 struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
12938a92 346 s8 __percpu *p = pcp->vm_stat_diff + item;
2244b95a 347 long x;
348 long t;
349
350 /*
351 * Accurate vmstat updates require a RMW. On !PREEMPT_RT kernels,
352 * atomicity is provided by IRQs being disabled -- either explicitly
353 * or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables
354 * CPU migrations and preemption potentially corrupts a counter so
355 * disable preemption.
356 */
357 if (IS_ENABLED(CONFIG_PREEMPT_RT))
358 preempt_disable();
359
12938a92 360 x = delta + __this_cpu_read(*p);
2244b95a 361
12938a92 362 t = __this_cpu_read(pcp->stat_threshold);
2244b95a 363
40610076 364 if (unlikely(abs(x) > t)) {
365 zone_page_state_add(x, zone, item);
366 x = 0;
367 }
12938a92 368 __this_cpu_write(*p, x);
369
370 if (IS_ENABLED(CONFIG_PREEMPT_RT))
371 preempt_enable();
372}
373EXPORT_SYMBOL(__mod_zone_page_state);
374
375void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
376 long delta)
377{
378 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
379 s8 __percpu *p = pcp->vm_node_stat_diff + item;
380 long x;
381 long t;
382
ea426c2a 383 if (vmstat_item_in_bytes(item)) {
384 /*
385 * Only cgroups use subpage accounting right now; at
386 * the global level, these items still change in
387 * multiples of whole pages. Store them as pages
388 * internally to keep the per-cpu counters compact.
389 */
390 VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
391 delta >>= PAGE_SHIFT;
392 }
393
394 /* See __mod_node_page_state */
395 if (IS_ENABLED(CONFIG_PREEMPT_RT))
396 preempt_disable();
397
398 x = delta + __this_cpu_read(*p);
399
400 t = __this_cpu_read(pcp->stat_threshold);
401
40610076 402 if (unlikely(abs(x) > t)) {
403 node_page_state_add(x, pgdat, item);
404 x = 0;
405 }
406 __this_cpu_write(*p, x);
407
408 if (IS_ENABLED(CONFIG_PREEMPT_RT))
409 preempt_enable();
410}
411EXPORT_SYMBOL(__mod_node_page_state);
412
413/*
414 * Optimized increment and decrement functions.
415 *
416 * These are only for a single page and therefore can take a struct page *
417 * argument instead of struct zone *. This allows the inclusion of the code
418 * generated for page_zone(page) into the optimized functions.
419 *
420 * No overflow check is necessary and therefore the differential can be
421 * incremented or decremented in place which may allow the compilers to
422 * generate better code.
423 * The increment or decrement is known and therefore one boundary check can
424 * be omitted.
425 *
426 * NOTE: These functions are very performance sensitive. Change only
427 * with care.
428 *
429 * Some processors have inc/dec instructions that are atomic vs an interrupt.
430 * However, the code must first determine the differential location in a zone
431 * based on the processor number and then inc/dec the counter. There is no
432 * guarantee without disabling preemption that the processor will not change
433 * in between and therefore the atomicity vs. interrupt cannot be exploited
434 * in a useful way here.
435 */
c8785385 436void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
2244b95a 437{
28f836b6 438 struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
439 s8 __percpu *p = pcp->vm_stat_diff + item;
440 s8 v, t;
2244b95a 441
442 /* See __mod_node_page_state */
443 if (IS_ENABLED(CONFIG_PREEMPT_RT))
444 preempt_disable();
445
908ee0f1 446 v = __this_cpu_inc_return(*p);
447 t = __this_cpu_read(pcp->stat_threshold);
448 if (unlikely(v > t)) {
449 s8 overstep = t >> 1;
df9ecaba 450
451 zone_page_state_add(v + overstep, zone, item);
452 __this_cpu_write(*p, -overstep);
2244b95a 453 }
454
455 if (IS_ENABLED(CONFIG_PREEMPT_RT))
456 preempt_enable();
2244b95a 457}
ca889e6c 458
459void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
460{
461 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
462 s8 __percpu *p = pcp->vm_node_stat_diff + item;
463 s8 v, t;
464
465 VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
466
467 /* See __mod_node_page_state */
468 if (IS_ENABLED(CONFIG_PREEMPT_RT))
469 preempt_disable();
470
471 v = __this_cpu_inc_return(*p);
472 t = __this_cpu_read(pcp->stat_threshold);
473 if (unlikely(v > t)) {
474 s8 overstep = t >> 1;
475
476 node_page_state_add(v + overstep, pgdat, item);
477 __this_cpu_write(*p, -overstep);
478 }
479
480 if (IS_ENABLED(CONFIG_PREEMPT_RT))
481 preempt_enable();
482}
483
484void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
485{
486 __inc_zone_state(page_zone(page), item);
487}
488EXPORT_SYMBOL(__inc_zone_page_state);
489
490void __inc_node_page_state(struct page *page, enum node_stat_item item)
491{
492 __inc_node_state(page_pgdat(page), item);
493}
494EXPORT_SYMBOL(__inc_node_page_state);
495
c8785385 496void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
2244b95a 497{
28f836b6 498 struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
499 s8 __percpu *p = pcp->vm_stat_diff + item;
500 s8 v, t;
2244b95a 501
502 /* See __mod_node_page_state */
503 if (IS_ENABLED(CONFIG_PREEMPT_RT))
504 preempt_disable();
505
908ee0f1 506 v = __this_cpu_dec_return(*p);
507 t = __this_cpu_read(pcp->stat_threshold);
508 if (unlikely(v < - t)) {
509 s8 overstep = t >> 1;
2244b95a 510
511 zone_page_state_add(v - overstep, zone, item);
512 __this_cpu_write(*p, overstep);
2244b95a 513 }
514
515 if (IS_ENABLED(CONFIG_PREEMPT_RT))
516 preempt_enable();
2244b95a 517}
c8785385 518
519void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
520{
521 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
522 s8 __percpu *p = pcp->vm_node_stat_diff + item;
523 s8 v, t;
524
525 VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
526
527 /* See __mod_node_page_state */
528 if (IS_ENABLED(CONFIG_PREEMPT_RT))
529 preempt_disable();
530
531 v = __this_cpu_dec_return(*p);
532 t = __this_cpu_read(pcp->stat_threshold);
533 if (unlikely(v < - t)) {
534 s8 overstep = t >> 1;
535
536 node_page_state_add(v - overstep, pgdat, item);
537 __this_cpu_write(*p, overstep);
538 }
539
540 if (IS_ENABLED(CONFIG_PREEMPT_RT))
541 preempt_enable();
542}
543
544void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
545{
546 __dec_zone_state(page_zone(page), item);
547}
548EXPORT_SYMBOL(__dec_zone_page_state);
549
550void __dec_node_page_state(struct page *page, enum node_stat_item item)
551{
552 __dec_node_state(page_pgdat(page), item);
553}
554EXPORT_SYMBOL(__dec_node_page_state);
555
4156153c 556#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
557/*
558 * If we have cmpxchg_local support then we do not need to incur the overhead
559 * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
560 *
561 * mod_state() modifies the zone counter state through atomic per cpu
562 * operations.
563 *
564 * Overstep mode specifies how overstep should be handled:
565 * 0 No overstepping
566 * 1 Overstepping half of threshold
567 * -1 Overstepping minus half of threshold
568*/
569static inline void mod_zone_state(struct zone *zone,
570 enum zone_stat_item item, long delta, int overstep_mode)
7c839120 571{
28f836b6 572 struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
573 s8 __percpu *p = pcp->vm_stat_diff + item;
574 long o, n, t, z;
575
576 do {
577 z = 0; /* overflow to zone counters */
578
579 /*
580 * The fetching of the stat_threshold is racy. We may apply
581 * a counter threshold to the wrong cpu if we get
582 * rescheduled while executing here. However, the next
583 * counter update will apply the threshold again and
584 * therefore bring the counter under the threshold again.
585 *
586 * Most of the time the thresholds are the same anyways
587 * for all cpus in a zone.
588 */
589 t = this_cpu_read(pcp->stat_threshold);
590
591 o = this_cpu_read(*p);
592 n = delta + o;
593
40610076 594 if (abs(n) > t) {
595 int os = overstep_mode * (t >> 1) ;
596
597 /* Overflow must be added to zone counters */
598 z = n + os;
599 n = -os;
600 }
601 } while (this_cpu_cmpxchg(*p, o, n) != o);
602
603 if (z)
604 zone_page_state_add(z, zone, item);
605}
606
607void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
6cdb18ad 608 long delta)
7c839120 609{
75ef7184 610 mod_zone_state(zone, item, delta, 0);
611}
612EXPORT_SYMBOL(mod_zone_page_state);
613
614void inc_zone_page_state(struct page *page, enum zone_stat_item item)
615{
75ef7184 616 mod_zone_state(page_zone(page), item, 1, 1);
617}
618EXPORT_SYMBOL(inc_zone_page_state);
619
620void dec_zone_page_state(struct page *page, enum zone_stat_item item)
621{
75ef7184 622 mod_zone_state(page_zone(page), item, -1, -1);
623}
624EXPORT_SYMBOL(dec_zone_page_state);
625
626static inline void mod_node_state(struct pglist_data *pgdat,
627 enum node_stat_item item, int delta, int overstep_mode)
628{
629 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
630 s8 __percpu *p = pcp->vm_node_stat_diff + item;
631 long o, n, t, z;
632
ea426c2a 633 if (vmstat_item_in_bytes(item)) {
634 /*
635 * Only cgroups use subpage accounting right now; at
636 * the global level, these items still change in
637 * multiples of whole pages. Store them as pages
638 * internally to keep the per-cpu counters compact.
639 */
640 VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
641 delta >>= PAGE_SHIFT;
642 }
643
644 do {
645 z = 0; /* overflow to node counters */
646
647 /*
648 * The fetching of the stat_threshold is racy. We may apply
649 * a counter threshold to the wrong cpu if we get
650 * rescheduled while executing here. However, the next
651 * counter update will apply the threshold again and
652 * therefore bring the counter under the threshold again.
653 *
654 * Most of the time the thresholds are the same anyways
655 * for all cpus in a node.
656 */
657 t = this_cpu_read(pcp->stat_threshold);
658
659 o = this_cpu_read(*p);
660 n = delta + o;
661
40610076 662 if (abs(n) > t) {
663 int os = overstep_mode * (t >> 1) ;
664
665 /* Overflow must be added to node counters */
666 z = n + os;
667 n = -os;
668 }
669 } while (this_cpu_cmpxchg(*p, o, n) != o);
670
671 if (z)
672 node_page_state_add(z, pgdat, item);
673}
674
675void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
676 long delta)
677{
678 mod_node_state(pgdat, item, delta, 0);
679}
680EXPORT_SYMBOL(mod_node_page_state);
681
682void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
683{
684 mod_node_state(pgdat, item, 1, 1);
685}
686
687void inc_node_page_state(struct page *page, enum node_stat_item item)
688{
689 mod_node_state(page_pgdat(page), item, 1, 1);
690}
691EXPORT_SYMBOL(inc_node_page_state);
692
693void dec_node_page_state(struct page *page, enum node_stat_item item)
694{
695 mod_node_state(page_pgdat(page), item, -1, -1);
696}
697EXPORT_SYMBOL(dec_node_page_state);
698#else
699/*
700 * Use interrupt disable to serialize counter updates
701 */
702void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
6cdb18ad 703 long delta)
704{
705 unsigned long flags;
706
707 local_irq_save(flags);
708 __mod_zone_page_state(zone, item, delta);
709 local_irq_restore(flags);
710}
711EXPORT_SYMBOL(mod_zone_page_state);
712
713void inc_zone_page_state(struct page *page, enum zone_stat_item item)
714{
715 unsigned long flags;
716 struct zone *zone;
717
718 zone = page_zone(page);
719 local_irq_save(flags);
ca889e6c 720 __inc_zone_state(zone, item);
721 local_irq_restore(flags);
722}
723EXPORT_SYMBOL(inc_zone_page_state);
724
725void dec_zone_page_state(struct page *page, enum zone_stat_item item)
726{
727 unsigned long flags;
2244b95a 728
2244b95a 729 local_irq_save(flags);
a302eb4e 730 __dec_zone_page_state(page, item);
731 local_irq_restore(flags);
732}
733EXPORT_SYMBOL(dec_zone_page_state);
734
735void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
736{
737 unsigned long flags;
738
739 local_irq_save(flags);
740 __inc_node_state(pgdat, item);
741 local_irq_restore(flags);
742}
743EXPORT_SYMBOL(inc_node_state);
744
745void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
746 long delta)
747{
748 unsigned long flags;
749
750 local_irq_save(flags);
751 __mod_node_page_state(pgdat, item, delta);
752 local_irq_restore(flags);
753}
754EXPORT_SYMBOL(mod_node_page_state);
755
756void inc_node_page_state(struct page *page, enum node_stat_item item)
757{
758 unsigned long flags;
759 struct pglist_data *pgdat;
760
761 pgdat = page_pgdat(page);
762 local_irq_save(flags);
763 __inc_node_state(pgdat, item);
764 local_irq_restore(flags);
765}
766EXPORT_SYMBOL(inc_node_page_state);
767
768void dec_node_page_state(struct page *page, enum node_stat_item item)
769{
770 unsigned long flags;
771
772 local_irq_save(flags);
773 __dec_node_page_state(page, item);
774 local_irq_restore(flags);
775}
776EXPORT_SYMBOL(dec_node_page_state);
777#endif
778
779/*
780 * Fold a differential into the global counters.
781 * Returns the number of counters updated.
782 */
f19298b9 783static int fold_diff(int *zone_diff, int *node_diff)
784{
785 int i;
786 int changes = 0;
787
788 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
789 if (zone_diff[i]) {
790 atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
791 changes++;
792 }
793
794 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
795 if (node_diff[i]) {
796 atomic_long_add(node_diff[i], &vm_node_stat[i]);
797 changes++;
798 }
799 return changes;
800}
f19298b9 801
2244b95a 802/*
2bb921e5 803 * Update the zone counters for the current cpu.
a7f75e25 804 *
805 * Note that refresh_cpu_vm_stats strives to only access
806 * node local memory. The per cpu pagesets on remote zones are placed
807 * in the memory local to the processor using that pageset. So the
808 * loop over all zones will access a series of cachelines local to
809 * the processor.
810 *
811 * The call to zone_page_state_add updates the cachelines with the
812 * statistics in the remote zone struct as well as the global cachelines
813 * with the global counters. These could cause remote node cache line
814 * bouncing and will have to be only done when necessary.
815 *
816 * The function returns the number of global counters updated.
2244b95a 817 */
0eb77e98 818static int refresh_cpu_vm_stats(bool do_pagesets)
2244b95a 819{
75ef7184 820 struct pglist_data *pgdat;
821 struct zone *zone;
822 int i;
823 int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
824 int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
7cc36bbd 825 int changes = 0;
2244b95a 826
ee99c71c 827 for_each_populated_zone(zone) {
828 struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
829#ifdef CONFIG_NUMA
830 struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
831#endif
2244b95a 832
833 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
834 int v;
2244b95a 835
28f836b6 836 v = this_cpu_xchg(pzstats->vm_stat_diff[i], 0);
fbc2edb0 837 if (v) {
a7f75e25 838
a7f75e25 839 atomic_long_add(v, &zone->vm_stat[i]);
75ef7184 840 global_zone_diff[i] += v;
841#ifdef CONFIG_NUMA
842 /* 3 seconds idle till flush */
28f836b6 843 __this_cpu_write(pcp->expire, 3);
4037d452 844#endif
2244b95a 845 }
fbc2edb0 846 }
4037d452 847#ifdef CONFIG_NUMA
3a321d2a 848
849 if (do_pagesets) {
850 cond_resched();
851 /*
852 * Deal with draining the remote pageset of this
853 * processor
854 *
855 * Check if there are pages remaining in this pageset
856 * if not then there is nothing to expire.
857 */
858 if (!__this_cpu_read(pcp->expire) ||
859 !__this_cpu_read(pcp->count))
0eb77e98 860 continue;
4037d452 861
862 /*
863 * We never drain zones local to this processor.
864 */
865 if (zone_to_nid(zone) == numa_node_id()) {
28f836b6 866 __this_cpu_write(pcp->expire, 0);
867 continue;
868 }
4037d452 869
28f836b6 870 if (__this_cpu_dec_return(pcp->expire))
0eb77e98 871 continue;
4037d452 872
873 if (__this_cpu_read(pcp->count)) {
874 drain_zone_pages(zone, this_cpu_ptr(pcp));
875 changes++;
876 }
7cc36bbd 877 }
4037d452 878#endif
2244b95a 879 }
880
881 for_each_online_pgdat(pgdat) {
882 struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
883
884 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
885 int v;
886
887 v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
888 if (v) {
889 atomic_long_add(v, &pgdat->vm_stat[i]);
890 global_node_diff[i] += v;
891 }
892 }
893 }
894
895 changes += fold_diff(global_zone_diff, global_node_diff);
7cc36bbd 896 return changes;
897}
898
899/*
900 * Fold the data for an offline cpu into the global array.
901 * There cannot be any access by the offline cpu and therefore
902 * synchronization is simplified.
903 */
904void cpu_vm_stats_fold(int cpu)
905{
75ef7184 906 struct pglist_data *pgdat;
907 struct zone *zone;
908 int i;
909 int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
910 int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
911
912 for_each_populated_zone(zone) {
28f836b6 913 struct per_cpu_zonestat *pzstats;
2bb921e5 914
28f836b6 915 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
2bb921e5 916
f19298b9 917 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
28f836b6 918 if (pzstats->vm_stat_diff[i]) {
919 int v;
920
921 v = pzstats->vm_stat_diff[i];
922 pzstats->vm_stat_diff[i] = 0;
2bb921e5 923 atomic_long_add(v, &zone->vm_stat[i]);
75ef7184 924 global_zone_diff[i] += v;
2bb921e5 925 }
f19298b9 926 }
3a321d2a 927#ifdef CONFIG_NUMA
928 for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
929 if (pzstats->vm_numa_event[i]) {
930 unsigned long v;
3a321d2a 931
932 v = pzstats->vm_numa_event[i];
933 pzstats->vm_numa_event[i] = 0;
934 zone_numa_event_add(v, zone, i);
3a321d2a 935 }
f19298b9 936 }
3a321d2a 937#endif
938 }
939
940 for_each_online_pgdat(pgdat) {
941 struct per_cpu_nodestat *p;
942
943 p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
944
945 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
946 if (p->vm_node_stat_diff[i]) {
947 int v;
948
949 v = p->vm_node_stat_diff[i];
950 p->vm_node_stat_diff[i] = 0;
951 atomic_long_add(v, &pgdat->vm_stat[i]);
952 global_node_diff[i] += v;
953 }
954 }
955
956 fold_diff(global_zone_diff, global_node_diff);
957}
958
959/*
960 * this is only called if !populated_zone(zone), which implies no other users of
f0953a1b 961 * pset->vm_stat_diff[] exist.
40f4b1ea 962 */
28f836b6 963void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats)
5a883813 964{
f19298b9 965 unsigned long v;
966 int i;
967
f19298b9 968 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
28f836b6 969 if (pzstats->vm_stat_diff[i]) {
f19298b9 970 v = pzstats->vm_stat_diff[i];
28f836b6 971 pzstats->vm_stat_diff[i] = 0;
f19298b9 972 zone_page_state_add(v, zone, i);
5a883813 973 }
f19298b9 974 }
975
976#ifdef CONFIG_NUMA
977 for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
978 if (pzstats->vm_numa_event[i]) {
979 v = pzstats->vm_numa_event[i];
980 pzstats->vm_numa_event[i] = 0;
981 zone_numa_event_add(v, zone, i);
3a321d2a 982 }
f19298b9 983 }
3a321d2a 984#endif
5a883813 985}
986#endif
987
ca889e6c 988#ifdef CONFIG_NUMA
c2d42c16 989/*
990 * Determine the per node value of a stat item. This function
991 * is called frequently in a NUMA machine, so try to be as
992 * frugal as possible.
c2d42c16 993 */
994unsigned long sum_zone_node_page_state(int node,
995 enum zone_stat_item item)
996{
997 struct zone *zones = NODE_DATA(node)->node_zones;
998 int i;
999 unsigned long count = 0;
c2d42c16 1000
1001 for (i = 0; i < MAX_NR_ZONES; i++)
1002 count += zone_page_state(zones + i, item);
1003
1004 return count;
1005}
1006
1007/* Determine the per node value of a numa stat item. */
1008unsigned long sum_zone_numa_event_state(int node,
1009 enum numa_stat_item item)
1010{
1011 struct zone *zones = NODE_DATA(node)->node_zones;
3a321d2a 1012 unsigned long count = 0;
f19298b9 1013 int i;
1014
1015 for (i = 0; i < MAX_NR_ZONES; i++)
f19298b9 1016 count += zone_numa_event_state(zones + i, item);
1017
1018 return count;
1019}
1020
1021/*
1022 * Determine the per node value of a stat item.
1023 */
1024unsigned long node_page_state_pages(struct pglist_data *pgdat,
1025 enum node_stat_item item)
1026{
1027 long x = atomic_long_read(&pgdat->vm_stat[item]);
1028#ifdef CONFIG_SMP
1029 if (x < 0)
1030 x = 0;
1031#endif
1032 return x;
1033}
1034
1035unsigned long node_page_state(struct pglist_data *pgdat,
1036 enum node_stat_item item)
1037{
1038 VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
1039
1040 return node_page_state_pages(pgdat, item);
1041}
1042#endif
1043
d7a5752c 1044#ifdef CONFIG_COMPACTION
36deb0be 1045
1046struct contig_page_info {
1047 unsigned long free_pages;
1048 unsigned long free_blocks_total;
1049 unsigned long free_blocks_suitable;
1050};
1051
1052/*
1053 * Calculate the number of free pages in a zone, how many contiguous
1054 * pages are free and how many are large enough to satisfy an allocation of
1055 * the target size. Note that this function makes no attempt to estimate
1056 * how many suitable free blocks there *might* be if MOVABLE pages were
1057 * migrated. Calculating that is possible, but expensive and can be
1058 * figured out from userspace
1059 */
1060static void fill_contig_page_info(struct zone *zone,
1061 unsigned int suitable_order,
1062 struct contig_page_info *info)
1063{
1064 unsigned int order;
1065
1066 info->free_pages = 0;
1067 info->free_blocks_total = 0;
1068 info->free_blocks_suitable = 0;
1069
1070 for (order = 0; order < MAX_ORDER; order++) {
1071 unsigned long blocks;
1072
1073 /*
1074 * Count number of free blocks.
1075 *
1076 * Access to nr_free is lockless as nr_free is used only for
1077 * diagnostic purposes. Use data_race to avoid KCSAN warning.
1078 */
1079 blocks = data_race(zone->free_area[order].nr_free);
1080 info->free_blocks_total += blocks;
1081
1082 /* Count free base pages */
1083 info->free_pages += blocks << order;
1084
1085 /* Count the suitable free blocks */
1086 if (order >= suitable_order)
1087 info->free_blocks_suitable += blocks <<
1088 (order - suitable_order);
1089 }
1090}
1091
1092/*
1093 * A fragmentation index only makes sense if an allocation of a requested
1094 * size would fail. If that is true, the fragmentation index indicates
1095 * whether external fragmentation or a lack of memory was the problem.
1096 * The value can be used to determine if page reclaim or compaction
1097 * should be used
1098 */
56de7263 1099static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
1100{
1101 unsigned long requested = 1UL << order;
1102
1103 if (WARN_ON_ONCE(order >= MAX_ORDER))
1104 return 0;
1105
1106 if (!info->free_blocks_total)
1107 return 0;
1108
1109 /* Fragmentation index only makes sense when a request would fail */
1110 if (info->free_blocks_suitable)
1111 return -1000;
1112
1113 /*
1114 * Index is between 0 and 1 so return within 3 decimal places
1115 *
1116 * 0 => allocation would fail due to lack of memory
1117 * 1 => allocation would fail due to fragmentation
1118 */
1119 return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
1120}
56de7263 1121
1122/*
1123 * Calculates external fragmentation within a zone wrt the given order.
1124 * It is defined as the percentage of pages found in blocks of size
1125 * less than 1 << order. It returns values in range [0, 100].
1126 */
d34c0a75 1127unsigned int extfrag_for_order(struct zone *zone, unsigned int order)
1128{
1129 struct contig_page_info info;
1130
1131 fill_contig_page_info(zone, order, &info);
1132 if (info.free_pages == 0)
1133 return 0;
1134
1135 return div_u64((info.free_pages -
1136 (info.free_blocks_suitable << order)) * 100,
1137 info.free_pages);
1138}
1139
1140/* Same as __fragmentation_index but allocs contig_page_info on stack */
1141int fragmentation_index(struct zone *zone, unsigned int order)
1142{
1143 struct contig_page_info info;
1144
1145 fill_contig_page_info(zone, order, &info);
1146 return __fragmentation_index(order, &info);
1147}
1148#endif
1149
1150#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \
1151 defined(CONFIG_NUMA) || defined(CONFIG_MEMCG)
1152#ifdef CONFIG_ZONE_DMA
1153#define TEXT_FOR_DMA(xx) xx "_dma",
1154#else
1155#define TEXT_FOR_DMA(xx)
1156#endif
1157
1158#ifdef CONFIG_ZONE_DMA32
1159#define TEXT_FOR_DMA32(xx) xx "_dma32",
1160#else
1161#define TEXT_FOR_DMA32(xx)
1162#endif
1163
1164#ifdef CONFIG_HIGHMEM
1165#define TEXT_FOR_HIGHMEM(xx) xx "_high",
1166#else
1167#define TEXT_FOR_HIGHMEM(xx)
1168#endif
1169
1170#ifdef CONFIG_ZONE_DEVICE
1171#define TEXT_FOR_DEVICE(xx) xx "_device",
1172#else
1173#define TEXT_FOR_DEVICE(xx)
1174#endif
1175
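/* Expand one counter name into a per-zone-type list, covering only the zone types configured in this build. */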
fa25c503 1176#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
1177 TEXT_FOR_HIGHMEM(xx) xx "_movable", \
1178 TEXT_FOR_DEVICE(xx)
1179
1180const char * const vmstat_text[] = {
8d92890b 1181 /* enum zone_stat_item counters */
fa25c503 1182 "nr_free_pages",
1183 "nr_zone_inactive_anon",
1184 "nr_zone_active_anon",
1185 "nr_zone_inactive_file",
1186 "nr_zone_active_file",
1187 "nr_zone_unevictable",
5a1c84b4 1188 "nr_zone_write_pending",
fa25c503 1189 "nr_mlock",
fa25c503 1190 "nr_bounce",
1191#if IS_ENABLED(CONFIG_ZSMALLOC)
1192 "nr_zspages",
1193#endif
1194 "nr_free_cma",
1195
1196 /* enum numa_stat_item counters */
1197#ifdef CONFIG_NUMA
1198 "numa_hit",
1199 "numa_miss",
1200 "numa_foreign",
1201 "numa_interleave",
1202 "numa_local",
1203 "numa_other",
1204#endif
09316c09 1205
9d7ea9a2 1206 /* enum node_stat_item counters */
1207 "nr_inactive_anon",
1208 "nr_active_anon",
1209 "nr_inactive_file",
1210 "nr_active_file",
1211 "nr_unevictable",
1212 "nr_slab_reclaimable",
1213 "nr_slab_unreclaimable",
1214 "nr_isolated_anon",
1215 "nr_isolated_file",
68d48e6a 1216 "workingset_nodes",
1217 "workingset_refault_anon",
1218 "workingset_refault_file",
1219 "workingset_activate_anon",
1220 "workingset_activate_file",
1221 "workingset_restore_anon",
1222 "workingset_restore_file",
1e6b1085 1223 "workingset_nodereclaim",
1224 "nr_anon_pages",
1225 "nr_mapped",
1226 "nr_file_pages",
1227 "nr_dirty",
1228 "nr_writeback",
1229 "nr_writeback_temp",
1230 "nr_shmem",
1231 "nr_shmem_hugepages",
1232 "nr_shmem_pmdmapped",
1233 "nr_file_hugepages",
1234 "nr_file_pmdmapped",
11fb9989 1235 "nr_anon_transparent_hugepages",
1236 "nr_vmscan_write",
1237 "nr_vmscan_immediate_reclaim",
1238 "nr_dirtied",
1239 "nr_written",
8cd7c588 1240 "nr_throttled_written",
b29940c1 1241 "nr_kernel_misc_reclaimable",
1242 "nr_foll_pin_acquired",
1243 "nr_foll_pin_released",
1244 "nr_kernel_stack",
1245#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
1246 "nr_shadow_call_stack",
1247#endif
f0c0c115 1248 "nr_page_table_pages",
1249#ifdef CONFIG_SWAP
1250 "nr_swapcached",
1251#endif
1252#ifdef CONFIG_NUMA_BALANCING
1253 "pgpromote_success",
c6833e10 1254 "pgpromote_candidate",
e39bb6be 1255#endif
599d0c95 1256
09316c09 1257 /* enum writeback_stat_item counters */
1258 "nr_dirty_threshold",
1259 "nr_dirty_background_threshold",
1260
ebc5d83d 1261#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
09316c09 1262 /* enum vm_event_item counters */
1263 "pgpgin",
1264 "pgpgout",
1265 "pswpin",
1266 "pswpout",
1267
1268 TEXTS_FOR_ZONES("pgalloc")
1269 TEXTS_FOR_ZONES("allocstall")
1270 TEXTS_FOR_ZONES("pgskip")
1271
1272 "pgfree",
1273 "pgactivate",
1274 "pgdeactivate",
f7ad2a6c 1275 "pglazyfree",
1276
1277 "pgfault",
1278 "pgmajfault",
854e9ed0 1279 "pglazyfreed",
fa25c503 1280
599d0c95 1281 "pgrefill",
798a6b87 1282 "pgreuse",
1283 "pgsteal_kswapd",
1284 "pgsteal_direct",
1285 "pgdemote_kswapd",
1286 "pgdemote_direct",
1287 "pgscan_kswapd",
1288 "pgscan_direct",
68243e76 1289 "pgscan_direct_throttle",
1290 "pgscan_anon",
1291 "pgscan_file",
1292 "pgsteal_anon",
1293 "pgsteal_file",
1294
1295#ifdef CONFIG_NUMA
1296 "zone_reclaim_failed",
1297#endif
1298 "pginodesteal",
1299 "slabs_scanned",
fa25c503
KM
1300 "kswapd_inodesteal",
1301 "kswapd_low_wmark_hit_quickly",
1302 "kswapd_high_wmark_hit_quickly",
fa25c503 1303 "pageoutrun",
fa25c503
KM
1304
1305 "pgrotated",
1306
1307 "drop_pagecache",
1308 "drop_slab",
8e675f7a 1309 "oom_kill",
5509a5d2 1310
1311#ifdef CONFIG_NUMA_BALANCING
1312 "numa_pte_updates",
72403b4a 1313 "numa_huge_pte_updates",
1314 "numa_hint_faults",
1315 "numa_hint_faults_local",
1316 "numa_pages_migrated",
1317#endif
1318#ifdef CONFIG_MIGRATION
1319 "pgmigrate_success",
1320 "pgmigrate_fail",
1321 "thp_migration_success",
1322 "thp_migration_fail",
1323 "thp_migration_split",
5647bc29 1324#endif
fa25c503 1325#ifdef CONFIG_COMPACTION
1326 "compact_migrate_scanned",
1327 "compact_free_scanned",
1328 "compact_isolated",
1329 "compact_stall",
1330 "compact_fail",
1331 "compact_success",
698b1b30 1332 "compact_daemon_wake",
1333 "compact_daemon_migrate_scanned",
1334 "compact_daemon_free_scanned",
1335#endif
1336
1337#ifdef CONFIG_HUGETLB_PAGE
1338 "htlb_buddy_alloc_success",
1339 "htlb_buddy_alloc_fail",
1340#endif
1341#ifdef CONFIG_CMA
1342 "cma_alloc_success",
1343 "cma_alloc_fail",
1344#endif
1345 "unevictable_pgs_culled",
1346 "unevictable_pgs_scanned",
1347 "unevictable_pgs_rescued",
1348 "unevictable_pgs_mlocked",
1349 "unevictable_pgs_munlocked",
1350 "unevictable_pgs_cleared",
1351 "unevictable_pgs_stranded",
1352
1353#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1354 "thp_fault_alloc",
1355 "thp_fault_fallback",
85b9f46e 1356 "thp_fault_fallback_charge",
1357 "thp_collapse_alloc",
1358 "thp_collapse_alloc_failed",
95ecedcd 1359 "thp_file_alloc",
dcdf11ee 1360 "thp_file_fallback",
85b9f46e 1361 "thp_file_fallback_charge",
95ecedcd 1362 "thp_file_mapped",
1363 "thp_split_page",
1364 "thp_split_page_failed",
f9719a03 1365 "thp_deferred_split_page",
122afea9 1366 "thp_split_pmd",
1367 "thp_scan_exceed_none_pte",
1368 "thp_scan_exceed_swap_pte",
1369 "thp_scan_exceed_share_pte",
1370#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1371 "thp_split_pud",
1372#endif
1373 "thp_zero_page_alloc",
1374 "thp_zero_page_alloc_failed",
225311a4 1375 "thp_swpout",
fe490cc0 1376 "thp_swpout_fallback",
fa25c503 1377#endif
1378#ifdef CONFIG_MEMORY_BALLOON
1379 "balloon_inflate",
1380 "balloon_deflate",
1381#ifdef CONFIG_BALLOON_COMPACTION
1382 "balloon_migrate",
1383#endif
1384#endif /* CONFIG_MEMORY_BALLOON */
ec659934 1385#ifdef CONFIG_DEBUG_TLBFLUSH
1386 "nr_tlb_remote_flush",
1387 "nr_tlb_remote_flush_received",
1388 "nr_tlb_local_flush_all",
1389 "nr_tlb_local_flush_one",
ec659934 1390#endif /* CONFIG_DEBUG_TLBFLUSH */
fa25c503 1391
1392#ifdef CONFIG_SWAP
1393 "swap_ra",
1394 "swap_ra_hit",
1395#ifdef CONFIG_KSM
1396 "ksm_swpin_copy",
1397#endif
cbc65df2 1398#endif
1399#ifdef CONFIG_KSM
1400 "cow_ksm",
1401#endif
1402#ifdef CONFIG_ZSWAP
1403 "zswpin",
1404 "zswpout",
1405#endif
1406#ifdef CONFIG_X86
1407 "direct_map_level2_splits",
1408 "direct_map_level3_splits",
1409#endif
ebc5d83d 1410#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
fa25c503 1411};
ebc5d83d 1412#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
fa25c503 1413
1414#if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
1415 defined(CONFIG_PROC_FS)
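/* seq_file iterators over all online nodes, shared by the fragmentation, pagetypeinfo and zoneinfo interfaces below. */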
1416static void *frag_start(struct seq_file *m, loff_t *pos)
1417{
1418 pg_data_t *pgdat;
1419 loff_t node = *pos;
1420
1421 for (pgdat = first_online_pgdat();
1422 pgdat && node;
1423 pgdat = next_online_pgdat(pgdat))
1424 --node;
1425
1426 return pgdat;
1427}
1428
1429static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
1430{
1431 pg_data_t *pgdat = (pg_data_t *)arg;
1432
1433 (*pos)++;
1434 return next_online_pgdat(pgdat);
1435}
1436
1437static void frag_stop(struct seq_file *m, void *arg)
1438{
1439}
1440
1441/*
1442 * Walk zones in a node and print using a callback.
1443 * If @assert_populated is true, only use callback for zones that are populated.
1444 */
3c486871 1445static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
727c080f 1446 bool assert_populated, bool nolock,
3c486871
AM
1447 void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
1448{
1449 struct zone *zone;
1450 struct zone *node_zones = pgdat->node_zones;
1451 unsigned long flags;
1452
1453 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
b2bd8598 1454 if (assert_populated && !populated_zone(zone))
3c486871
AM
1455 continue;
1456
1457 if (!nolock)
1458 spin_lock_irqsave(&zone->lock, flags);
3c486871 1459 print(m, pgdat, zone);
1460 if (!nolock)
1461 spin_unlock_irqrestore(&zone->lock, flags);
1462 }
1463}
1464#endif
1465
d7a5752c 1466#ifdef CONFIG_PROC_FS
1467static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
1468 struct zone *zone)
1469{
1470 int order;
1471
1472 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1473 for (order = 0; order < MAX_ORDER; ++order)
1474 /*
1475 * Access to nr_free is lockless as nr_free is used only for
1476 * printing purposes. Use data_race to avoid KCSAN warning.
1477 */
1478 seq_printf(m, "%6lu ", data_race(zone->free_area[order].nr_free));
1479 seq_putc(m, '\n');
1480}
1481
1482/*
1483 * This walks the free areas for each zone.
1484 */
1485static int frag_show(struct seq_file *m, void *arg)
1486{
1487 pg_data_t *pgdat = (pg_data_t *)arg;
727c080f 1488 walk_zones_in_node(m, pgdat, true, false, frag_show_print);
1489 return 0;
1490}
1491
1492static void pagetypeinfo_showfree_print(struct seq_file *m,
1493 pg_data_t *pgdat, struct zone *zone)
1494{
1495 int order, mtype;
1496
1497 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
1498 seq_printf(m, "Node %4d, zone %8s, type %12s ",
1499 pgdat->node_id,
1500 zone->name,
1501 migratetype_names[mtype]);
1502 for (order = 0; order < MAX_ORDER; ++order) {
1503 unsigned long freecount = 0;
1504 struct free_area *area;
1505 struct list_head *curr;
93b3a674 1506 bool overflow = false;
1507
1508 area = &(zone->free_area[order]);
1509
1510 list_for_each(curr, &area->free_list[mtype]) {
1511 /*
1512 * Cap the free_list iteration because it might
1513 * be really large and we are under a spinlock
1514 * so a long time spent here could trigger a
1515 * hard lockup detector. Anyway this is a
1516 * debugging tool so knowing there is a handful
1517 * of pages of this order should be more than
1518 * sufficient.
1519 */
1520 if (++freecount >= 100000) {
1521 overflow = true;
1522 break;
1523 }
1524 }
1525 seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount);
1526 spin_unlock_irq(&zone->lock);
1527 cond_resched();
1528 spin_lock_irq(&zone->lock);
467c996c 1529 }
1530 seq_putc(m, '\n');
1531 }
1532}
1533
1534/* Print out the free pages at each order for each migratetype */
33090af9 1535static void pagetypeinfo_showfree(struct seq_file *m, void *arg)
1536{
1537 int order;
1538 pg_data_t *pgdat = (pg_data_t *)arg;
1539
1540 /* Print header */
1541 seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
1542 for (order = 0; order < MAX_ORDER; ++order)
1543 seq_printf(m, "%6d ", order);
1544 seq_putc(m, '\n');
1545
727c080f 1546 walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);
1547}
1548
1549static void pagetypeinfo_showblockcount_print(struct seq_file *m,
1550 pg_data_t *pgdat, struct zone *zone)
1551{
1552 int mtype;
1553 unsigned long pfn;
1554 unsigned long start_pfn = zone->zone_start_pfn;
108bcc96 1555 unsigned long end_pfn = zone_end_pfn(zone);
1556 unsigned long count[MIGRATE_TYPES] = { 0, };
1557
1558 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
1559 struct page *page;
1560
1561 page = pfn_to_online_page(pfn);
1562 if (!page)
1563 continue;
1564
1565 if (page_zone(page) != zone)
1566 continue;
1567
1568 mtype = get_pageblock_migratetype(page);
1569
1570 if (mtype < MIGRATE_TYPES)
1571 count[mtype]++;
1572 }
1573
1574 /* Print counts */
1575 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1576 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1577 seq_printf(m, "%12lu ", count[mtype]);
1578 seq_putc(m, '\n');
1579}
1580
f113e641 1581/* Print out the number of pageblocks for each migratetype */
33090af9 1582static void pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
1583{
1584 int mtype;
1585 pg_data_t *pgdat = (pg_data_t *)arg;
1586
1587 seq_printf(m, "\n%-23s", "Number of blocks type ");
1588 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1589 seq_printf(m, "%12s ", migratetype_names[mtype]);
1590 seq_putc(m, '\n');
1591 walk_zones_in_node(m, pgdat, true, false,
1592 pagetypeinfo_showblockcount_print);
1593}
1594
1595/*
1596 * Print out the number of pageblocks for each migratetype that contain pages
1597 * of other types. This gives an indication of how well fallbacks are being
1598 * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
1599 * to determine what is going on
1600 */
1601static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
1602{
1603#ifdef CONFIG_PAGE_OWNER
1604 int mtype;
1605
7dd80b8a 1606 if (!static_branch_unlikely(&page_owner_inited))
1607 return;
1608
1609 drain_all_pages(NULL);
1610
1611 seq_printf(m, "\n%-23s", "Number of mixed blocks ");
1612 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1613 seq_printf(m, "%12s ", migratetype_names[mtype]);
1614 seq_putc(m, '\n');
1615
1616 walk_zones_in_node(m, pgdat, true, true,
1617 pagetypeinfo_showmixedcount_print);
1618#endif /* CONFIG_PAGE_OWNER */
1619}
1620
1621/*
1622 * This prints out statistics in relation to grouping pages by mobility.
1623 * It is expensive to collect so do not constantly read the file.
1624 */
1625static int pagetypeinfo_show(struct seq_file *m, void *arg)
1626{
1627 pg_data_t *pgdat = (pg_data_t *)arg;
1628
41b25a37 1629 /* check memoryless node */
a47b53c5 1630 if (!node_state(pgdat->node_id, N_MEMORY))
1631 return 0;
1632
1633 seq_printf(m, "Page block order: %d\n", pageblock_order);
1634 seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages);
1635 seq_putc(m, '\n');
1636 pagetypeinfo_showfree(m, pgdat);
1637 pagetypeinfo_showblockcount(m, pgdat);
48c96a36 1638 pagetypeinfo_showmixedcount(m, pgdat);
467c996c 1639
1640 return 0;
1641}
1642
8f32f7e5 1643static const struct seq_operations fragmentation_op = {
1644 .start = frag_start,
1645 .next = frag_next,
1646 .stop = frag_stop,
1647 .show = frag_show,
1648};
1649
74e2e8e8 1650static const struct seq_operations pagetypeinfo_op = {
1651 .start = frag_start,
1652 .next = frag_next,
1653 .stop = frag_stop,
1654 .show = pagetypeinfo_show,
1655};
1656
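/* True if @zone is the first (lowest) populated zone of its node; zoneinfo_show_print() emits the per-node stats only for that zone. */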
1657static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
1658{
1659 int zid;
1660
1661 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1662 struct zone *compare = &pgdat->node_zones[zid];
1663
1664 if (populated_zone(compare))
1665 return zone == compare;
1666 }
1667
1668 return false;
1669}
1670
1671static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1672 struct zone *zone)
f6ac2354 1673{
1674 int i;
1675 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
1676 if (is_zone_first_populated(pgdat, zone)) {
1677 seq_printf(m, "\n per-node stats");
1678 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
1679 unsigned long pages = node_page_state_pages(pgdat, i);
1680
1681 if (vmstat_item_print_in_thp(i))
1682 pages /= HPAGE_PMD_NR;
9d7ea9a2 1683 seq_printf(m, "\n %-12s %lu", node_stat_name(i),
69473e5d 1684 pages);
1685 }
1686 }
1687 seq_printf(m,
1688 "\n pages free %lu"
a6ea8b5b 1689 "\n boost %lu"
1690 "\n min %lu"
1691 "\n low %lu"
1692 "\n high %lu"
467c996c 1693 "\n spanned %lu"
9feedc9d 1694 "\n present %lu"
1695 "\n managed %lu"
1696 "\n cma %lu",
88f5acf8 1697 zone_page_state(zone, NR_FREE_PAGES),
a6ea8b5b 1698 zone->watermark_boost,
1699 min_wmark_pages(zone),
1700 low_wmark_pages(zone),
1701 high_wmark_pages(zone),
467c996c 1702 zone->spanned_pages,
9feedc9d 1703 zone->present_pages,
1704 zone_managed_pages(zone),
1705 zone_cma_pages(zone));
467c996c 1706
467c996c 1707 seq_printf(m,
3484b2de 1708 "\n protection: (%ld",
1709 zone->lowmem_reserve[0]);
1710 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
3484b2de 1711 seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
1712 seq_putc(m, ')');
1713
1714 /* If unpopulated, no other information is useful */
1715 if (!populated_zone(zone)) {
1716 seq_putc(m, '\n');
1717 return;
1718 }
1719
7dfb8bf3 1720 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
1721 seq_printf(m, "\n %-12s %lu", zone_stat_name(i),
1722 zone_page_state(zone, i));
7dfb8bf3 1723
3a321d2a 1724#ifdef CONFIG_NUMA
f19298b9 1725 for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
9d7ea9a2 1726 seq_printf(m, "\n %-12s %lu", numa_stat_name(i),
f19298b9 1727 zone_numa_event_state(zone, i));
1728#endif
1729
7dfb8bf3 1730 seq_printf(m, "\n pagesets");
467c996c 1731 for_each_online_cpu(i) {
1732 struct per_cpu_pages *pcp;
1733 struct per_cpu_zonestat __maybe_unused *pzstats;
467c996c 1734
28f836b6 1735 pcp = per_cpu_ptr(zone->per_cpu_pageset, i);
1736 seq_printf(m,
1737 "\n cpu: %i"
1738 "\n count: %i"
1739 "\n high: %i"
1740 "\n batch: %i",
1741 i,
1742 pcp->count,
1743 pcp->high,
1744 pcp->batch);
df9ecaba 1745#ifdef CONFIG_SMP
28f836b6 1746 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i);
467c996c 1747 seq_printf(m, "\n vm stats threshold: %d",
28f836b6 1748 pzstats->stat_threshold);
df9ecaba 1749#endif
f6ac2354 1750 }
467c996c 1751 seq_printf(m,
599d0c95 1752 "\n node_unreclaimable: %u"
3a50d14d 1753 "\n start_pfn: %lu",
c73322d0 1754 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
3a50d14d 1755 zone->zone_start_pfn);
1756 seq_putc(m, '\n');
1757}
1758
1759/*
1760 * Output information about zones in @pgdat. All zones are printed regardless
1761 * of whether they are populated or not: lowmem_reserve_ratio operates on the
1762 * set of all zones and userspace would not be aware of such zones if they are
1763 * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
1764 */
1765static int zoneinfo_show(struct seq_file *m, void *arg)
1766{
1767 pg_data_t *pgdat = (pg_data_t *)arg;
727c080f 1768 walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print);
1769 return 0;
1770}
1771
5c9fe628 1772static const struct seq_operations zoneinfo_op = {
1773 .start = frag_start, /* iterate over all zones. The same as in
1774 * fragmentation. */
1775 .next = frag_next,
1776 .stop = frag_stop,
1777 .show = zoneinfo_show,
1778};
1779
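/* Number of counters shown by /proc/vmstat; vmstat_start() checks that vmstat_text[] has at least this many entries. */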
9d7ea9a2 1780#define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \
f19298b9 1781 NR_VM_NUMA_EVENT_ITEMS + \
1782 NR_VM_NODE_STAT_ITEMS + \
1783 NR_VM_WRITEBACK_STAT_ITEMS + \
1784 (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \
1785 NR_VM_EVENT_ITEMS : 0))
79da826a 1786
1787static void *vmstat_start(struct seq_file *m, loff_t *pos)
1788{
2244b95a 1789 unsigned long *v;
9d7ea9a2 1790 int i;
f6ac2354 1791
9d7ea9a2 1792 if (*pos >= NR_VMSTAT_ITEMS)
f6ac2354 1793 return NULL;
79da826a 1794
9d7ea9a2 1795 BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
f19298b9 1796 fold_vm_numa_events();
9d7ea9a2 1797 v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
1798 m->private = v;
1799 if (!v)
f6ac2354 1800 return ERR_PTR(-ENOMEM);
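/* The buffer is filled in the same order as vmstat_text[]: zone counters, NUMA events, node counters, writeback thresholds, then VM events. */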
2244b95a 1801 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
c41f012a 1802 v[i] = global_zone_page_state(i);
1803 v += NR_VM_ZONE_STAT_ITEMS;
1804
3a321d2a 1805#ifdef CONFIG_NUMA
1806 for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
1807 v[i] = global_numa_event_state(i);
1808 v += NR_VM_NUMA_EVENT_ITEMS;
1809#endif
1810
69473e5d 1811 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
ea426c2a 1812 v[i] = global_node_page_state_pages(i);
1813 if (vmstat_item_print_in_thp(i))
1814 v[i] /= HPAGE_PMD_NR;
1815 }
1816 v += NR_VM_NODE_STAT_ITEMS;
1817
1818 global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
1819 v + NR_DIRTY_THRESHOLD);
1820 v += NR_VM_WRITEBACK_STAT_ITEMS;
1821
f8891e5e 1822#ifdef CONFIG_VM_EVENT_COUNTERS
79da826a
MR
1823 all_vm_events(v);
1824 v[PGPGIN] /= 2; /* sectors -> kbytes */
1825 v[PGPGOUT] /= 2;
f8891e5e 1826#endif
ff8b16d7 1827 return (unsigned long *)m->private + *pos;
f6ac2354
CL
1828}
1829
1830static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
1831{
1832 (*pos)++;
9d7ea9a2 1833 if (*pos >= NR_VMSTAT_ITEMS)
f6ac2354
CL
1834 return NULL;
1835 return (unsigned long *)m->private + *pos;
1836}
1837
1838static int vmstat_show(struct seq_file *m, void *arg)
1839{
1840 unsigned long *l = arg;
1841 unsigned long off = l - (unsigned long *)m->private;
68ba0326
AD
1842
1843 seq_puts(m, vmstat_text[off]);
75ba1d07 1844 seq_put_decimal_ull(m, " ", *l);
68ba0326 1845 seq_putc(m, '\n');
8d92890b
N
1846
1847 if (off == NR_VMSTAT_ITEMS - 1) {
1848 /*
1849 * We've come to the end - add any deprecated counters to avoid
1850 * breaking userspace which might depend on them being present.
1851 */
1852 seq_puts(m, "nr_unstable 0\n");
1853 }
f6ac2354
CL
1854 return 0;
1855}
1856
1857static void vmstat_stop(struct seq_file *m, void *arg)
1858{
1859 kfree(m->private);
1860 m->private = NULL;
1861}
1862
b6aa44ab 1863static const struct seq_operations vmstat_op = {
f6ac2354
CL
1864 .start = vmstat_start,
1865 .next = vmstat_next,
1866 .stop = vmstat_stop,
1867 .show = vmstat_show,
1868};
f6ac2354
CL
1869#endif /* CONFIG_PROC_FS */
1870
df9ecaba 1871#ifdef CONFIG_SMP
d1187ed2 1872static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
77461ab3 1873int sysctl_stat_interval __read_mostly = HZ;
d1187ed2 1874
52b6f46b
HD
1875#ifdef CONFIG_PROC_FS
1876static void refresh_vm_stats(struct work_struct *work)
1877{
1878 refresh_cpu_vm_stats(true);
1879}
1880
1881int vmstat_refresh(struct ctl_table *table, int write,
32927393 1882 void *buffer, size_t *lenp, loff_t *ppos)
52b6f46b
HD
1883{
1884 long val;
1885 int err;
1886 int i;
1887
1888 /*
1889	 * The regular update, every sysctl_stat_interval, may come later
1890	 * than expected, leaving a significant amount in the per_cpu buckets.
1891 * This is particularly misleading when checking a quantity of HUGE
1892 * pages, immediately after running a test. /proc/sys/vm/stat_refresh,
1893 * which can equally be echo'ed to or cat'ted from (by root),
1894 * can be used to update the stats just before reading them.
1895 *
c41f012a 1896 * Oh, and since global_zone_page_state() etc. are so careful to hide
52b6f46b
HD
1897 * transiently negative values, report an error here if any of
1898 * the stats is negative, so we know to go looking for imbalance.
1899 */
1900 err = schedule_on_each_cpu(refresh_vm_stats);
1901 if (err)
1902 return err;
1903 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
75083aae
HD
1904 /*
1905 * Skip checking stats known to go negative occasionally.
1906 */
1907 switch (i) {
1908 case NR_ZONE_WRITE_PENDING:
1909 case NR_FREE_CMA_PAGES:
1910 continue;
1911 }
75ef7184 1912 val = atomic_long_read(&vm_zone_stat[i]);
52b6f46b 1913 if (val < 0) {
c822f622 1914 pr_warn("%s: %s %ld\n",
9d7ea9a2 1915 __func__, zone_stat_name(i), val);
52b6f46b
HD
1916 }
1917 }
76d8cc3c 1918 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
75083aae
HD
1919 /*
1920 * Skip checking stats known to go negative occasionally.
1921 */
1922 switch (i) {
1923 case NR_WRITEBACK:
1924 continue;
1925 }
76d8cc3c
HD
1926 val = atomic_long_read(&vm_node_stat[i]);
1927 if (val < 0) {
1928 pr_warn("%s: %s %ld\n",
1929 __func__, node_stat_name(i), val);
76d8cc3c
HD
1930 }
1931 }
52b6f46b
HD
1932 if (write)
1933 *ppos += *lenp;
1934 else
1935 *lenp = 0;
1936 return 0;
1937}
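
/*
 * Illustrative, standalone userspace sketch (not part of this file): trigger
 * the fold described in the comment in vmstat_refresh() by writing to
 * /proc/sys/vm/stat_refresh (as root), then dump the freshly updated
 * /proc/vmstat. Error handling is deliberately minimal.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd;

	/* Any write triggers the refresh; the written value is not parsed. */
	fd = open("/proc/sys/vm/stat_refresh", O_WRONLY);
	if (fd >= 0) {
		write(fd, "1\n", 2);
		close(fd);
	}

	/* Each line of /proc/vmstat is "<counter name> <value>". */
	fd = open("/proc/vmstat", O_RDONLY);
	if (fd < 0)
		return 1;
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);
	close(fd);
	return 0;
}
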
1938#endif /* CONFIG_PROC_FS */
1939
d1187ed2
CL
1940static void vmstat_update(struct work_struct *w)
1941{
0eb77e98 1942 if (refresh_cpu_vm_stats(true)) {
7cc36bbd
CL
1943 /*
1944 * Counters were updated so we expect more updates
1945 * to occur in the future. Keep on running the
1946 * update worker thread.
1947 */
ce612879 1948 queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
f01f17d3
MH
1949 this_cpu_ptr(&vmstat_work),
1950 round_jiffies_relative(sysctl_stat_interval));
7cc36bbd
CL
1951 }
1952}
1953
1954/*
1955 * Check if the diffs for a certain cpu indicate that
1956 * an update is needed.
1957 */
1958static bool need_update(int cpu)
1959{
2bbd00ae 1960 pg_data_t *last_pgdat = NULL;
7cc36bbd
CL
1961 struct zone *zone;
1962
1963 for_each_populated_zone(zone) {
28f836b6 1964 struct per_cpu_zonestat *pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
2bbd00ae 1965 struct per_cpu_nodestat *n;
28f836b6 1966
7cc36bbd
CL
1967 /*
1968 * The fast way of checking if there are any vmstat diffs.
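		 * memchr_inv() returns NULL only when every byte of the diff
		 * array is zero, so a non-NULL result means at least one
		 * counter on this cpu has a pending delta.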
7cc36bbd 1969 */
64632fd3 1970 if (memchr_inv(pzstats->vm_stat_diff, 0, sizeof(pzstats->vm_stat_diff)))
7cc36bbd 1971 return true;
f19298b9 1972
2bbd00ae
JW
1973 if (last_pgdat == zone->zone_pgdat)
1974 continue;
1975 last_pgdat = zone->zone_pgdat;
1976 n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu);
64632fd3
ML
1977 if (memchr_inv(n->vm_node_stat_diff, 0, sizeof(n->vm_node_stat_diff)))
1978 return true;
7cc36bbd
CL
1979 }
1980 return false;
1981}
1982
7b8da4c7
CL
1983/*
1984 * Switch off vmstat processing and then fold all the remaining differentials
1985 * until the diffs stay at zero. The function is used by NOHZ and can only be
1986 * invoked when tick processing is not active.
1987 */
f01f17d3
MH
1988void quiet_vmstat(void)
1989{
1990 if (system_state != SYSTEM_RUNNING)
1991 return;
1992
7b8da4c7 1993 if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
f01f17d3
MH
1994 return;
1995
1996 if (!need_update(smp_processor_id()))
1997 return;
1998
1999 /*
2000 * Just refresh counters and do not care about the pending delayed
2001	 * vmstat_update. It doesn't fire often enough to matter and cancelling
2002	 * it would be too expensive from this path.
2003	 * vmstat_shepherd will take care of that for us.
2004 */
2005 refresh_cpu_vm_stats(false);
2006}
2007
7cc36bbd
CL
2008/*
2009 * Shepherd worker thread that checks the differentials of
2010 * processors whose per-cpu vmstat update workers have been
2011 * disabled because of inactivity, and requeues that work
2012 * when updates are pending.
2013 */
2014static void vmstat_shepherd(struct work_struct *w);
2015
0eb77e98 2016static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
7cc36bbd
CL
2017
2018static void vmstat_shepherd(struct work_struct *w)
2019{
2020 int cpu;
2021
7625eccd 2022 cpus_read_lock();
7cc36bbd 2023 /* Check processors whose vmstat worker threads have been disabled */
7b8da4c7 2024 for_each_online_cpu(cpu) {
f01f17d3 2025 struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
7cc36bbd 2026
7b8da4c7 2027 if (!delayed_work_pending(dw) && need_update(cpu))
ce612879 2028 queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
fbcc8183
JB
2029
2030 cond_resched();
f01f17d3 2031 }
7625eccd 2032 cpus_read_unlock();
7cc36bbd
CL
2033
2034 schedule_delayed_work(&shepherd,
98f4ebb2 2035 round_jiffies_relative(sysctl_stat_interval));
d1187ed2
CL
2036}
2037
7cc36bbd 2038static void __init start_shepherd_timer(void)
d1187ed2 2039{
7cc36bbd
CL
2040 int cpu;
2041
2042 for_each_possible_cpu(cpu)
ccde8bd4 2043 INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
7cc36bbd
CL
2044 vmstat_update);
2045
7cc36bbd
CL
2046 schedule_delayed_work(&shepherd,
2047 round_jiffies_relative(sysctl_stat_interval));
d1187ed2
CL
2048}
2049
03e86dba
TC
2050static void __init init_cpu_node_state(void)
2051{
4c501327 2052 int node;
03e86dba 2053
4c501327 2054 for_each_online_node(node) {
b55032f1 2055 if (!cpumask_empty(cpumask_of_node(node)))
4c501327
SAS
2056 node_set_state(node, N_CPU);
2057 }
03e86dba
TC
2058}
2059
5438da97
SAS
2060static int vmstat_cpu_online(unsigned int cpu)
2061{
2062 refresh_zone_stat_thresholds();
734c1570
OS
2063
2064 if (!node_state(cpu_to_node(cpu), N_CPU)) {
2065 node_set_state(cpu_to_node(cpu), N_CPU);
734c1570
OS
2066 }
2067
5438da97
SAS
2068 return 0;
2069}
2070
2071static int vmstat_cpu_down_prep(unsigned int cpu)
2072{
2073 cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
2074 return 0;
2075}
2076
2077static int vmstat_cpu_dead(unsigned int cpu)
807a1bd2 2078{
4c501327 2079 const struct cpumask *node_cpus;
5438da97 2080 int node;
807a1bd2 2081
5438da97
SAS
2082 node = cpu_to_node(cpu);
2083
2084 refresh_zone_stat_thresholds();
4c501327 2085 node_cpus = cpumask_of_node(node);
b55032f1 2086 if (!cpumask_empty(node_cpus))
5438da97 2087 return 0;
807a1bd2
TK
2088
2089 node_clear_state(node, N_CPU);
734c1570 2090
5438da97 2091 return 0;
807a1bd2
TK
2092}
2093
8f32f7e5 2094#endif
df9ecaba 2095
ce612879
MH
2096struct workqueue_struct *mm_percpu_wq;
2097
597b7305 2098void __init init_mm_internals(void)
df9ecaba 2099{
ce612879 2100 int ret __maybe_unused;
5438da97 2101
80d136e1 2102 mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0);
ce612879
MH
2103
2104#ifdef CONFIG_SMP
5438da97
SAS
2105 ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
2106 NULL, vmstat_cpu_dead);
2107 if (ret < 0)
2108 pr_err("vmstat: failed to register 'dead' hotplug state\n");
2109
2110 ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online",
2111 vmstat_cpu_online,
2112 vmstat_cpu_down_prep);
2113 if (ret < 0)
2114 pr_err("vmstat: failed to register 'online' hotplug state\n");
2115
7625eccd 2116 cpus_read_lock();
03e86dba 2117 init_cpu_node_state();
7625eccd 2118 cpus_read_unlock();
d1187ed2 2119
7cc36bbd 2120 start_shepherd_timer();
8f32f7e5
AD
2121#endif
2122#ifdef CONFIG_PROC_FS
fddda2b7 2123 proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
abaed011 2124 proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
fddda2b7
CH
2125 proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
2126 proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
8f32f7e5 2127#endif
df9ecaba 2128}
d7a5752c
MG
2129
2130#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
d7a5752c
MG
2131
2132/*
2133 * Return an index indicating how much of the available free memory is
2134 * unusable for an allocation of the requested size.
2135 */
2136static int unusable_free_index(unsigned int order,
2137 struct contig_page_info *info)
2138{
2139 /* No free memory is interpreted as all free memory is unusable */
2140 if (info->free_pages == 0)
2141 return 1000;
2142
2143 /*
2144 * The index is conceptually a value between 0 and 1; it is returned
2145 * scaled by 1000, i.e. to 3 decimal places.
2146 *
2147 * 0 => no fragmentation
2148 * 1 => high fragmentation
2149 */
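	/*
	 * Illustrative example (hypothetical numbers): for order = 2 with
	 * free_pages = 1000 and free_blocks_suitable = 200, the suitably
	 * large blocks cover 200 << 2 = 800 pages, so the index is
	 * (1000 - 800) * 1000 / 1000 = 200, which unusable_show_print()
	 * renders as 0.200.
	 */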
2150 return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
2151
2152}
2153
2154static void unusable_show_print(struct seq_file *m,
2155 pg_data_t *pgdat, struct zone *zone)
2156{
2157 unsigned int order;
2158 int index;
2159 struct contig_page_info info;
2160
2161 seq_printf(m, "Node %d, zone %8s ",
2162 pgdat->node_id,
2163 zone->name);
2164 for (order = 0; order < MAX_ORDER; ++order) {
2165 fill_contig_page_info(zone, order, &info);
2166 index = unusable_free_index(order, &info);
2167 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
2168 }
2169
2170 seq_putc(m, '\n');
2171}
2172
2173/*
2174 * Display unusable free space index
2175 *
2176 * The unusable free space index measures how much of the available free
2177 * memory cannot be used to satisfy an allocation of a given size and is a
2178 * value between 0 and 1. The higher the value, the more of the free memory
2179 * is unusable and, by implication, the worse the external fragmentation. This
2180 * can be expressed as a percentage by multiplying by 100.
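 *
 * The index is exposed via debugfs as "extfrag/unusable_index" (typically
 * /sys/kernel/debug/extfrag/unusable_index when debugfs is mounted in its
 * usual place); each line names a node/zone pair followed by one value per
 * allocation order, as produced by unusable_show_print() above.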
2181 */
2182static int unusable_show(struct seq_file *m, void *arg)
2183{
2184 pg_data_t *pgdat = (pg_data_t *)arg;
2185
2186 /* check memoryless node */
a47b53c5 2187 if (!node_state(pgdat->node_id, N_MEMORY))
d7a5752c
MG
2188 return 0;
2189
727c080f 2190 walk_zones_in_node(m, pgdat, true, false, unusable_show_print);
d7a5752c
MG
2191
2192 return 0;
2193}
2194
01a99560 2195static const struct seq_operations unusable_sops = {
d7a5752c
MG
2196 .start = frag_start,
2197 .next = frag_next,
2198 .stop = frag_stop,
2199 .show = unusable_show,
2200};
2201
01a99560 2202DEFINE_SEQ_ATTRIBUTE(unusable);
d7a5752c 2203
f1a5ab12
MG
2204static void extfrag_show_print(struct seq_file *m,
2205 pg_data_t *pgdat, struct zone *zone)
2206{
2207 unsigned int order;
2208 int index;
2209
2210 /* Alloc on stack as interrupts are disabled for zone walk */
2211 struct contig_page_info info;
2212
2213 seq_printf(m, "Node %d, zone %8s ",
2214 pgdat->node_id,
2215 zone->name);
2216 for (order = 0; order < MAX_ORDER; ++order) {
2217 fill_contig_page_info(zone, order, &info);
56de7263 2218 index = __fragmentation_index(order, &info);
a9970586 2219 seq_printf(m, "%2d.%03d ", index / 1000, index % 1000);
f1a5ab12
MG
2220 }
2221
2222 seq_putc(m, '\n');
2223}
2224
2225/*
2226 * Display the fragmentation index for the orders at which allocations would fail.
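 *
 * Exposed via debugfs as "extfrag/extfrag_index" (typically
 * /sys/kernel/debug/extfrag/extfrag_index). Orders for which an allocation
 * would currently succeed are reported as -1.000; for failing orders, a
 * value tending towards 0 suggests the failure is due to lack of memory,
 * while a value tending towards 1.000 points at external fragmentation.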
2227 */
2228static int extfrag_show(struct seq_file *m, void *arg)
2229{
2230 pg_data_t *pgdat = (pg_data_t *)arg;
2231
727c080f 2232 walk_zones_in_node(m, pgdat, true, false, extfrag_show_print);
f1a5ab12
MG
2233
2234 return 0;
2235}
2236
01a99560 2237static const struct seq_operations extfrag_sops = {
f1a5ab12
MG
2238 .start = frag_start,
2239 .next = frag_next,
2240 .stop = frag_stop,
2241 .show = extfrag_show,
2242};
2243
01a99560 2244DEFINE_SEQ_ATTRIBUTE(extfrag);
f1a5ab12 2245
d7a5752c
MG
2246static int __init extfrag_debug_init(void)
2247{
bde8bd8a
S
2248 struct dentry *extfrag_debug_root;
2249
d7a5752c 2250 extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
d7a5752c 2251
d9f7979c 2252 debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL,
01a99560 2253 &unusable_fops);
d7a5752c 2254
d9f7979c 2255 debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL,
01a99560 2256 &extfrag_fops);
f1a5ab12 2257
d7a5752c
MG
2258 return 0;
2259}
2260
2261module_init(extfrag_debug_init);
2262#endif