// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/vmstat.c
 *
 *  Manages VM statistics
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  zoned VM statistics
 *  Copyright (C) 2006 Silicon Graphics, Inc.,
 *		Christoph Lameter <christoph@lameter.com>
 *  Copyright (C) 2008-2014 Christoph Lameter
 */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/vmstat.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/sched.h>
#include <linux/math64.h>
#include <linux/writeback.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
#include <linux/page_ext.h>
#include <linux/page_owner.h>

#include "internal.h"

#ifdef CONFIG_NUMA
int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;

/* zero numa counters within a zone */
static void zero_zone_numa_counters(struct zone *zone)
{
        int item, cpu;

        for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) {
                atomic_long_set(&zone->vm_numa_event[item], 0);
                for_each_online_cpu(cpu) {
                        per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item]
                                                = 0;
                }
        }
}

/* zero numa counters of all the populated zones */
static void zero_zones_numa_counters(void)
{
        struct zone *zone;

        for_each_populated_zone(zone)
                zero_zone_numa_counters(zone);
}

/* zero global numa counters */
static void zero_global_numa_counters(void)
{
        int item;

        for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
                atomic_long_set(&vm_numa_event[item], 0);
}

static void invalid_numa_statistics(void)
{
        zero_zones_numa_counters();
        zero_global_numa_counters();
}

static DEFINE_MUTEX(vm_numa_stat_lock);

int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
                void *buffer, size_t *length, loff_t *ppos)
{
        int ret, oldval;

        mutex_lock(&vm_numa_stat_lock);
        if (write)
                oldval = sysctl_vm_numa_stat;
        ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
        if (ret || !write)
                goto out;

        if (oldval == sysctl_vm_numa_stat)
                goto out;
        else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
                static_branch_enable(&vm_numa_stat_key);
                pr_info("enable numa statistics\n");
        } else {
                static_branch_disable(&vm_numa_stat_key);
                invalid_numa_statistics();
                pr_info("disable numa statistics, and clear numa counters\n");
        }

out:
        mutex_unlock(&vm_numa_stat_lock);
        return ret;
}
#endif
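
/*
 * Usage sketch (illustrative, not part of the source): the handler above
 * backs the vm.numa_stat sysctl, so NUMA event collection can be toggled at
 * run time. Writing 0 to /proc/sys/vm/numa_stat (or "sysctl vm.numa_stat=0")
 * disables collection via the vm_numa_stat_key static branch and clears the
 * existing counters; writing 1 re-enables collection.
 */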
105
f8891e5e
CL
106#ifdef CONFIG_VM_EVENT_COUNTERS
107DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
108EXPORT_PER_CPU_SYMBOL(vm_event_states);
109
31f961a8 110static void sum_vm_events(unsigned long *ret)
f8891e5e 111{
9eccf2a8 112 int cpu;
f8891e5e
CL
113 int i;
114
115 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
116
31f961a8 117 for_each_online_cpu(cpu) {
f8891e5e
CL
118 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
119
f8891e5e
CL
120 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
121 ret[i] += this->event[i];
122 }
123}
124
125/*
126 * Accumulate the vm event counters across all CPUs.
127 * The result is unavoidably approximate - it can change
128 * during and after execution of this function.
129*/
130void all_vm_events(unsigned long *ret)
131{
b5be1132 132 get_online_cpus();
31f961a8 133 sum_vm_events(ret);
b5be1132 134 put_online_cpus();
f8891e5e 135}
32dd66fc 136EXPORT_SYMBOL_GPL(all_vm_events);
f8891e5e 137
f8891e5e
CL
138/*
139 * Fold the foreign cpu events into our own.
140 *
141 * This is adding to the events on one processor
142 * but keeps the global counts constant.
143 */
144void vm_events_fold_cpu(int cpu)
145{
146 struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
147 int i;
148
149 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
150 count_vm_events(i, fold_state->event[i]);
151 fold_state->event[i] = 0;
152 }
153}
f8891e5e
CL
154
155#endif /* CONFIG_VM_EVENT_COUNTERS */
156
/*
 * Manage combined zone based / global counters
 *
 * vm_stat contains the global counters
 */
atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
EXPORT_SYMBOL(vm_zone_stat);
EXPORT_SYMBOL(vm_node_stat);

#ifdef CONFIG_SMP

int calculate_pressure_threshold(struct zone *zone)
{
        int threshold;
        int watermark_distance;

        /*
         * As vmstats are not up to date, there is drift between the estimated
         * and real values. For high thresholds and a high number of CPUs, it
         * is possible for the min watermark to be breached while the estimated
         * value looks fine. The pressure threshold is a reduced value such
         * that even the maximum amount of drift will not accidentally breach
         * the min watermark
         */
        watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
        threshold = max(1, (int)(watermark_distance / num_online_cpus()));

        /*
         * Maximum threshold is 125
         */
        threshold = min(125, threshold);

        return threshold;
}

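/*
 * Worked example (illustrative numbers, not from the source): with a
 * low-to-min watermark gap of 512 pages and 4 online CPUs, the per-cpu
 * threshold becomes max(1, 512 / 4) = 128, which the cap then reduces to
 * 125. The more CPUs there are, the smaller the per-cpu slack, so the
 * summed drift across CPUs cannot silently breach the min watermark.
 */
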
int calculate_normal_threshold(struct zone *zone)
{
        int threshold;
        int mem;        /* memory in 128 MB units */

        /*
         * The threshold scales with the number of processors and the amount
         * of memory per zone. More memory means that we can defer updates for
         * longer, more processors could lead to more contention.
         * fls() is used to have a cheap way of logarithmic scaling.
         *
         * Some sample thresholds:
         *
         * Threshold    Processors      (fls)   Zonesize        fls(mem+1)
         * ------------------------------------------------------------------
         * 8            1               1       0.9-1 GB        4
         * 16           2               2       0.9-1 GB        4
         * 20           2               2       1-2 GB          5
         * 24           2               2       2-4 GB          6
         * 28           2               2       4-8 GB          7
         * 32           2               2       8-16 GB         8
         * 4            2               2       <128M           1
         * 30           4               3       2-4 GB          5
         * 48           4               3       8-16 GB         8
         * 32           8               4       1-2 GB          4
         * 32           8               4       0.9-1GB         4
         * 10           16              5       <128M           1
         * 40           16              5       900M            4
         * 70           64              7       2-4 GB          5
         * 84           64              7       4-8 GB          6
         * 108          512             9       4-8 GB          6
         * 125          1024            10      8-16 GB         8
         * 125          1024            10      16-32 GB        9
         */

        mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT);

        threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));

        /*
         * Maximum threshold is 125
         */
        threshold = min(125, threshold);

        return threshold;
}

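/*
 * Worked example (illustrative, not from the source): on a machine with
 * 2 online CPUs and a zone of about 2 GB, mem = 2048 MB / 128 MB = 16,
 * fls(2) = 2 and fls(16) = 5, so threshold = 2 * 2 * (1 + 5) = 24, which
 * matches the 2-4 GB row in the sample table above. Up to 24 counter
 * updates per CPU can therefore sit in the per-cpu diffs before being
 * folded into the global counters.
 */
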
/*
 * Refresh the thresholds for each zone.
 */
void refresh_zone_stat_thresholds(void)
{
        struct pglist_data *pgdat;
        struct zone *zone;
        int cpu;
        int threshold;

        /* Zero current pgdat thresholds */
        for_each_online_pgdat(pgdat) {
                for_each_online_cpu(cpu) {
                        per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
                }
        }

        for_each_populated_zone(zone) {
                struct pglist_data *pgdat = zone->zone_pgdat;
                unsigned long max_drift, tolerate_drift;

                threshold = calculate_normal_threshold(zone);

                for_each_online_cpu(cpu) {
                        int pgdat_threshold;

                        per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
                                                        = threshold;

                        /* Base nodestat threshold on the largest populated zone. */
                        pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
                        per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
                                = max(threshold, pgdat_threshold);
                }

                /*
                 * Only set percpu_drift_mark if there is a danger that
                 * NR_FREE_PAGES reports the low watermark is ok when in fact
                 * the min watermark could be breached by an allocation
                 */
                tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
                max_drift = num_online_cpus() * threshold;
                if (max_drift > tolerate_drift)
                        zone->percpu_drift_mark = high_wmark_pages(zone) +
                                        max_drift;
        }
}

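/*
 * Worked example (illustrative, not from the source): with 8 online CPUs and
 * a threshold of 32, up to 8 * 32 = 256 free pages may be "hidden" in per-cpu
 * diffs. If the low-to-min watermark gap is only 200 pages, that drift could
 * mask a real breach of the min watermark, so percpu_drift_mark is set to
 * high_wmark_pages(zone) + 256 and callers can fall back to a precise
 * snapshot of NR_FREE_PAGES once free memory drops below that mark.
 */
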
void set_pgdat_percpu_threshold(pg_data_t *pgdat,
                                int (*calculate_pressure)(struct zone *))
{
        struct zone *zone;
        int cpu;
        int threshold;
        int i;

        for (i = 0; i < pgdat->nr_zones; i++) {
                zone = &pgdat->node_zones[i];
                if (!zone->percpu_drift_mark)
                        continue;

                threshold = (*calculate_pressure)(zone);
                for_each_online_cpu(cpu)
                        per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
                                                        = threshold;
        }
}

/*
 * For use when we know that interrupts are disabled,
 * or when we know that preemption is disabled and that
 * particular counter cannot be updated from interrupt context.
 */
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
                           long delta)
{
        struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
        s8 __percpu *p = pcp->vm_stat_diff + item;
        long x;
        long t;

        x = delta + __this_cpu_read(*p);

        t = __this_cpu_read(pcp->stat_threshold);

        if (unlikely(abs(x) > t)) {
                zone_page_state_add(x, zone, item);
                x = 0;
        }
        __this_cpu_write(*p, x);
}
EXPORT_SYMBOL(__mod_zone_page_state);

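/*
 * Worked example (illustrative, not from the source): with stat_threshold
 * t = 32 and a per-cpu diff currently at 30, a call with delta = +5 gives
 * x = 35. Since |35| > 32, the whole 35 is folded into the zone-wide atomic
 * counter and the per-cpu diff is reset to 0; a smaller delta would simply
 * have been stored back into the per-cpu diff with no atomic update at all.
 */
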
75ef7184
MG
334void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
335 long delta)
336{
337 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
338 s8 __percpu *p = pcp->vm_node_stat_diff + item;
339 long x;
340 long t;
341
ea426c2a 342 if (vmstat_item_in_bytes(item)) {
629484ae
JW
343 /*
344 * Only cgroups use subpage accounting right now; at
345 * the global level, these items still change in
346 * multiples of whole pages. Store them as pages
347 * internally to keep the per-cpu counters compact.
348 */
ea426c2a
RG
349 VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
350 delta >>= PAGE_SHIFT;
351 }
352
75ef7184
MG
353 x = delta + __this_cpu_read(*p);
354
355 t = __this_cpu_read(pcp->stat_threshold);
356
40610076 357 if (unlikely(abs(x) > t)) {
75ef7184
MG
358 node_page_state_add(x, pgdat, item);
359 x = 0;
360 }
361 __this_cpu_write(*p, x);
362}
363EXPORT_SYMBOL(__mod_node_page_state);
364
/*
 * Optimized increment and decrement functions.
 *
 * These are only for a single page and therefore can take a struct page *
 * argument instead of struct zone *. This allows the inclusion of the code
 * generated for page_zone(page) into the optimized functions.
 *
 * No overflow check is necessary and therefore the differential can be
 * incremented or decremented in place which may allow the compilers to
 * generate better code.
 * The increment or decrement is known and therefore one boundary check can
 * be omitted.
 *
 * NOTE: These functions are very performance sensitive. Change only
 * with care.
 *
 * Some processors have inc/dec instructions that are atomic vs an interrupt.
 * However, the code must first determine the differential location in a zone
 * based on the processor number and then inc/dec the counter. There is no
 * guarantee without disabling preemption that the processor will not change
 * in between and therefore the atomicity vs. interrupt cannot be exploited
 * in a useful way here.
 */
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
        struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
        s8 __percpu *p = pcp->vm_stat_diff + item;
        s8 v, t;

        v = __this_cpu_inc_return(*p);
        t = __this_cpu_read(pcp->stat_threshold);
        if (unlikely(v > t)) {
                s8 overstep = t >> 1;

                zone_page_state_add(v + overstep, zone, item);
                __this_cpu_write(*p, -overstep);
        }
}
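
/*
 * Worked example (illustrative, not from the source): with t = 32 the
 * overstep is 16. When the per-cpu diff reaches 33, 33 + 16 = 49 is added to
 * the zone counter and the diff is rewritten as -16, so roughly 48 further
 * increments can be absorbed locally before the threshold is crossed again.
 * Overstepping reduces the frequency of global updates for monotonically
 * increasing counters at the cost of a slightly larger transient error.
 */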
ca889e6c 403
75ef7184
MG
404void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
405{
406 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
407 s8 __percpu *p = pcp->vm_node_stat_diff + item;
408 s8 v, t;
409
ea426c2a
RG
410 VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
411
75ef7184
MG
412 v = __this_cpu_inc_return(*p);
413 t = __this_cpu_read(pcp->stat_threshold);
414 if (unlikely(v > t)) {
415 s8 overstep = t >> 1;
416
417 node_page_state_add(v + overstep, pgdat, item);
418 __this_cpu_write(*p, -overstep);
419 }
420}
421
ca889e6c
CL
422void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
423{
424 __inc_zone_state(page_zone(page), item);
425}
2244b95a
CL
426EXPORT_SYMBOL(__inc_zone_page_state);
427
75ef7184
MG
428void __inc_node_page_state(struct page *page, enum node_stat_item item)
429{
430 __inc_node_state(page_pgdat(page), item);
431}
432EXPORT_SYMBOL(__inc_node_page_state);
433
c8785385 434void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
2244b95a 435{
28f836b6 436 struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
12938a92
CL
437 s8 __percpu *p = pcp->vm_stat_diff + item;
438 s8 v, t;
2244b95a 439
908ee0f1 440 v = __this_cpu_dec_return(*p);
12938a92
CL
441 t = __this_cpu_read(pcp->stat_threshold);
442 if (unlikely(v < - t)) {
443 s8 overstep = t >> 1;
2244b95a 444
12938a92
CL
445 zone_page_state_add(v - overstep, zone, item);
446 __this_cpu_write(*p, overstep);
2244b95a
CL
447 }
448}
c8785385 449
75ef7184
MG
450void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
451{
452 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
453 s8 __percpu *p = pcp->vm_node_stat_diff + item;
454 s8 v, t;
455
ea426c2a
RG
456 VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
457
75ef7184
MG
458 v = __this_cpu_dec_return(*p);
459 t = __this_cpu_read(pcp->stat_threshold);
460 if (unlikely(v < - t)) {
461 s8 overstep = t >> 1;
462
463 node_page_state_add(v - overstep, pgdat, item);
464 __this_cpu_write(*p, overstep);
465 }
466}
467
c8785385
CL
468void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
469{
470 __dec_zone_state(page_zone(page), item);
471}
2244b95a
CL
472EXPORT_SYMBOL(__dec_zone_page_state);
473
75ef7184
MG
474void __dec_node_page_state(struct page *page, enum node_stat_item item)
475{
476 __dec_node_state(page_pgdat(page), item);
477}
478EXPORT_SYMBOL(__dec_node_page_state);
479
#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
/*
 * If we have cmpxchg_local support then we do not need to incur the overhead
 * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
 *
 * mod_state() modifies the zone counter state through atomic per cpu
 * operations.
 *
 * Overstep mode specifies how overstep should be handled:
 *     0       No overstepping
 *     1       Overstepping half of threshold
 *     -1      Overstepping minus half of threshold
 */
static inline void mod_zone_state(struct zone *zone,
       enum zone_stat_item item, long delta, int overstep_mode)
{
        struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
        s8 __percpu *p = pcp->vm_stat_diff + item;
        long o, n, t, z;

        do {
                z = 0;  /* overflow to zone counters */

                /*
                 * The fetching of the stat_threshold is racy. We may apply
                 * a counter threshold to the wrong cpu if we get
                 * rescheduled while executing here. However, the next
                 * counter update will apply the threshold again and
                 * therefore bring the counter under the threshold again.
                 *
                 * Most of the time the thresholds are the same anyways
                 * for all cpus in a zone.
                 */
                t = this_cpu_read(pcp->stat_threshold);

                o = this_cpu_read(*p);
                n = delta + o;

                if (abs(n) > t) {
                        int os = overstep_mode * (t >> 1);

                        /* Overflow must be added to zone counters */
                        z = n + os;
                        n = -os;
                }
        } while (this_cpu_cmpxchg(*p, o, n) != o);

        if (z)
                zone_page_state_add(z, zone, item);
}
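
/*
 * Illustrative trace (not from the source): suppose t = 32, the per-cpu diff
 * o = 31 and delta = +1 with overstep_mode = 1. Then n = 32, which is not
 * above t, so the loop tries to publish n = 32. If an interrupt updated the
 * same diff between this_cpu_read() and this_cpu_cmpxchg(), the cmpxchg sees
 * a value other than o, fails, and the loop simply re-reads and recomputes;
 * no interrupt disabling is needed for correctness.
 */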
530
531void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
6cdb18ad 532 long delta)
7c839120 533{
75ef7184 534 mod_zone_state(zone, item, delta, 0);
7c839120
CL
535}
536EXPORT_SYMBOL(mod_zone_page_state);
537
7c839120
CL
538void inc_zone_page_state(struct page *page, enum zone_stat_item item)
539{
75ef7184 540 mod_zone_state(page_zone(page), item, 1, 1);
7c839120
CL
541}
542EXPORT_SYMBOL(inc_zone_page_state);
543
544void dec_zone_page_state(struct page *page, enum zone_stat_item item)
545{
75ef7184 546 mod_zone_state(page_zone(page), item, -1, -1);
7c839120
CL
547}
548EXPORT_SYMBOL(dec_zone_page_state);
75ef7184
MG
549
550static inline void mod_node_state(struct pglist_data *pgdat,
551 enum node_stat_item item, int delta, int overstep_mode)
552{
553 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
554 s8 __percpu *p = pcp->vm_node_stat_diff + item;
555 long o, n, t, z;
556
ea426c2a 557 if (vmstat_item_in_bytes(item)) {
629484ae
JW
558 /*
559 * Only cgroups use subpage accounting right now; at
560 * the global level, these items still change in
561 * multiples of whole pages. Store them as pages
562 * internally to keep the per-cpu counters compact.
563 */
ea426c2a
RG
564 VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
565 delta >>= PAGE_SHIFT;
566 }
567
75ef7184
MG
568 do {
569 z = 0; /* overflow to node counters */
570
571 /*
572 * The fetching of the stat_threshold is racy. We may apply
573 * a counter threshold to the wrong the cpu if we get
574 * rescheduled while executing here. However, the next
575 * counter update will apply the threshold again and
576 * therefore bring the counter under the threshold again.
577 *
578 * Most of the time the thresholds are the same anyways
579 * for all cpus in a node.
580 */
581 t = this_cpu_read(pcp->stat_threshold);
582
583 o = this_cpu_read(*p);
584 n = delta + o;
585
40610076 586 if (abs(n) > t) {
75ef7184
MG
587 int os = overstep_mode * (t >> 1) ;
588
589 /* Overflow must be added to node counters */
590 z = n + os;
591 n = -os;
592 }
593 } while (this_cpu_cmpxchg(*p, o, n) != o);
594
595 if (z)
596 node_page_state_add(z, pgdat, item);
597}
598
599void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
600 long delta)
601{
602 mod_node_state(pgdat, item, delta, 0);
603}
604EXPORT_SYMBOL(mod_node_page_state);
605
606void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
607{
608 mod_node_state(pgdat, item, 1, 1);
609}
610
611void inc_node_page_state(struct page *page, enum node_stat_item item)
612{
613 mod_node_state(page_pgdat(page), item, 1, 1);
614}
615EXPORT_SYMBOL(inc_node_page_state);
616
617void dec_node_page_state(struct page *page, enum node_stat_item item)
618{
619 mod_node_state(page_pgdat(page), item, -1, -1);
620}
621EXPORT_SYMBOL(dec_node_page_state);
7c839120
CL
622#else
623/*
624 * Use interrupt disable to serialize counter updates
625 */
626void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
6cdb18ad 627 long delta)
7c839120
CL
628{
629 unsigned long flags;
630
631 local_irq_save(flags);
632 __mod_zone_page_state(zone, item, delta);
633 local_irq_restore(flags);
634}
635EXPORT_SYMBOL(mod_zone_page_state);
636
2244b95a
CL
637void inc_zone_page_state(struct page *page, enum zone_stat_item item)
638{
639 unsigned long flags;
640 struct zone *zone;
2244b95a
CL
641
642 zone = page_zone(page);
643 local_irq_save(flags);
ca889e6c 644 __inc_zone_state(zone, item);
2244b95a
CL
645 local_irq_restore(flags);
646}
647EXPORT_SYMBOL(inc_zone_page_state);
648
649void dec_zone_page_state(struct page *page, enum zone_stat_item item)
650{
651 unsigned long flags;
2244b95a 652
2244b95a 653 local_irq_save(flags);
a302eb4e 654 __dec_zone_page_state(page, item);
2244b95a
CL
655 local_irq_restore(flags);
656}
657EXPORT_SYMBOL(dec_zone_page_state);
658
75ef7184
MG
659void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
660{
661 unsigned long flags;
662
663 local_irq_save(flags);
664 __inc_node_state(pgdat, item);
665 local_irq_restore(flags);
666}
667EXPORT_SYMBOL(inc_node_state);
668
669void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
670 long delta)
671{
672 unsigned long flags;
673
674 local_irq_save(flags);
675 __mod_node_page_state(pgdat, item, delta);
676 local_irq_restore(flags);
677}
678EXPORT_SYMBOL(mod_node_page_state);
679
680void inc_node_page_state(struct page *page, enum node_stat_item item)
681{
682 unsigned long flags;
683 struct pglist_data *pgdat;
684
685 pgdat = page_pgdat(page);
686 local_irq_save(flags);
687 __inc_node_state(pgdat, item);
688 local_irq_restore(flags);
689}
690EXPORT_SYMBOL(inc_node_page_state);
691
692void dec_node_page_state(struct page *page, enum node_stat_item item)
693{
694 unsigned long flags;
695
696 local_irq_save(flags);
697 __dec_node_page_state(page, item);
698 local_irq_restore(flags);
699}
700EXPORT_SYMBOL(dec_node_page_state);
701#endif
7cc36bbd
CL
702
703/*
704 * Fold a differential into the global counters.
705 * Returns the number of counters updated.
706 */
f19298b9 707static int fold_diff(int *zone_diff, int *node_diff)
3a321d2a
KW
708{
709 int i;
710 int changes = 0;
711
712 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
713 if (zone_diff[i]) {
714 atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
715 changes++;
716 }
717
3a321d2a
KW
718 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
719 if (node_diff[i]) {
720 atomic_long_add(node_diff[i], &vm_node_stat[i]);
721 changes++;
722 }
723 return changes;
724}
f19298b9
MG
725
726#ifdef CONFIG_NUMA
727static void fold_vm_zone_numa_events(struct zone *zone)
4edb0748 728{
f19298b9
MG
729 unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
730 int cpu;
731 enum numa_stat_item item;
4edb0748 732
f19298b9
MG
733 for_each_online_cpu(cpu) {
734 struct per_cpu_zonestat *pzstats;
75ef7184 735
f19298b9
MG
736 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
737 for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
738 zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
7cc36bbd 739 }
f19298b9
MG
740
741 for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
742 zone_numa_event_add(zone_numa_events[item], zone, item);
4edb0748 743}
f19298b9
MG
744
745void fold_vm_numa_events(void)
746{
747 struct zone *zone;
748
749 for_each_populated_zone(zone)
750 fold_vm_zone_numa_events(zone);
751}
752#endif
4edb0748 753
/*
 * Update the zone counters for the current cpu.
 *
 * Note that refresh_cpu_vm_stats strives to only access
 * node local memory. The per cpu pagesets on remote zones are placed
 * in the memory local to the processor using that pageset. So the
 * loop over all zones will access a series of cachelines local to
 * the processor.
 *
 * The call to zone_page_state_add updates the cachelines with the
 * statistics in the remote zone struct as well as the global cachelines
 * with the global counters. These could cause remote node cache line
 * bouncing and will have to be only done when necessary.
 *
 * The function returns the number of global counters updated.
 */
static int refresh_cpu_vm_stats(bool do_pagesets)
{
        struct pglist_data *pgdat;
        struct zone *zone;
        int i;
        int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
        int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
        int changes = 0;

        for_each_populated_zone(zone) {
                struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
#ifdef CONFIG_NUMA
                struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
#endif

                for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
                        int v;

                        v = this_cpu_xchg(pzstats->vm_stat_diff[i], 0);
                        if (v) {

                                atomic_long_add(v, &zone->vm_stat[i]);
                                global_zone_diff[i] += v;
#ifdef CONFIG_NUMA
                                /* 3 seconds idle till flush */
                                __this_cpu_write(pcp->expire, 3);
#endif
                        }
                }
#ifdef CONFIG_NUMA

                if (do_pagesets) {
                        cond_resched();
                        /*
                         * Deal with draining the remote pageset of this
                         * processor
                         *
                         * Check if there are pages remaining in this pageset
                         * if not then there is nothing to expire.
                         */
                        if (!__this_cpu_read(pcp->expire) ||
                               !__this_cpu_read(pcp->count))
                                continue;

                        /*
                         * We never drain zones local to this processor.
                         */
                        if (zone_to_nid(zone) == numa_node_id()) {
                                __this_cpu_write(pcp->expire, 0);
                                continue;
                        }

                        if (__this_cpu_dec_return(pcp->expire))
                                continue;

                        if (__this_cpu_read(pcp->count)) {
                                drain_zone_pages(zone, this_cpu_ptr(pcp));
                                changes++;
                        }
                }
#endif
        }

        for_each_online_pgdat(pgdat) {
                struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;

                for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
                        int v;

                        v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
                        if (v) {
                                atomic_long_add(v, &pgdat->vm_stat[i]);
                                global_node_diff[i] += v;
                        }
                }
        }

        changes += fold_diff(global_zone_diff, global_node_diff);
        return changes;
}
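
/*
 * Timing sketch (illustrative, not from the source): the "expire" field set
 * to 3 above acts as a countdown in units of the vmstat update interval
 * (sysctl_stat_interval, HZ by default). A remote pageset that still holds
 * pages is therefore only drained after roughly three consecutive idle
 * intervals, i.e. about 3 seconds, which avoids draining pagesets that are
 * still being actively refilled.
 */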
850
2bb921e5
CL
851/*
852 * Fold the data for an offline cpu into the global array.
853 * There cannot be any access by the offline cpu and therefore
854 * synchronization is simplified.
855 */
856void cpu_vm_stats_fold(int cpu)
857{
75ef7184 858 struct pglist_data *pgdat;
2bb921e5
CL
859 struct zone *zone;
860 int i;
75ef7184
MG
861 int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
862 int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
2bb921e5
CL
863
864 for_each_populated_zone(zone) {
28f836b6 865 struct per_cpu_zonestat *pzstats;
2bb921e5 866
28f836b6 867 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
2bb921e5 868
f19298b9 869 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
28f836b6 870 if (pzstats->vm_stat_diff[i]) {
2bb921e5
CL
871 int v;
872
28f836b6
MG
873 v = pzstats->vm_stat_diff[i];
874 pzstats->vm_stat_diff[i] = 0;
2bb921e5 875 atomic_long_add(v, &zone->vm_stat[i]);
75ef7184 876 global_zone_diff[i] += v;
2bb921e5 877 }
f19298b9 878 }
3a321d2a 879#ifdef CONFIG_NUMA
f19298b9
MG
880 for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
881 if (pzstats->vm_numa_event[i]) {
882 unsigned long v;
3a321d2a 883
f19298b9
MG
884 v = pzstats->vm_numa_event[i];
885 pzstats->vm_numa_event[i] = 0;
886 zone_numa_event_add(v, zone, i);
3a321d2a 887 }
f19298b9 888 }
3a321d2a 889#endif
2bb921e5
CL
890 }
891
75ef7184
MG
892 for_each_online_pgdat(pgdat) {
893 struct per_cpu_nodestat *p;
894
895 p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
896
897 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
898 if (p->vm_node_stat_diff[i]) {
899 int v;
900
901 v = p->vm_node_stat_diff[i];
902 p->vm_node_stat_diff[i] = 0;
903 atomic_long_add(v, &pgdat->vm_stat[i]);
904 global_node_diff[i] += v;
905 }
906 }
907
908 fold_diff(global_zone_diff, global_node_diff);
2bb921e5
CL
909}
910
40f4b1ea
CS
911/*
912 * this is only called if !populated_zone(zone), which implies no other users of
f0953a1b 913 * pset->vm_stat_diff[] exist.
40f4b1ea 914 */
28f836b6 915void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats)
5a883813 916{
f19298b9 917 unsigned long v;
5a883813
MK
918 int i;
919
f19298b9 920 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
28f836b6 921 if (pzstats->vm_stat_diff[i]) {
f19298b9 922 v = pzstats->vm_stat_diff[i];
28f836b6 923 pzstats->vm_stat_diff[i] = 0;
f19298b9 924 zone_page_state_add(v, zone, i);
5a883813 925 }
f19298b9 926 }
3a321d2a
KW
927
928#ifdef CONFIG_NUMA
f19298b9
MG
929 for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
930 if (pzstats->vm_numa_event[i]) {
931 v = pzstats->vm_numa_event[i];
932 pzstats->vm_numa_event[i] = 0;
933 zone_numa_event_add(v, zone, i);
3a321d2a 934 }
f19298b9 935 }
3a321d2a 936#endif
5a883813 937}
2244b95a
CL
938#endif
939
ca889e6c 940#ifdef CONFIG_NUMA
c2d42c16 941/*
75ef7184
MG
942 * Determine the per node value of a stat item. This function
943 * is called frequently in a NUMA machine, so try to be as
944 * frugal as possible.
c2d42c16 945 */
75ef7184
MG
946unsigned long sum_zone_node_page_state(int node,
947 enum zone_stat_item item)
c2d42c16
AM
948{
949 struct zone *zones = NODE_DATA(node)->node_zones;
e87d59f7
JK
950 int i;
951 unsigned long count = 0;
c2d42c16 952
e87d59f7
JK
953 for (i = 0; i < MAX_NR_ZONES; i++)
954 count += zone_page_state(zones + i, item);
955
956 return count;
c2d42c16
AM
957}
958
f19298b9
MG
959/* Determine the per node value of a numa stat item. */
960unsigned long sum_zone_numa_event_state(int node,
3a321d2a
KW
961 enum numa_stat_item item)
962{
963 struct zone *zones = NODE_DATA(node)->node_zones;
3a321d2a 964 unsigned long count = 0;
f19298b9 965 int i;
3a321d2a
KW
966
967 for (i = 0; i < MAX_NR_ZONES; i++)
f19298b9 968 count += zone_numa_event_state(zones + i, item);
3a321d2a
KW
969
970 return count;
971}
972
75ef7184
MG
973/*
974 * Determine the per node value of a stat item.
975 */
ea426c2a
RG
976unsigned long node_page_state_pages(struct pglist_data *pgdat,
977 enum node_stat_item item)
75ef7184
MG
978{
979 long x = atomic_long_read(&pgdat->vm_stat[item]);
980#ifdef CONFIG_SMP
981 if (x < 0)
982 x = 0;
983#endif
984 return x;
985}
ea426c2a
RG
986
987unsigned long node_page_state(struct pglist_data *pgdat,
988 enum node_stat_item item)
989{
990 VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
991
992 return node_page_state_pages(pgdat, item);
993}
ca889e6c
CL
994#endif
995
#ifdef CONFIG_COMPACTION

struct contig_page_info {
        unsigned long free_pages;
        unsigned long free_blocks_total;
        unsigned long free_blocks_suitable;
};

/*
 * Calculate the number of free pages in a zone, how many contiguous
 * pages are free and how many are large enough to satisfy an allocation of
 * the target size. Note that this function makes no attempt to estimate
 * how many suitable free blocks there *might* be if MOVABLE pages were
 * migrated. Calculating that is possible, but expensive and can be
 * figured out from userspace
 */
static void fill_contig_page_info(struct zone *zone,
                                unsigned int suitable_order,
                                struct contig_page_info *info)
{
        unsigned int order;

        info->free_pages = 0;
        info->free_blocks_total = 0;
        info->free_blocks_suitable = 0;

        for (order = 0; order < MAX_ORDER; order++) {
                unsigned long blocks;

                /* Count number of free blocks */
                blocks = zone->free_area[order].nr_free;
                info->free_blocks_total += blocks;

                /* Count free base pages */
                info->free_pages += blocks << order;

                /* Count the suitable free blocks */
                if (order >= suitable_order)
                        info->free_blocks_suitable += blocks <<
                                                (order - suitable_order);
        }
}
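
/*
 * Worked example (illustrative, not from the source): with 100 free order-0
 * blocks, 10 free order-2 blocks and suitable_order = 1, the function
 * reports free_pages = 100 + (10 << 2) = 140, free_blocks_total = 110 and
 * free_blocks_suitable = 10 << (2 - 1) = 20, since only the order-2 blocks
 * can satisfy an order-1 request.
 */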

/*
 * A fragmentation index only makes sense if an allocation of a requested
 * size would fail. If that is true, the fragmentation index indicates
 * whether external fragmentation or a lack of memory was the problem.
 * The value can be used to determine if page reclaim or compaction
 * should be used
 */
static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
{
        unsigned long requested = 1UL << order;

        if (WARN_ON_ONCE(order >= MAX_ORDER))
                return 0;

        if (!info->free_blocks_total)
                return 0;

        /* Fragmentation index only makes sense when a request would fail */
        if (info->free_blocks_suitable)
                return -1000;

        /*
         * Index is between 0 and 1 so return within 3 decimal places
         *
         * 0 => allocation would fail due to lack of memory
         * 1 => allocation would fail due to fragmentation
         */
        return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
}

/*
 * Calculates external fragmentation within a zone wrt the given order.
 * It is defined as the percentage of pages found in blocks of size
 * less than 1 << order. It returns values in range [0, 100].
 */
unsigned int extfrag_for_order(struct zone *zone, unsigned int order)
{
        struct contig_page_info info;

        fill_contig_page_info(zone, order, &info);
        if (info.free_pages == 0)
                return 0;

        return div_u64((info.free_pages -
                        (info.free_blocks_suitable << order)) * 100,
                        info.free_pages);
}
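
/*
 * Worked example (illustrative, not from the source): with order = 3,
 * free_pages = 1000 and free_blocks_suitable = 100, the pages sitting in
 * suitably sized blocks amount to 100 << 3 = 800, so the external
 * fragmentation is (1000 - 800) * 100 / 1000 = 20%.
 */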

/* Same as __fragmentation_index but allocs contig_page_info on stack */
int fragmentation_index(struct zone *zone, unsigned int order)
{
        struct contig_page_info info;

        fill_contig_page_info(zone, order, &info);
        return __fragmentation_index(order, &info);
}
#endif
1096
ebc5d83d
KK
1097#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \
1098 defined(CONFIG_NUMA) || defined(CONFIG_MEMCG)
fa25c503
KM
1099#ifdef CONFIG_ZONE_DMA
1100#define TEXT_FOR_DMA(xx) xx "_dma",
1101#else
1102#define TEXT_FOR_DMA(xx)
1103#endif
1104
1105#ifdef CONFIG_ZONE_DMA32
1106#define TEXT_FOR_DMA32(xx) xx "_dma32",
1107#else
1108#define TEXT_FOR_DMA32(xx)
1109#endif
1110
1111#ifdef CONFIG_HIGHMEM
1112#define TEXT_FOR_HIGHMEM(xx) xx "_high",
1113#else
1114#define TEXT_FOR_HIGHMEM(xx)
1115#endif
1116
1117#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
1118 TEXT_FOR_HIGHMEM(xx) xx "_movable",
1119
1120const char * const vmstat_text[] = {
8d92890b 1121 /* enum zone_stat_item counters */
fa25c503 1122 "nr_free_pages",
71c799f4
MK
1123 "nr_zone_inactive_anon",
1124 "nr_zone_active_anon",
1125 "nr_zone_inactive_file",
1126 "nr_zone_active_file",
1127 "nr_zone_unevictable",
5a1c84b4 1128 "nr_zone_write_pending",
fa25c503 1129 "nr_mlock",
fa25c503 1130 "nr_bounce",
91537fee
MK
1131#if IS_ENABLED(CONFIG_ZSMALLOC)
1132 "nr_zspages",
1133#endif
3a321d2a
KW
1134 "nr_free_cma",
1135
1136 /* enum numa_stat_item counters */
fa25c503
KM
1137#ifdef CONFIG_NUMA
1138 "numa_hit",
1139 "numa_miss",
1140 "numa_foreign",
1141 "numa_interleave",
1142 "numa_local",
1143 "numa_other",
1144#endif
09316c09 1145
9d7ea9a2 1146 /* enum node_stat_item counters */
599d0c95
MG
1147 "nr_inactive_anon",
1148 "nr_active_anon",
1149 "nr_inactive_file",
1150 "nr_active_file",
1151 "nr_unevictable",
385386cf
JW
1152 "nr_slab_reclaimable",
1153 "nr_slab_unreclaimable",
599d0c95
MG
1154 "nr_isolated_anon",
1155 "nr_isolated_file",
68d48e6a 1156 "workingset_nodes",
170b04b7
JK
1157 "workingset_refault_anon",
1158 "workingset_refault_file",
1159 "workingset_activate_anon",
1160 "workingset_activate_file",
1161 "workingset_restore_anon",
1162 "workingset_restore_file",
1e6b1085 1163 "workingset_nodereclaim",
50658e2e
MG
1164 "nr_anon_pages",
1165 "nr_mapped",
11fb9989
MG
1166 "nr_file_pages",
1167 "nr_dirty",
1168 "nr_writeback",
1169 "nr_writeback_temp",
1170 "nr_shmem",
1171 "nr_shmem_hugepages",
1172 "nr_shmem_pmdmapped",
60fbf0ab
SL
1173 "nr_file_hugepages",
1174 "nr_file_pmdmapped",
11fb9989 1175 "nr_anon_transparent_hugepages",
c4a25635
MG
1176 "nr_vmscan_write",
1177 "nr_vmscan_immediate_reclaim",
1178 "nr_dirtied",
1179 "nr_written",
b29940c1 1180 "nr_kernel_misc_reclaimable",
1970dc6f
JH
1181 "nr_foll_pin_acquired",
1182 "nr_foll_pin_released",
991e7673
SB
1183 "nr_kernel_stack",
1184#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
1185 "nr_shadow_call_stack",
1186#endif
f0c0c115 1187 "nr_page_table_pages",
b6038942
SB
1188#ifdef CONFIG_SWAP
1189 "nr_swapcached",
1190#endif
599d0c95 1191
09316c09 1192 /* enum writeback_stat_item counters */
fa25c503
KM
1193 "nr_dirty_threshold",
1194 "nr_dirty_background_threshold",
1195
ebc5d83d 1196#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
09316c09 1197 /* enum vm_event_item counters */
fa25c503
KM
1198 "pgpgin",
1199 "pgpgout",
1200 "pswpin",
1201 "pswpout",
1202
1203 TEXTS_FOR_ZONES("pgalloc")
7cc30fcf
MG
1204 TEXTS_FOR_ZONES("allocstall")
1205 TEXTS_FOR_ZONES("pgskip")
fa25c503
KM
1206
1207 "pgfree",
1208 "pgactivate",
1209 "pgdeactivate",
f7ad2a6c 1210 "pglazyfree",
fa25c503
KM
1211
1212 "pgfault",
1213 "pgmajfault",
854e9ed0 1214 "pglazyfreed",
fa25c503 1215
599d0c95 1216 "pgrefill",
798a6b87 1217 "pgreuse",
599d0c95
MG
1218 "pgsteal_kswapd",
1219 "pgsteal_direct",
1220 "pgscan_kswapd",
1221 "pgscan_direct",
68243e76 1222 "pgscan_direct_throttle",
497a6c1b
JW
1223 "pgscan_anon",
1224 "pgscan_file",
1225 "pgsteal_anon",
1226 "pgsteal_file",
fa25c503
KM
1227
1228#ifdef CONFIG_NUMA
1229 "zone_reclaim_failed",
1230#endif
1231 "pginodesteal",
1232 "slabs_scanned",
fa25c503
KM
1233 "kswapd_inodesteal",
1234 "kswapd_low_wmark_hit_quickly",
1235 "kswapd_high_wmark_hit_quickly",
fa25c503 1236 "pageoutrun",
fa25c503
KM
1237
1238 "pgrotated",
1239
5509a5d2
DH
1240 "drop_pagecache",
1241 "drop_slab",
8e675f7a 1242 "oom_kill",
5509a5d2 1243
03c5a6e1
MG
1244#ifdef CONFIG_NUMA_BALANCING
1245 "numa_pte_updates",
72403b4a 1246 "numa_huge_pte_updates",
03c5a6e1
MG
1247 "numa_hint_faults",
1248 "numa_hint_faults_local",
1249 "numa_pages_migrated",
1250#endif
5647bc29
MG
1251#ifdef CONFIG_MIGRATION
1252 "pgmigrate_success",
1253 "pgmigrate_fail",
1a5bae25
AK
1254 "thp_migration_success",
1255 "thp_migration_fail",
1256 "thp_migration_split",
5647bc29 1257#endif
fa25c503 1258#ifdef CONFIG_COMPACTION
397487db
MG
1259 "compact_migrate_scanned",
1260 "compact_free_scanned",
1261 "compact_isolated",
fa25c503
KM
1262 "compact_stall",
1263 "compact_fail",
1264 "compact_success",
698b1b30 1265 "compact_daemon_wake",
7f354a54
DR
1266 "compact_daemon_migrate_scanned",
1267 "compact_daemon_free_scanned",
fa25c503
KM
1268#endif
1269
1270#ifdef CONFIG_HUGETLB_PAGE
1271 "htlb_buddy_alloc_success",
1272 "htlb_buddy_alloc_fail",
bbb26920
MK
1273#endif
1274#ifdef CONFIG_CMA
1275 "cma_alloc_success",
1276 "cma_alloc_fail",
fa25c503
KM
1277#endif
1278 "unevictable_pgs_culled",
1279 "unevictable_pgs_scanned",
1280 "unevictable_pgs_rescued",
1281 "unevictable_pgs_mlocked",
1282 "unevictable_pgs_munlocked",
1283 "unevictable_pgs_cleared",
1284 "unevictable_pgs_stranded",
fa25c503
KM
1285
1286#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1287 "thp_fault_alloc",
1288 "thp_fault_fallback",
85b9f46e 1289 "thp_fault_fallback_charge",
fa25c503
KM
1290 "thp_collapse_alloc",
1291 "thp_collapse_alloc_failed",
95ecedcd 1292 "thp_file_alloc",
dcdf11ee 1293 "thp_file_fallback",
85b9f46e 1294 "thp_file_fallback_charge",
95ecedcd 1295 "thp_file_mapped",
122afea9
KS
1296 "thp_split_page",
1297 "thp_split_page_failed",
f9719a03 1298 "thp_deferred_split_page",
122afea9 1299 "thp_split_pmd",
ce9311cf
YX
1300#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1301 "thp_split_pud",
1302#endif
d8a8e1f0
KS
1303 "thp_zero_page_alloc",
1304 "thp_zero_page_alloc_failed",
225311a4 1305 "thp_swpout",
fe490cc0 1306 "thp_swpout_fallback",
fa25c503 1307#endif
09316c09
KK
1308#ifdef CONFIG_MEMORY_BALLOON
1309 "balloon_inflate",
1310 "balloon_deflate",
1311#ifdef CONFIG_BALLOON_COMPACTION
1312 "balloon_migrate",
1313#endif
1314#endif /* CONFIG_MEMORY_BALLOON */
ec659934 1315#ifdef CONFIG_DEBUG_TLBFLUSH
9824cf97
DH
1316 "nr_tlb_remote_flush",
1317 "nr_tlb_remote_flush_received",
1318 "nr_tlb_local_flush_all",
1319 "nr_tlb_local_flush_one",
ec659934 1320#endif /* CONFIG_DEBUG_TLBFLUSH */
fa25c503 1321
4f115147
DB
1322#ifdef CONFIG_DEBUG_VM_VMACACHE
1323 "vmacache_find_calls",
1324 "vmacache_find_hits",
1325#endif
cbc65df2
HY
1326#ifdef CONFIG_SWAP
1327 "swap_ra",
1328 "swap_ra_hit",
1329#endif
575299ea
S
1330#ifdef CONFIG_X86
1331 "direct_map_level2_splits",
1332 "direct_map_level3_splits",
1333#endif
ebc5d83d 1334#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
fa25c503 1335};
ebc5d83d 1336#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
fa25c503 1337
3c486871
AM
1338#if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
1339 defined(CONFIG_PROC_FS)
1340static void *frag_start(struct seq_file *m, loff_t *pos)
1341{
1342 pg_data_t *pgdat;
1343 loff_t node = *pos;
1344
1345 for (pgdat = first_online_pgdat();
1346 pgdat && node;
1347 pgdat = next_online_pgdat(pgdat))
1348 --node;
1349
1350 return pgdat;
1351}
1352
1353static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
1354{
1355 pg_data_t *pgdat = (pg_data_t *)arg;
1356
1357 (*pos)++;
1358 return next_online_pgdat(pgdat);
1359}
1360
1361static void frag_stop(struct seq_file *m, void *arg)
1362{
1363}
1364
b2bd8598
DR
1365/*
1366 * Walk zones in a node and print using a callback.
1367 * If @assert_populated is true, only use callback for zones that are populated.
1368 */
3c486871 1369static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
727c080f 1370 bool assert_populated, bool nolock,
3c486871
AM
1371 void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
1372{
1373 struct zone *zone;
1374 struct zone *node_zones = pgdat->node_zones;
1375 unsigned long flags;
1376
1377 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
b2bd8598 1378 if (assert_populated && !populated_zone(zone))
3c486871
AM
1379 continue;
1380
727c080f
VM
1381 if (!nolock)
1382 spin_lock_irqsave(&zone->lock, flags);
3c486871 1383 print(m, pgdat, zone);
727c080f
VM
1384 if (!nolock)
1385 spin_unlock_irqrestore(&zone->lock, flags);
3c486871
AM
1386 }
1387}
1388#endif
1389
d7a5752c 1390#ifdef CONFIG_PROC_FS
467c996c
MG
1391static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
1392 struct zone *zone)
1393{
1394 int order;
1395
1396 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1397 for (order = 0; order < MAX_ORDER; ++order)
1398 seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
1399 seq_putc(m, '\n');
1400}
1401
1402/*
1403 * This walks the free areas for each zone.
1404 */
1405static int frag_show(struct seq_file *m, void *arg)
1406{
1407 pg_data_t *pgdat = (pg_data_t *)arg;
727c080f 1408 walk_zones_in_node(m, pgdat, true, false, frag_show_print);
467c996c
MG
1409 return 0;
1410}
1411
1412static void pagetypeinfo_showfree_print(struct seq_file *m,
1413 pg_data_t *pgdat, struct zone *zone)
1414{
1415 int order, mtype;
1416
1417 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
1418 seq_printf(m, "Node %4d, zone %8s, type %12s ",
1419 pgdat->node_id,
1420 zone->name,
1421 migratetype_names[mtype]);
1422 for (order = 0; order < MAX_ORDER; ++order) {
1423 unsigned long freecount = 0;
1424 struct free_area *area;
1425 struct list_head *curr;
93b3a674 1426 bool overflow = false;
467c996c
MG
1427
1428 area = &(zone->free_area[order]);
1429
93b3a674
MH
1430 list_for_each(curr, &area->free_list[mtype]) {
1431 /*
1432 * Cap the free_list iteration because it might
1433 * be really large and we are under a spinlock
1434 * so a long time spent here could trigger a
1435 * hard lockup detector. Anyway this is a
1436 * debugging tool so knowing there is a handful
1437 * of pages of this order should be more than
1438 * sufficient.
1439 */
1440 if (++freecount >= 100000) {
1441 overflow = true;
1442 break;
1443 }
1444 }
1445 seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount);
1446 spin_unlock_irq(&zone->lock);
1447 cond_resched();
1448 spin_lock_irq(&zone->lock);
467c996c 1449 }
f6ac2354
CL
1450 seq_putc(m, '\n');
1451 }
467c996c
MG
1452}
1453
1454/* Print out the free pages at each order for each migatetype */
1455static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
1456{
1457 int order;
1458 pg_data_t *pgdat = (pg_data_t *)arg;
1459
1460 /* Print header */
1461 seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
1462 for (order = 0; order < MAX_ORDER; ++order)
1463 seq_printf(m, "%6d ", order);
1464 seq_putc(m, '\n');
1465
727c080f 1466 walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);
467c996c
MG
1467
1468 return 0;
1469}
1470
1471static void pagetypeinfo_showblockcount_print(struct seq_file *m,
1472 pg_data_t *pgdat, struct zone *zone)
1473{
1474 int mtype;
1475 unsigned long pfn;
1476 unsigned long start_pfn = zone->zone_start_pfn;
108bcc96 1477 unsigned long end_pfn = zone_end_pfn(zone);
467c996c
MG
1478 unsigned long count[MIGRATE_TYPES] = { 0, };
1479
1480 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
1481 struct page *page;
1482
d336e94e
MH
1483 page = pfn_to_online_page(pfn);
1484 if (!page)
467c996c
MG
1485 continue;
1486
a91c43c7
JK
1487 if (page_zone(page) != zone)
1488 continue;
1489
467c996c
MG
1490 mtype = get_pageblock_migratetype(page);
1491
e80d6a24
MG
1492 if (mtype < MIGRATE_TYPES)
1493 count[mtype]++;
467c996c
MG
1494 }
1495
1496 /* Print counts */
1497 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1498 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1499 seq_printf(m, "%12lu ", count[mtype]);
1500 seq_putc(m, '\n');
1501}
1502
f113e641 1503/* Print out the number of pageblocks for each migratetype */
467c996c
MG
1504static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
1505{
1506 int mtype;
1507 pg_data_t *pgdat = (pg_data_t *)arg;
1508
1509 seq_printf(m, "\n%-23s", "Number of blocks type ");
1510 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1511 seq_printf(m, "%12s ", migratetype_names[mtype]);
1512 seq_putc(m, '\n');
727c080f
VM
1513 walk_zones_in_node(m, pgdat, true, false,
1514 pagetypeinfo_showblockcount_print);
467c996c
MG
1515
1516 return 0;
1517}
1518
48c96a36
JK
1519/*
1520 * Print out the number of pageblocks for each migratetype that contain pages
1521 * of other types. This gives an indication of how well fallbacks are being
1522 * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
1523 * to determine what is going on
1524 */
1525static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
1526{
1527#ifdef CONFIG_PAGE_OWNER
1528 int mtype;
1529
7dd80b8a 1530 if (!static_branch_unlikely(&page_owner_inited))
48c96a36
JK
1531 return;
1532
1533 drain_all_pages(NULL);
1534
1535 seq_printf(m, "\n%-23s", "Number of mixed blocks ");
1536 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1537 seq_printf(m, "%12s ", migratetype_names[mtype]);
1538 seq_putc(m, '\n');
1539
727c080f
VM
1540 walk_zones_in_node(m, pgdat, true, true,
1541 pagetypeinfo_showmixedcount_print);
48c96a36
JK
1542#endif /* CONFIG_PAGE_OWNER */
1543}
1544
467c996c
MG
1545/*
1546 * This prints out statistics in relation to grouping pages by mobility.
1547 * It is expensive to collect so do not constantly read the file.
1548 */
1549static int pagetypeinfo_show(struct seq_file *m, void *arg)
1550{
1551 pg_data_t *pgdat = (pg_data_t *)arg;
1552
41b25a37 1553 /* check memoryless node */
a47b53c5 1554 if (!node_state(pgdat->node_id, N_MEMORY))
41b25a37
KM
1555 return 0;
1556
467c996c
MG
1557 seq_printf(m, "Page block order: %d\n", pageblock_order);
1558 seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages);
1559 seq_putc(m, '\n');
1560 pagetypeinfo_showfree(m, pgdat);
1561 pagetypeinfo_showblockcount(m, pgdat);
48c96a36 1562 pagetypeinfo_showmixedcount(m, pgdat);
467c996c 1563
f6ac2354
CL
1564 return 0;
1565}
1566
8f32f7e5 1567static const struct seq_operations fragmentation_op = {
f6ac2354
CL
1568 .start = frag_start,
1569 .next = frag_next,
1570 .stop = frag_stop,
1571 .show = frag_show,
1572};
1573
74e2e8e8 1574static const struct seq_operations pagetypeinfo_op = {
467c996c
MG
1575 .start = frag_start,
1576 .next = frag_next,
1577 .stop = frag_stop,
1578 .show = pagetypeinfo_show,
1579};
1580
e2ecc8a7
MG
1581static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
1582{
1583 int zid;
1584
1585 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1586 struct zone *compare = &pgdat->node_zones[zid];
1587
1588 if (populated_zone(compare))
1589 return zone == compare;
1590 }
1591
e2ecc8a7
MG
1592 return false;
1593}
1594
467c996c
MG
1595static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1596 struct zone *zone)
f6ac2354 1597{
467c996c
MG
1598 int i;
1599 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
e2ecc8a7
MG
1600 if (is_zone_first_populated(pgdat, zone)) {
1601 seq_printf(m, "\n per-node stats");
1602 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
69473e5d
MS
1603 unsigned long pages = node_page_state_pages(pgdat, i);
1604
1605 if (vmstat_item_print_in_thp(i))
1606 pages /= HPAGE_PMD_NR;
9d7ea9a2 1607 seq_printf(m, "\n %-12s %lu", node_stat_name(i),
69473e5d 1608 pages);
e2ecc8a7
MG
1609 }
1610 }
467c996c
MG
1611 seq_printf(m,
1612 "\n pages free %lu"
1613 "\n min %lu"
1614 "\n low %lu"
1615 "\n high %lu"
467c996c 1616 "\n spanned %lu"
9feedc9d 1617 "\n present %lu"
3c381db1
DH
1618 "\n managed %lu"
1619 "\n cma %lu",
88f5acf8 1620 zone_page_state(zone, NR_FREE_PAGES),
41858966
MG
1621 min_wmark_pages(zone),
1622 low_wmark_pages(zone),
1623 high_wmark_pages(zone),
467c996c 1624 zone->spanned_pages,
9feedc9d 1625 zone->present_pages,
3c381db1
DH
1626 zone_managed_pages(zone),
1627 zone_cma_pages(zone));
467c996c 1628
467c996c 1629 seq_printf(m,
3484b2de 1630 "\n protection: (%ld",
467c996c
MG
1631 zone->lowmem_reserve[0]);
1632 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
3484b2de 1633 seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
7dfb8bf3
DR
1634 seq_putc(m, ')');
1635
a8a4b7ae
BH
1636 /* If unpopulated, no other information is useful */
1637 if (!populated_zone(zone)) {
1638 seq_putc(m, '\n');
1639 return;
1640 }
1641
7dfb8bf3 1642 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
9d7ea9a2
KK
1643 seq_printf(m, "\n %-12s %lu", zone_stat_name(i),
1644 zone_page_state(zone, i));
7dfb8bf3 1645
3a321d2a 1646#ifdef CONFIG_NUMA
f19298b9 1647 for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
9d7ea9a2 1648 seq_printf(m, "\n %-12s %lu", numa_stat_name(i),
f19298b9 1649 zone_numa_event_state(zone, i));
3a321d2a
KW
1650#endif
1651
7dfb8bf3 1652 seq_printf(m, "\n pagesets");
467c996c 1653 for_each_online_cpu(i) {
28f836b6
MG
1654 struct per_cpu_pages *pcp;
1655 struct per_cpu_zonestat __maybe_unused *pzstats;
467c996c 1656
28f836b6 1657 pcp = per_cpu_ptr(zone->per_cpu_pageset, i);
3dfa5721
CL
1658 seq_printf(m,
1659 "\n cpu: %i"
1660 "\n count: %i"
1661 "\n high: %i"
1662 "\n batch: %i",
1663 i,
28f836b6
MG
1664 pcp->count,
1665 pcp->high,
1666 pcp->batch);
df9ecaba 1667#ifdef CONFIG_SMP
28f836b6 1668 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i);
467c996c 1669 seq_printf(m, "\n vm stats threshold: %d",
28f836b6 1670 pzstats->stat_threshold);
df9ecaba 1671#endif
f6ac2354 1672 }
467c996c 1673 seq_printf(m,
599d0c95 1674 "\n node_unreclaimable: %u"
3a50d14d 1675 "\n start_pfn: %lu",
c73322d0 1676 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
3a50d14d 1677 zone->zone_start_pfn);
467c996c
MG
1678 seq_putc(m, '\n');
1679}
1680
1681/*
b2bd8598
DR
1682 * Output information about zones in @pgdat. All zones are printed regardless
1683 * of whether they are populated or not: lowmem_reserve_ratio operates on the
1684 * set of all zones and userspace would not be aware of such zones if they are
1685 * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
467c996c
MG
1686 */
1687static int zoneinfo_show(struct seq_file *m, void *arg)
1688{
1689 pg_data_t *pgdat = (pg_data_t *)arg;
727c080f 1690 walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print);
f6ac2354
CL
1691 return 0;
1692}
1693
5c9fe628 1694static const struct seq_operations zoneinfo_op = {
f6ac2354
CL
1695 .start = frag_start, /* iterate over all zones. The same as in
1696 * fragmentation. */
1697 .next = frag_next,
1698 .stop = frag_stop,
1699 .show = zoneinfo_show,
1700};
1701
9d7ea9a2 1702#define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \
f19298b9 1703 NR_VM_NUMA_EVENT_ITEMS + \
9d7ea9a2
KK
1704 NR_VM_NODE_STAT_ITEMS + \
1705 NR_VM_WRITEBACK_STAT_ITEMS + \
1706 (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \
1707 NR_VM_EVENT_ITEMS : 0))
79da826a 1708
f6ac2354
CL
1709static void *vmstat_start(struct seq_file *m, loff_t *pos)
1710{
2244b95a 1711 unsigned long *v;
9d7ea9a2 1712 int i;
f6ac2354 1713
9d7ea9a2 1714 if (*pos >= NR_VMSTAT_ITEMS)
f6ac2354 1715 return NULL;
79da826a 1716
9d7ea9a2 1717 BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
f19298b9 1718 fold_vm_numa_events();
9d7ea9a2 1719 v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
2244b95a
CL
1720 m->private = v;
1721 if (!v)
f6ac2354 1722 return ERR_PTR(-ENOMEM);
2244b95a 1723 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
c41f012a 1724 v[i] = global_zone_page_state(i);
79da826a
MR
1725 v += NR_VM_ZONE_STAT_ITEMS;
1726
3a321d2a 1727#ifdef CONFIG_NUMA
f19298b9
MG
1728 for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
1729 v[i] = global_numa_event_state(i);
1730 v += NR_VM_NUMA_EVENT_ITEMS;
3a321d2a
KW
1731#endif
1732
69473e5d 1733 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
ea426c2a 1734 v[i] = global_node_page_state_pages(i);
69473e5d
MS
1735 if (vmstat_item_print_in_thp(i))
1736 v[i] /= HPAGE_PMD_NR;
1737 }
75ef7184
MG
1738 v += NR_VM_NODE_STAT_ITEMS;
1739
79da826a
MR
1740 global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
1741 v + NR_DIRTY_THRESHOLD);
1742 v += NR_VM_WRITEBACK_STAT_ITEMS;
1743
f8891e5e 1744#ifdef CONFIG_VM_EVENT_COUNTERS
79da826a
MR
1745 all_vm_events(v);
1746 v[PGPGIN] /= 2; /* sectors -> kbytes */
1747 v[PGPGOUT] /= 2;
f8891e5e 1748#endif
ff8b16d7 1749 return (unsigned long *)m->private + *pos;
f6ac2354
CL
1750}
1751
1752static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
1753{
1754 (*pos)++;
9d7ea9a2 1755 if (*pos >= NR_VMSTAT_ITEMS)
f6ac2354
CL
1756 return NULL;
1757 return (unsigned long *)m->private + *pos;
1758}
1759
1760static int vmstat_show(struct seq_file *m, void *arg)
1761{
1762 unsigned long *l = arg;
1763 unsigned long off = l - (unsigned long *)m->private;
68ba0326
AD
1764
1765 seq_puts(m, vmstat_text[off]);
75ba1d07 1766 seq_put_decimal_ull(m, " ", *l);
68ba0326 1767 seq_putc(m, '\n');
8d92890b
N
1768
1769 if (off == NR_VMSTAT_ITEMS - 1) {
1770 /*
1771 * We've come to the end - add any deprecated counters to avoid
1772 * breaking userspace which might depend on them being present.
1773 */
1774 seq_puts(m, "nr_unstable 0\n");
1775 }
f6ac2354
CL
1776 return 0;
1777}
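
/*
 * Example /proc/vmstat output (values are illustrative only): each line is
 * "<name> <value>", e.g.
 *
 *      nr_free_pages 341702
 *      nr_zone_inactive_anon 40912
 *      ...
 *      pgpgin 2411930
 *      ...
 *      nr_unstable 0
 *
 * with the deprecated nr_unstable line appended last, as handled above.
 */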
1778
1779static void vmstat_stop(struct seq_file *m, void *arg)
1780{
1781 kfree(m->private);
1782 m->private = NULL;
1783}
1784
b6aa44ab 1785static const struct seq_operations vmstat_op = {
f6ac2354
CL
1786 .start = vmstat_start,
1787 .next = vmstat_next,
1788 .stop = vmstat_stop,
1789 .show = vmstat_show,
1790};
f6ac2354
CL
1791#endif /* CONFIG_PROC_FS */
1792
df9ecaba 1793#ifdef CONFIG_SMP
d1187ed2 1794static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
77461ab3 1795int sysctl_stat_interval __read_mostly = HZ;
d1187ed2 1796
52b6f46b
HD
1797#ifdef CONFIG_PROC_FS
1798static void refresh_vm_stats(struct work_struct *work)
1799{
1800 refresh_cpu_vm_stats(true);
1801}
1802
1803int vmstat_refresh(struct ctl_table *table, int write,
32927393 1804 void *buffer, size_t *lenp, loff_t *ppos)
52b6f46b
HD
1805{
1806 long val;
1807 int err;
1808 int i;
1809
1810 /*
1811 * The regular update, every sysctl_stat_interval, may come later
1812 * than expected: leaving a significant amount in per_cpu buckets.
1813 * This is particularly misleading when checking a quantity of HUGE
1814 * pages, immediately after running a test. /proc/sys/vm/stat_refresh,
1815 * which can equally be echo'ed to or cat'ted from (by root),
1816 * can be used to update the stats just before reading them.
1817 *
c41f012a 1818 * Oh, and since global_zone_page_state() etc. are so careful to hide
52b6f46b
HD
 1819 * transiently negative values, warn here if any of
 1820 * the stats is negative, so we know to go looking for imbalance.
1821 */
1822 err = schedule_on_each_cpu(refresh_vm_stats);
1823 if (err)
1824 return err;
1825 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
75083aae
HD
1826 /*
1827 * Skip checking stats known to go negative occasionally.
1828 */
1829 switch (i) {
1830 case NR_ZONE_WRITE_PENDING:
1831 case NR_FREE_CMA_PAGES:
1832 continue;
1833 }
75ef7184 1834 val = atomic_long_read(&vm_zone_stat[i]);
52b6f46b 1835 if (val < 0) {
c822f622 1836 pr_warn("%s: %s %ld\n",
9d7ea9a2 1837 __func__, zone_stat_name(i), val);
52b6f46b
HD
1838 }
1839 }
76d8cc3c 1840 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
75083aae
HD
1841 /*
1842 * Skip checking stats known to go negative occasionally.
1843 */
1844 switch (i) {
1845 case NR_WRITEBACK:
1846 continue;
1847 }
76d8cc3c
HD
1848 val = atomic_long_read(&vm_node_stat[i]);
1849 if (val < 0) {
1850 pr_warn("%s: %s %ld\n",
1851 __func__, node_stat_name(i), val);
76d8cc3c
HD
1852 }
1853 }
52b6f46b
HD
1854 if (write)
1855 *ppos += *lenp;
1856 else
1857 *lenp = 0;
1858 return 0;
1859}
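As the comment in vmstat_refresh() notes, /proc/sys/vm/stat_refresh lets root fold the per-cpu deltas on demand just before sampling the counters. A hedged userspace sketch (illustrative only; needs root) that forces a refresh and then dumps /proc/vmstat:

/* stat_refresh.c - illustrative: refresh vmstat before reading it */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/sys/vm/stat_refresh", O_WRONLY);

	if (fd >= 0) {
		/* any write triggers the schedule_on_each_cpu() refresh above */
		if (write(fd, "1", 1) < 0)
			perror("stat_refresh write");
		close(fd);
	} else {
		perror("stat_refresh open");	/* not root, or CONFIG_PROC_FS off */
	}

	/* the global counters are now (nearly) exact */
	execlp("cat", "cat", "/proc/vmstat", (char *)NULL);
	return 1;
}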
1860#endif /* CONFIG_PROC_FS */
1861
d1187ed2
CL
1862static void vmstat_update(struct work_struct *w)
1863{
0eb77e98 1864 if (refresh_cpu_vm_stats(true)) {
7cc36bbd
CL
1865 /*
1866 * Counters were updated so we expect more updates
1867 * to occur in the future. Keep on running the
1868 * update worker thread.
1869 */
ce612879 1870 queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
f01f17d3
MH
1871 this_cpu_ptr(&vmstat_work),
1872 round_jiffies_relative(sysctl_stat_interval));
7cc36bbd
CL
1873 }
1874}
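vmstat_update() re-arms itself only while refresh_cpu_vm_stats() reports that diffs were folded; once the CPU goes quiet the work simply lapses, and the shepherd further down restarts it when needed. A minimal sketch of that self-arming pattern, with a made-up my_have_updates() predicate standing in for the refresh (hypothetical module code, not part of vmstat.c):

#include <linux/workqueue.h>
#include <linux/jiffies.h>
#include <linux/timer.h>

static bool my_have_updates(void)
{
	return false;		/* stand-in for "did the refresh fold any diffs?" */
}

static void my_update(struct work_struct *w);
static DECLARE_DELAYED_WORK(my_work, my_update);

static void my_update(struct work_struct *w)
{
	/* re-arm only while there is still churn, exactly like vmstat_update() */
	if (my_have_updates())
		schedule_delayed_work(&my_work, round_jiffies_relative(HZ));
	/* otherwise stay idle until an external shepherd queues us again */
}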
1875
7cc36bbd
CL
1881/*
1882 * Check if the diffs for a certain cpu indicate that
1883 * an update is needed.
1884 */
1885static bool need_update(int cpu)
1886{
2bbd00ae 1887 pg_data_t *last_pgdat = NULL;
7cc36bbd
CL
1888 struct zone *zone;
1889
1890 for_each_populated_zone(zone) {
28f836b6 1891 struct per_cpu_zonestat *pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
2bbd00ae 1892 struct per_cpu_nodestat *n;
28f836b6 1893
7cc36bbd
CL
1894 /*
1895 * The fast way of checking if there are any vmstat diffs.
7cc36bbd 1896 */
28f836b6
MG
1897 if (memchr_inv(pzstats->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS *
1898 sizeof(pzstats->vm_stat_diff[0])))
7cc36bbd 1899 return true;
f19298b9 1900
2bbd00ae
JW
1901 if (last_pgdat == zone->zone_pgdat)
1902 continue;
1903 last_pgdat = zone->zone_pgdat;
1904 n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu);
1905 if (memchr_inv(n->vm_node_stat_diff, 0, NR_VM_NODE_STAT_ITEMS *
1906 sizeof(n->vm_node_stat_diff[0])))
1907 return true;
7cc36bbd
CL
1908 }
1909 return false;
1910}
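The "fast way" above leans on memchr_inv(): it returns NULL only when every byte of the region equals the given value, so a single call checks a whole per-cpu diff array for "nothing to fold". A tiny sketch of the idiom (helper name made up):

#include <linux/string.h>
#include <linux/types.h>

/* true iff no per-cpu delta in the array still needs folding */
static bool diffs_all_zero(const s8 *diff, size_t nr_items)
{
	return !memchr_inv(diff, 0, nr_items * sizeof(*diff));
}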
1911
7b8da4c7
CL
1912/*
1913 * Switch off vmstat processing and then fold all the remaining differentials
1914 * until the diffs stay at zero. The function is used by NOHZ and can only be
1915 * invoked when tick processing is not active.
1916 */
f01f17d3
MH
1917void quiet_vmstat(void)
1918{
1919 if (system_state != SYSTEM_RUNNING)
1920 return;
1921
7b8da4c7 1922 if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
f01f17d3
MH
1923 return;
1924
1925 if (!need_update(smp_processor_id()))
1926 return;
1927
1928 /*
1929 * Just refresh counters and do not care about the pending delayed
 1930 * vmstat_update. It does not fire often enough to matter, and canceling
 1931 * it would be too expensive from this path.
 1932 * vmstat_shepherd will take care of that for us.
1933 */
1934 refresh_cpu_vm_stats(false);
1935}
1936
7cc36bbd
CL
1937/*
 1938 * Shepherd worker thread that checks the
 1939 * differentials of processors whose vmstat update
 1940 * workers have been disabled because of
 1941 * inactivity.
1942 */
1943static void vmstat_shepherd(struct work_struct *w);
1944
0eb77e98 1945static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
7cc36bbd
CL
1946
1947static void vmstat_shepherd(struct work_struct *w)
1948{
1949 int cpu;
1950
1951 get_online_cpus();
1952 /* Check processors whose vmstat worker threads have been disabled */
7b8da4c7 1953 for_each_online_cpu(cpu) {
f01f17d3 1954 struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
7cc36bbd 1955
7b8da4c7 1956 if (!delayed_work_pending(dw) && need_update(cpu))
ce612879 1957 queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
fbcc8183
JB
1958
1959 cond_resched();
f01f17d3 1960 }
7cc36bbd
CL
1961 put_online_cpus();
1962
1963 schedule_delayed_work(&shepherd,
98f4ebb2 1964 round_jiffies_relative(sysctl_stat_interval));
d1187ed2
CL
1965}
1966
7cc36bbd 1967static void __init start_shepherd_timer(void)
d1187ed2 1968{
7cc36bbd
CL
1969 int cpu;
1970
1971 for_each_possible_cpu(cpu)
ccde8bd4 1972 INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
7cc36bbd
CL
1973 vmstat_update);
1974
7cc36bbd
CL
1975 schedule_delayed_work(&shepherd,
1976 round_jiffies_relative(sysctl_stat_interval));
d1187ed2
CL
1977}
1978
03e86dba
TC
1979static void __init init_cpu_node_state(void)
1980{
4c501327 1981 int node;
03e86dba 1982
4c501327
SAS
1983 for_each_online_node(node) {
1984 if (cpumask_weight(cpumask_of_node(node)) > 0)
1985 node_set_state(node, N_CPU);
1986 }
03e86dba
TC
1987}
1988
5438da97
SAS
1989static int vmstat_cpu_online(unsigned int cpu)
1990{
1991 refresh_zone_stat_thresholds();
1992 node_set_state(cpu_to_node(cpu), N_CPU);
1993 return 0;
1994}
1995
1996static int vmstat_cpu_down_prep(unsigned int cpu)
1997{
1998 cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
1999 return 0;
2000}
2001
2002static int vmstat_cpu_dead(unsigned int cpu)
807a1bd2 2003{
4c501327 2004 const struct cpumask *node_cpus;
5438da97 2005 int node;
807a1bd2 2006
5438da97
SAS
2007 node = cpu_to_node(cpu);
2008
2009 refresh_zone_stat_thresholds();
4c501327
SAS
2010 node_cpus = cpumask_of_node(node);
2011 if (cpumask_weight(node_cpus) > 0)
5438da97 2012 return 0;
807a1bd2
TK
2013
2014 node_clear_state(node, N_CPU);
5438da97 2015 return 0;
807a1bd2
TK
2016}
2017
8f32f7e5 2018#endif
df9ecaba 2019
ce612879
MH
2020struct workqueue_struct *mm_percpu_wq;
2021
597b7305 2022void __init init_mm_internals(void)
df9ecaba 2023{
ce612879 2024 int ret __maybe_unused;
5438da97 2025
80d136e1 2026 mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0);
ce612879
MH
2027
2028#ifdef CONFIG_SMP
5438da97
SAS
2029 ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
2030 NULL, vmstat_cpu_dead);
2031 if (ret < 0)
2032 pr_err("vmstat: failed to register 'dead' hotplug state\n");
2033
2034 ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online",
2035 vmstat_cpu_online,
2036 vmstat_cpu_down_prep);
2037 if (ret < 0)
2038 pr_err("vmstat: failed to register 'online' hotplug state\n");
2039
2040 get_online_cpus();
03e86dba 2041 init_cpu_node_state();
5438da97 2042 put_online_cpus();
d1187ed2 2043
7cc36bbd 2044 start_shepherd_timer();
8f32f7e5
AD
2045#endif
2046#ifdef CONFIG_PROC_FS
fddda2b7 2047 proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
abaed011 2048 proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
fddda2b7
CH
2049 proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
2050 proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
8f32f7e5 2051#endif
df9ecaba 2052}
d7a5752c
MG
2053
2054#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
d7a5752c
MG
2055
2056/*
2057 * Return an index indicating how much of the available free memory is
2058 * unusable for an allocation of the requested size.
2059 */
2060static int unusable_free_index(unsigned int order,
2061 struct contig_page_info *info)
2062{
2063 /* No free memory is interpreted as all free memory is unusable */
2064 if (info->free_pages == 0)
2065 return 1000;
2066
2067 /*
 2068 * The index is conceptually a value between 0 and 1; return it
 2069 * scaled by 1000, i.e. to 3 decimal places.
2070 *
2071 * 0 => no fragmentation
2072 * 1 => high fragmentation
2073 */
2074 return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
2075
2076}
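A quick worked example of the formula: for an order-3 request with info->free_pages = 1000 and info->free_blocks_suitable = 100 (that is, 100 order-3 chunks, or 800 pages, could still serve the request), the index is (1000 - 800) * 1000 / 1000 = 200, which the caller prints as 0.200: a fifth of the free memory is unusable at that allocation size.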
2077
2078static void unusable_show_print(struct seq_file *m,
2079 pg_data_t *pgdat, struct zone *zone)
2080{
2081 unsigned int order;
2082 int index;
2083 struct contig_page_info info;
2084
2085 seq_printf(m, "Node %d, zone %8s ",
2086 pgdat->node_id,
2087 zone->name);
2088 for (order = 0; order < MAX_ORDER; ++order) {
2089 fill_contig_page_info(zone, order, &info);
2090 index = unusable_free_index(order, &info);
2091 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
2092 }
2093
2094 seq_putc(m, '\n');
2095}
2096
2097/*
2098 * Display unusable free space index
2099 *
2100 * The unusable free space index measures how much of the available free
2101 * memory cannot be used to satisfy an allocation of a given size and is a
 2102 * value between 0 and 1. The higher the value, the more of the free memory is
 2103 * unusable and, by implication, the worse the external fragmentation is. This
2104 * can be expressed as a percentage by multiplying by 100.
2105 */
2106static int unusable_show(struct seq_file *m, void *arg)
2107{
2108 pg_data_t *pgdat = (pg_data_t *)arg;
2109
2110 /* check memoryless node */
a47b53c5 2111 if (!node_state(pgdat->node_id, N_MEMORY))
d7a5752c
MG
2112 return 0;
2113
727c080f 2114 walk_zones_in_node(m, pgdat, true, false, unusable_show_print);
d7a5752c
MG
2115
2116 return 0;
2117}
2118
01a99560 2119static const struct seq_operations unusable_sops = {
d7a5752c
MG
2120 .start = frag_start,
2121 .next = frag_next,
2122 .stop = frag_stop,
2123 .show = unusable_show,
2124};
2125
01a99560 2126DEFINE_SEQ_ATTRIBUTE(unusable);
d7a5752c 2127
f1a5ab12
MG
2128static void extfrag_show_print(struct seq_file *m,
2129 pg_data_t *pgdat, struct zone *zone)
2130{
2131 unsigned int order;
2132 int index;
2133
2134 /* Alloc on stack as interrupts are disabled for zone walk */
2135 struct contig_page_info info;
2136
2137 seq_printf(m, "Node %d, zone %8s ",
2138 pgdat->node_id,
2139 zone->name);
2140 for (order = 0; order < MAX_ORDER; ++order) {
2141 fill_contig_page_info(zone, order, &info);
56de7263 2142 index = __fragmentation_index(order, &info);
f1a5ab12
MG
2143 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
2144 }
2145
2146 seq_putc(m, '\n');
2147}
2148
2149/*
 2150 * Display the fragmentation index for each order at which an allocation would fail
2151 */
2152static int extfrag_show(struct seq_file *m, void *arg)
2153{
2154 pg_data_t *pgdat = (pg_data_t *)arg;
2155
727c080f 2156 walk_zones_in_node(m, pgdat, true, false, extfrag_show_print);
f1a5ab12
MG
2157
2158 return 0;
2159}
2160
01a99560 2161static const struct seq_operations extfrag_sops = {
f1a5ab12
MG
2162 .start = frag_start,
2163 .next = frag_next,
2164 .stop = frag_stop,
2165 .show = extfrag_show,
2166};
2167
01a99560 2168DEFINE_SEQ_ATTRIBUTE(extfrag);
f1a5ab12 2169
d7a5752c
MG
2170static int __init extfrag_debug_init(void)
2171{
bde8bd8a
S
2172 struct dentry *extfrag_debug_root;
2173
d7a5752c 2174 extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
d7a5752c 2175
d9f7979c 2176 debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL,
01a99560 2177 &unusable_fops);
d7a5752c 2178
d9f7979c 2179 debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL,
01a99560 2180 &extfrag_fops);
f1a5ab12 2181
d7a5752c
MG
2182 return 0;
2183}
2184
2185module_init(extfrag_debug_init);
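Both files registered above emit one plain-text line per zone, so they can be read like any other debugfs file. A hedged userspace sketch (assumes debugfs is mounted at /sys/kernel/debug and the kernel has CONFIG_DEBUG_FS and CONFIG_COMPACTION enabled; run as root):

/* unusable_index.c - illustrative reader for the debugfs file above */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/extfrag/unusable_index", "r");
	char line[256];

	if (!f) {
		perror("unusable_index");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* e.g. "Node 0, zone   Normal 0.000 0.012 ..." */
	fclose(f);
	return 0;
}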
2186#endif