// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/vmstat.c
 *
 *  Manages VM statistics
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  zoned VM statistics
 *  Copyright (C) 2006 Silicon Graphics, Inc.,
 *		Christoph Lameter <christoph@lameter.com>
 *  Copyright (C) 2008-2014 Christoph Lameter
 */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/vmstat.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/sched.h>
#include <linux/math64.h>
#include <linux/writeback.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
#include <linux/page_ext.h>
#include <linux/page_owner.h>

#include "internal.h"

#define NUMA_STATS_THRESHOLD (U16_MAX - 2)

#ifdef CONFIG_NUMA
int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;

/* zero numa counters within a zone */
static void zero_zone_numa_counters(struct zone *zone)
{
	int item, cpu;

	for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) {
		atomic_long_set(&zone->vm_numa_stat[item], 0);
		for_each_online_cpu(cpu)
			per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]
						= 0;
	}
}

/* zero numa counters of all the populated zones */
static void zero_zones_numa_counters(void)
{
	struct zone *zone;

	for_each_populated_zone(zone)
		zero_zone_numa_counters(zone);
}

/* zero global numa counters */
static void zero_global_numa_counters(void)
{
	int item;

	for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++)
		atomic_long_set(&vm_numa_stat[item], 0);
}

static void invalid_numa_statistics(void)
{
	zero_zones_numa_counters();
	zero_global_numa_counters();
}

static DEFINE_MUTEX(vm_numa_stat_lock);

int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
		void *buffer, size_t *length, loff_t *ppos)
{
	int ret, oldval;

	mutex_lock(&vm_numa_stat_lock);
	if (write)
		oldval = sysctl_vm_numa_stat;
	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (ret || !write)
		goto out;

	if (oldval == sysctl_vm_numa_stat)
		goto out;
	else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
		static_branch_enable(&vm_numa_stat_key);
		pr_info("enable numa statistics\n");
	} else {
		static_branch_disable(&vm_numa_stat_key);
		invalid_numa_statistics();
		pr_info("disable numa statistics, and clear numa counters\n");
	}

out:
	mutex_unlock(&vm_numa_stat_lock);
	return ret;
}
#endif

#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);

static void sum_vm_events(unsigned long *ret)
{
	int cpu;
	int i;

	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));

	for_each_online_cpu(cpu) {
		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);

		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
			ret[i] += this->event[i];
	}
}

/*
 * Accumulate the vm event counters across all CPUs.
 * The result is unavoidably approximate - it can change
 * during and after execution of this function.
 */
void all_vm_events(unsigned long *ret)
{
	get_online_cpus();
	sum_vm_events(ret);
	put_online_cpus();
}
EXPORT_SYMBOL_GPL(all_vm_events);

/*
 * Fold the foreign cpu events into our own.
 *
 * This is adding to the events on one processor
 * but keeps the global counts constant.
 */
void vm_events_fold_cpu(int cpu)
{
	struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
	int i;

	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
		count_vm_events(i, fold_state->event[i]);
		fold_state->event[i] = 0;
	}
}

#endif /* CONFIG_VM_EVENT_COUNTERS */

/*
 * Manage combined zone based / global counters
 *
 * vm_stat contains the global counters
 */
atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS] __cacheline_aligned_in_smp;
atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
EXPORT_SYMBOL(vm_zone_stat);
EXPORT_SYMBOL(vm_numa_stat);
EXPORT_SYMBOL(vm_node_stat);

#ifdef CONFIG_SMP

int calculate_pressure_threshold(struct zone *zone)
{
	int threshold;
	int watermark_distance;

	/*
	 * As vmstats are not up to date, there is drift between the estimated
	 * and real values. For high thresholds and a high number of CPUs, it
	 * is possible for the min watermark to be breached while the estimated
	 * value looks fine. The pressure threshold is a reduced value such
	 * that even the maximum amount of drift will not accidentally breach
	 * the min watermark
	 */
	watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
	threshold = max(1, (int)(watermark_distance / num_online_cpus()));

	/*
	 * Maximum threshold is 125
	 */
	threshold = min(125, threshold);

	return threshold;
}
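
/*
 * Illustrative example: with a 256 page gap between the low and min
 * watermarks and 64 online CPUs, the pressure threshold drops to
 * max(1, 256 / 64) = 4, so the per-cpu drift while memory is tight
 * stays within the watermark gap.
 */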

int calculate_normal_threshold(struct zone *zone)
{
	int threshold;
	int mem;	/* memory in 128 MB units */

	/*
	 * The threshold scales with the number of processors and the amount
	 * of memory per zone. More memory means that we can defer updates for
	 * longer, more processors could lead to more contention.
	 * fls() is used to have a cheap way of logarithmic scaling.
	 *
	 * Some sample thresholds:
	 *
	 * Threshold	Processors	(fls)	Zonesize	fls(mem+1)
	 * ------------------------------------------------------------------
	 * 8		1		1	0.9-1 GB	4
	 * 16		2		2	0.9-1 GB	4
	 * 20		2		2	1-2 GB		5
	 * 24		2		2	2-4 GB		6
	 * 28		2		2	4-8 GB		7
	 * 32		2		2	8-16 GB		8
	 * 4		2		2	<128M		1
	 * 30		4		3	2-4 GB		5
	 * 48		4		3	8-16 GB		8
	 * 32		8		4	1-2 GB		4
	 * 32		8		4	0.9-1GB		4
	 * 10		16		5	<128M		1
	 * 40		16		5	900M		4
	 * 70		64		7	2-4 GB		5
	 * 84		64		7	4-8 GB		6
	 * 108		512		9	4-8 GB		6
	 * 125		1024		10	8-16 GB		8
	 * 125		1024		10	16-32 GB	9
	 */

	mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT);

	threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));

	/*
	 * Maximum threshold is 125
	 */
	threshold = min(125, threshold);

	return threshold;
}
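
/*
 * Worked example of the scaling above: a zone of just under 1GB has
 * mem = 7 (128MB units), so on a 2-CPU machine the threshold is
 * 2 * fls(2) * (1 + fls(7)) = 2 * 2 * 4 = 16, matching the sample
 * table in calculate_normal_threshold().
 */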

/*
 * Refresh the thresholds for each zone.
 */
void refresh_zone_stat_thresholds(void)
{
	struct pglist_data *pgdat;
	struct zone *zone;
	int cpu;
	int threshold;

	/* Zero current pgdat thresholds */
	for_each_online_pgdat(pgdat) {
		for_each_online_cpu(cpu) {
			per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
		}
	}

	for_each_populated_zone(zone) {
		struct pglist_data *pgdat = zone->zone_pgdat;
		unsigned long max_drift, tolerate_drift;

		threshold = calculate_normal_threshold(zone);

		for_each_online_cpu(cpu) {
			int pgdat_threshold;

			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
							= threshold;

			/* Base nodestat threshold on the largest populated zone. */
			pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
			per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
							= max(threshold, pgdat_threshold);
		}

		/*
		 * Only set percpu_drift_mark if there is a danger that
		 * NR_FREE_PAGES reports the low watermark is ok when in fact
		 * the min watermark could be breached by an allocation
		 */
		tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
		max_drift = num_online_cpus() * threshold;
		if (max_drift > tolerate_drift)
			zone->percpu_drift_mark = high_wmark_pages(zone) +
					max_drift;
	}
}
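
/*
 * Example of the drift window: 64 CPUs with a threshold of 125 can hide
 * up to 64 * 125 = 8000 pages of NR_FREE_PAGES error. percpu_drift_mark
 * is what lets watermark checks fall back to the more expensive exact
 * snapshot once free memory gets close to the watermarks.
 */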

void set_pgdat_percpu_threshold(pg_data_t *pgdat,
				int (*calculate_pressure)(struct zone *))
{
	struct zone *zone;
	int cpu;
	int threshold;
	int i;

	for (i = 0; i < pgdat->nr_zones; i++) {
		zone = &pgdat->node_zones[i];
		if (!zone->percpu_drift_mark)
			continue;

		threshold = (*calculate_pressure)(zone);
		for_each_online_cpu(cpu)
			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
							= threshold;
	}
}

/*
 * For use when we know that interrupts are disabled,
 * or when we know that preemption is disabled and that
 * particular counter cannot be updated from interrupt context.
 */
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
			   long delta)
{
	struct per_cpu_pageset __percpu *pcp = zone->pageset;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	long x;
	long t;

	x = delta + __this_cpu_read(*p);

	t = __this_cpu_read(pcp->stat_threshold);

	if (unlikely(abs(x) > t)) {
		zone_page_state_add(x, zone, item);
		x = 0;
	}
	__this_cpu_write(*p, x);
}
EXPORT_SYMBOL(__mod_zone_page_state);
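
/*
 * Illustrative caller pattern (not taken verbatim from any one site):
 * the page allocator adjusts NR_FREE_PAGES with the zone lock held and
 * interrupts disabled, roughly
 *
 *	spin_lock_irqsave(&zone->lock, flags);
 *	__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
 *	spin_unlock_irqrestore(&zone->lock, flags);
 */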

void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
				long delta)
{
	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
	s8 __percpu *p = pcp->vm_node_stat_diff + item;
	long x;
	long t;

	if (vmstat_item_in_bytes(item)) {
		/*
		 * Only cgroups use subpage accounting right now; at
		 * the global level, these items still change in
		 * multiples of whole pages. Store them as pages
		 * internally to keep the per-cpu counters compact.
		 */
		VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
		delta >>= PAGE_SHIFT;
	}

	x = delta + __this_cpu_read(*p);

	t = __this_cpu_read(pcp->stat_threshold);

	if (unlikely(abs(x) > t)) {
		node_page_state_add(x, pgdat, item);
		x = 0;
	}
	__this_cpu_write(*p, x);
}
EXPORT_SYMBOL(__mod_node_page_state);

/*
 * Optimized increment and decrement functions.
 *
 * These are only for a single page and therefore can take a struct page *
 * argument instead of struct zone *. This allows the inclusion of the code
 * generated for page_zone(page) into the optimized functions.
 *
 * No overflow check is necessary and therefore the differential can be
 * incremented or decremented in place which may allow the compilers to
 * generate better code.
 * The increment or decrement is known and therefore one boundary check can
 * be omitted.
 *
 * NOTE: These functions are very performance sensitive. Change only
 * with care.
 *
 * Some processors have inc/dec instructions that are atomic vs an interrupt.
 * However, the code must first determine the differential location in a zone
 * based on the processor number and then inc/dec the counter. There is no
 * guarantee without disabling preemption that the processor will not change
 * in between and therefore the atomicity vs. interrupt cannot be exploited
 * in a useful way here.
 */
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_pageset __percpu *pcp = zone->pageset;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	s8 v, t;

	v = __this_cpu_inc_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v > t)) {
		s8 overstep = t >> 1;

		zone_page_state_add(v + overstep, zone, item);
		__this_cpu_write(*p, -overstep);
	}
}
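
/*
 * Numeric example of the overstep above: with a threshold of 32, the
 * per-cpu diff is allowed to reach 33 before 33 + 16 pages are folded
 * into the zone counter and the diff restarts at -16, so the next fold
 * is roughly 49 increments away instead of 33.
 */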

void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
	s8 __percpu *p = pcp->vm_node_stat_diff + item;
	s8 v, t;

	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));

	v = __this_cpu_inc_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v > t)) {
		s8 overstep = t >> 1;

		node_page_state_add(v + overstep, pgdat, item);
		__this_cpu_write(*p, -overstep);
	}
}

void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__inc_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__inc_zone_page_state);

void __inc_node_page_state(struct page *page, enum node_stat_item item)
{
	__inc_node_state(page_pgdat(page), item);
}
EXPORT_SYMBOL(__inc_node_page_state);

void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_pageset __percpu *pcp = zone->pageset;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	s8 v, t;

	v = __this_cpu_dec_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v < - t)) {
		s8 overstep = t >> 1;

		zone_page_state_add(v - overstep, zone, item);
		__this_cpu_write(*p, overstep);
	}
}

void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
	s8 __percpu *p = pcp->vm_node_stat_diff + item;
	s8 v, t;

	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));

	v = __this_cpu_dec_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v < - t)) {
		s8 overstep = t >> 1;

		node_page_state_add(v - overstep, pgdat, item);
		__this_cpu_write(*p, overstep);
	}
}

void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__dec_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__dec_zone_page_state);

void __dec_node_page_state(struct page *page, enum node_stat_item item)
{
	__dec_node_state(page_pgdat(page), item);
}
EXPORT_SYMBOL(__dec_node_page_state);

#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
/*
 * If we have cmpxchg_local support then we do not need to incur the overhead
 * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
 *
 * mod_state() modifies the zone counter state through atomic per cpu
 * operations.
 *
 * Overstep mode specifies how overstep should be handled:
 *        0       No overstepping
 *        1       Overstepping half of threshold
 *        -1      Overstepping minus half of threshold
 */
static inline void mod_zone_state(struct zone *zone,
       enum zone_stat_item item, long delta, int overstep_mode)
{
	struct per_cpu_pageset __percpu *pcp = zone->pageset;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	long o, n, t, z;

	do {
		z = 0;  /* overflow to zone counters */

		/*
		 * The fetching of the stat_threshold is racy. We may apply
		 * a counter threshold to the wrong cpu if we get
		 * rescheduled while executing here. However, the next
		 * counter update will apply the threshold again and
		 * therefore bring the counter under the threshold again.
		 *
		 * Most of the time the thresholds are the same anyways
		 * for all cpus in a zone.
		 */
		t = this_cpu_read(pcp->stat_threshold);

		o = this_cpu_read(*p);
		n = delta + o;

		if (abs(n) > t) {
			int os = overstep_mode * (t >> 1);

			/* Overflow must be added to zone counters */
			z = n + os;
			n = -os;
		}
	} while (this_cpu_cmpxchg(*p, o, n) != o);

	if (z)
		zone_page_state_add(z, zone, item);
}
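
/*
 * E.g. a call such as mod_zone_page_state(zone, NR_MLOCK, 1) (the
 * wrapper below) ends up here with overstep_mode == 0: the delta is
 * folded into the per-cpu diff with a single this_cpu_cmpxchg() and the
 * global atomic is only touched once the threshold is crossed, without
 * ever disabling interrupts.
 */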

void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
			 long delta)
{
	mod_zone_state(zone, item, delta, 0);
}
EXPORT_SYMBOL(mod_zone_page_state);

void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	mod_zone_state(page_zone(page), item, 1, 1);
}
EXPORT_SYMBOL(inc_zone_page_state);

void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	mod_zone_state(page_zone(page), item, -1, -1);
}
EXPORT_SYMBOL(dec_zone_page_state);

static inline void mod_node_state(struct pglist_data *pgdat,
       enum node_stat_item item, int delta, int overstep_mode)
{
	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
	s8 __percpu *p = pcp->vm_node_stat_diff + item;
	long o, n, t, z;

	if (vmstat_item_in_bytes(item)) {
		/*
		 * Only cgroups use subpage accounting right now; at
		 * the global level, these items still change in
		 * multiples of whole pages. Store them as pages
		 * internally to keep the per-cpu counters compact.
		 */
		VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
		delta >>= PAGE_SHIFT;
	}

	do {
		z = 0;  /* overflow to node counters */

		/*
		 * The fetching of the stat_threshold is racy. We may apply
		 * a counter threshold to the wrong cpu if we get
		 * rescheduled while executing here. However, the next
		 * counter update will apply the threshold again and
		 * therefore bring the counter under the threshold again.
		 *
		 * Most of the time the thresholds are the same anyways
		 * for all cpus in a node.
		 */
		t = this_cpu_read(pcp->stat_threshold);

		o = this_cpu_read(*p);
		n = delta + o;

		if (abs(n) > t) {
			int os = overstep_mode * (t >> 1);

			/* Overflow must be added to node counters */
			z = n + os;
			n = -os;
		}
	} while (this_cpu_cmpxchg(*p, o, n) != o);

	if (z)
		node_page_state_add(z, pgdat, item);
}

void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
					long delta)
{
	mod_node_state(pgdat, item, delta, 0);
}
EXPORT_SYMBOL(mod_node_page_state);

void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
	mod_node_state(pgdat, item, 1, 1);
}

void inc_node_page_state(struct page *page, enum node_stat_item item)
{
	mod_node_state(page_pgdat(page), item, 1, 1);
}
EXPORT_SYMBOL(inc_node_page_state);

void dec_node_page_state(struct page *page, enum node_stat_item item)
{
	mod_node_state(page_pgdat(page), item, -1, -1);
}
EXPORT_SYMBOL(dec_node_page_state);
#else
/*
 * Use interrupt disable to serialize counter updates
 */
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
			 long delta)
{
	unsigned long flags;

	local_irq_save(flags);
	__mod_zone_page_state(zone, item, delta);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_zone_page_state);

void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;
	struct zone *zone;

	zone = page_zone(page);
	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_zone_page_state);

void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__dec_zone_page_state(page, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);

void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__inc_node_state(pgdat, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_node_state);

void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
					long delta)
{
	unsigned long flags;

	local_irq_save(flags);
	__mod_node_page_state(pgdat, item, delta);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_node_page_state);

void inc_node_page_state(struct page *page, enum node_stat_item item)
{
	unsigned long flags;
	struct pglist_data *pgdat;

	pgdat = page_pgdat(page);
	local_irq_save(flags);
	__inc_node_state(pgdat, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_node_page_state);

void dec_node_page_state(struct page *page, enum node_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__dec_node_page_state(page, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_node_page_state);
#endif

/*
 * Fold a differential into the global counters.
 * Returns the number of counters updated.
 */
#ifdef CONFIG_NUMA
static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff)
{
	int i;
	int changes = 0;

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		if (zone_diff[i]) {
			atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
			changes++;
	}

	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
		if (numa_diff[i]) {
			atomic_long_add(numa_diff[i], &vm_numa_stat[i]);
			changes++;
	}

	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
		if (node_diff[i]) {
			atomic_long_add(node_diff[i], &vm_node_stat[i]);
			changes++;
	}
	return changes;
}
#else
static int fold_diff(int *zone_diff, int *node_diff)
{
	int i;
	int changes = 0;

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		if (zone_diff[i]) {
			atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
			changes++;
	}

	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
		if (node_diff[i]) {
			atomic_long_add(node_diff[i], &vm_node_stat[i]);
			changes++;
	}
	return changes;
}
#endif /* CONFIG_NUMA */

/*
 * Update the zone counters for the current cpu.
 *
 * Note that refresh_cpu_vm_stats strives to only access
 * node local memory. The per cpu pagesets on remote zones are placed
 * in the memory local to the processor using that pageset. So the
 * loop over all zones will access a series of cachelines local to
 * the processor.
 *
 * The call to zone_page_state_add updates the cachelines with the
 * statistics in the remote zone struct as well as the global cachelines
 * with the global counters. These could cause remote node cache line
 * bouncing and will have to be only done when necessary.
 *
 * The function returns the number of global counters updated.
 */
static int refresh_cpu_vm_stats(bool do_pagesets)
{
	struct pglist_data *pgdat;
	struct zone *zone;
	int i;
	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
#ifdef CONFIG_NUMA
	int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
#endif
	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
	int changes = 0;

	for_each_populated_zone(zone) {
		struct per_cpu_pageset __percpu *p = zone->pageset;

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
			int v;

			v = this_cpu_xchg(p->vm_stat_diff[i], 0);
			if (v) {

				atomic_long_add(v, &zone->vm_stat[i]);
				global_zone_diff[i] += v;
#ifdef CONFIG_NUMA
				/* 3 seconds idle till flush */
				__this_cpu_write(p->expire, 3);
#endif
			}
		}
#ifdef CONFIG_NUMA
		for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
			int v;

			v = this_cpu_xchg(p->vm_numa_stat_diff[i], 0);
			if (v) {

				atomic_long_add(v, &zone->vm_numa_stat[i]);
				global_numa_diff[i] += v;
				__this_cpu_write(p->expire, 3);
			}
		}

		if (do_pagesets) {
			cond_resched();
			/*
			 * Deal with draining the remote pageset of this
			 * processor
			 *
			 * Check if there are pages remaining in this pageset
			 * if not then there is nothing to expire.
			 */
			if (!__this_cpu_read(p->expire) ||
			       !__this_cpu_read(p->pcp.count))
				continue;

			/*
			 * We never drain zones local to this processor.
			 */
			if (zone_to_nid(zone) == numa_node_id()) {
				__this_cpu_write(p->expire, 0);
				continue;
			}

			if (__this_cpu_dec_return(p->expire))
				continue;

			if (__this_cpu_read(p->pcp.count)) {
				drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
				changes++;
			}
		}
#endif
	}

	for_each_online_pgdat(pgdat) {
		struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;

		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
			int v;

			v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
			if (v) {
				atomic_long_add(v, &pgdat->vm_stat[i]);
				global_node_diff[i] += v;
			}
		}
	}

#ifdef CONFIG_NUMA
	changes += fold_diff(global_zone_diff, global_numa_diff,
			     global_node_diff);
#else
	changes += fold_diff(global_zone_diff, global_node_diff);
#endif
	return changes;
}
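
/*
 * The returned count of updated global counters is what lets
 * vmstat_update() further down decide whether to keep the deferred
 * work running: a CPU whose diffs all stayed at zero simply does not
 * requeue itself.
 */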

/*
 * Fold the data for an offline cpu into the global array.
 * There cannot be any access by the offline cpu and therefore
 * synchronization is simplified.
 */
void cpu_vm_stats_fold(int cpu)
{
	struct pglist_data *pgdat;
	struct zone *zone;
	int i;
	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
#ifdef CONFIG_NUMA
	int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
#endif
	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };

	for_each_populated_zone(zone) {
		struct per_cpu_pageset *p;

		p = per_cpu_ptr(zone->pageset, cpu);

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
			if (p->vm_stat_diff[i]) {
				int v;

				v = p->vm_stat_diff[i];
				p->vm_stat_diff[i] = 0;
				atomic_long_add(v, &zone->vm_stat[i]);
				global_zone_diff[i] += v;
			}

#ifdef CONFIG_NUMA
		for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
			if (p->vm_numa_stat_diff[i]) {
				int v;

				v = p->vm_numa_stat_diff[i];
				p->vm_numa_stat_diff[i] = 0;
				atomic_long_add(v, &zone->vm_numa_stat[i]);
				global_numa_diff[i] += v;
			}
#endif
	}

	for_each_online_pgdat(pgdat) {
		struct per_cpu_nodestat *p;

		p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);

		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
			if (p->vm_node_stat_diff[i]) {
				int v;

				v = p->vm_node_stat_diff[i];
				p->vm_node_stat_diff[i] = 0;
				atomic_long_add(v, &pgdat->vm_stat[i]);
				global_node_diff[i] += v;
			}
	}

#ifdef CONFIG_NUMA
	fold_diff(global_zone_diff, global_numa_diff, global_node_diff);
#else
	fold_diff(global_zone_diff, global_node_diff);
#endif
}

/*
 * This is only called if !populated_zone(zone), which implies no other users of
 * pset->vm_stat_diff[] exist.
 */
void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
{
	int i;

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		if (pset->vm_stat_diff[i]) {
			int v = pset->vm_stat_diff[i];
			pset->vm_stat_diff[i] = 0;
			atomic_long_add(v, &zone->vm_stat[i]);
			atomic_long_add(v, &vm_zone_stat[i]);
		}

#ifdef CONFIG_NUMA
	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
		if (pset->vm_numa_stat_diff[i]) {
			int v = pset->vm_numa_stat_diff[i];

			pset->vm_numa_stat_diff[i] = 0;
			atomic_long_add(v, &zone->vm_numa_stat[i]);
			atomic_long_add(v, &vm_numa_stat[i]);
		}
#endif
}
#endif

#ifdef CONFIG_NUMA
void __inc_numa_state(struct zone *zone,
				 enum numa_stat_item item)
{
	struct per_cpu_pageset __percpu *pcp = zone->pageset;
	u16 __percpu *p = pcp->vm_numa_stat_diff + item;
	u16 v;

	v = __this_cpu_inc_return(*p);

	if (unlikely(v > NUMA_STATS_THRESHOLD)) {
		zone_numa_state_add(v, zone, item);
		__this_cpu_write(*p, 0);
	}
}

/*
 * Determine the per node value of a stat item. This function
 * is called frequently in a NUMA machine, so try to be as
 * frugal as possible.
 */
unsigned long sum_zone_node_page_state(int node,
				 enum zone_stat_item item)
{
	struct zone *zones = NODE_DATA(node)->node_zones;
	int i;
	unsigned long count = 0;

	for (i = 0; i < MAX_NR_ZONES; i++)
		count += zone_page_state(zones + i, item);

	return count;
}

/*
 * Determine the per node value of a numa stat item. To avoid deviation,
 * the per cpu stat number in vm_numa_stat_diff[] is also included.
 */
unsigned long sum_zone_numa_state(int node,
				 enum numa_stat_item item)
{
	struct zone *zones = NODE_DATA(node)->node_zones;
	int i;
	unsigned long count = 0;

	for (i = 0; i < MAX_NR_ZONES; i++)
		count += zone_numa_state_snapshot(zones + i, item);

	return count;
}

/*
 * Determine the per node value of a stat item.
 */
unsigned long node_page_state_pages(struct pglist_data *pgdat,
				    enum node_stat_item item)
{
	long x = atomic_long_read(&pgdat->vm_stat[item]);
#ifdef CONFIG_SMP
	if (x < 0)
		x = 0;
#endif
	return x;
}

unsigned long node_page_state(struct pglist_data *pgdat,
			      enum node_stat_item item)
{
	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));

	return node_page_state_pages(pgdat, item);
}
#endif

#ifdef CONFIG_COMPACTION

struct contig_page_info {
	unsigned long free_pages;
	unsigned long free_blocks_total;
	unsigned long free_blocks_suitable;
};

/*
 * Calculate the number of free pages in a zone, how many contiguous
 * pages are free and how many are large enough to satisfy an allocation of
 * the target size. Note that this function makes no attempt to estimate
 * how many suitable free blocks there *might* be if MOVABLE pages were
 * migrated. Calculating that is possible, but expensive and can be
 * figured out from userspace
 */
static void fill_contig_page_info(struct zone *zone,
				unsigned int suitable_order,
				struct contig_page_info *info)
{
	unsigned int order;

	info->free_pages = 0;
	info->free_blocks_total = 0;
	info->free_blocks_suitable = 0;

	for (order = 0; order < MAX_ORDER; order++) {
		unsigned long blocks;

		/* Count number of free blocks */
		blocks = zone->free_area[order].nr_free;
		info->free_blocks_total += blocks;

		/* Count free base pages */
		info->free_pages += blocks << order;

		/* Count the suitable free blocks */
		if (order >= suitable_order)
			info->free_blocks_suitable += blocks <<
						(order - suitable_order);
	}
}

/*
 * A fragmentation index only makes sense if an allocation of a requested
 * size would fail. If that is true, the fragmentation index indicates
 * whether external fragmentation or a lack of memory was the problem.
 * The value can be used to determine if page reclaim or compaction
 * should be used
 */
static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
{
	unsigned long requested = 1UL << order;

	if (WARN_ON_ONCE(order >= MAX_ORDER))
		return 0;

	if (!info->free_blocks_total)
		return 0;

	/* Fragmentation index only makes sense when a request would fail */
	if (info->free_blocks_suitable)
		return -1000;

	/*
	 * Index is between 0 and 1 so return within 3 decimal places
	 *
	 * 0 => allocation would fail due to lack of memory
	 * 1 => allocation would fail due to fragmentation
	 */
	return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
}
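
/*
 * Worked example: a zone holding 16 free order-0 pages and no order-4
 * block gives, for an order-4 request (requested = 16),
 * 1000 - (1000 + 16 * 1000 / 16) / 16 = 1000 - 125 = 875, i.e. an index
 * of 0.875, pointing at fragmentation rather than lack of memory.
 */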

/*
 * Calculates external fragmentation within a zone wrt the given order.
 * It is defined as the percentage of pages found in blocks of size
 * less than 1 << order. It returns values in range [0, 100].
 */
unsigned int extfrag_for_order(struct zone *zone, unsigned int order)
{
	struct contig_page_info info;

	fill_contig_page_info(zone, order, &info);
	if (info.free_pages == 0)
		return 0;

	return div_u64((info.free_pages -
			(info.free_blocks_suitable << order)) * 100,
			info.free_pages);
}
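
/*
 * In the same scenario as above (16 free order-0 pages, order-4
 * request), extfrag_for_order() reports (16 - 0) * 100 / 16 = 100%,
 * since none of the free memory sits in blocks of the requested size.
 */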

/* Same as __fragmentation index but allocs contig_page_info on stack */
int fragmentation_index(struct zone *zone, unsigned int order)
{
	struct contig_page_info info;

	fill_contig_page_info(zone, order, &info);
	return __fragmentation_index(order, &info);
}
#endif

#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \
    defined(CONFIG_NUMA) || defined(CONFIG_MEMCG)
#ifdef CONFIG_ZONE_DMA
#define TEXT_FOR_DMA(xx) xx "_dma",
#else
#define TEXT_FOR_DMA(xx)
#endif

#ifdef CONFIG_ZONE_DMA32
#define TEXT_FOR_DMA32(xx) xx "_dma32",
#else
#define TEXT_FOR_DMA32(xx)
#endif

#ifdef CONFIG_HIGHMEM
#define TEXT_FOR_HIGHMEM(xx) xx "_high",
#else
#define TEXT_FOR_HIGHMEM(xx)
#endif

#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
					TEXT_FOR_HIGHMEM(xx) xx "_movable",

const char * const vmstat_text[] = {
	/* enum zone_stat_item counters */
	"nr_free_pages",
	"nr_zone_inactive_anon",
	"nr_zone_active_anon",
	"nr_zone_inactive_file",
	"nr_zone_active_file",
	"nr_zone_unevictable",
	"nr_zone_write_pending",
	"nr_mlock",
	"nr_bounce",
#if IS_ENABLED(CONFIG_ZSMALLOC)
	"nr_zspages",
#endif
	"nr_free_cma",

	/* enum numa_stat_item counters */
#ifdef CONFIG_NUMA
	"numa_hit",
	"numa_miss",
	"numa_foreign",
	"numa_interleave",
	"numa_local",
	"numa_other",
#endif

	/* enum node_stat_item counters */
	"nr_inactive_anon",
	"nr_active_anon",
	"nr_inactive_file",
	"nr_active_file",
	"nr_unevictable",
	"nr_slab_reclaimable",
	"nr_slab_unreclaimable",
	"nr_isolated_anon",
	"nr_isolated_file",
	"workingset_nodes",
	"workingset_refault_anon",
	"workingset_refault_file",
	"workingset_activate_anon",
	"workingset_activate_file",
	"workingset_restore_anon",
	"workingset_restore_file",
	"workingset_nodereclaim",
	"nr_anon_pages",
	"nr_mapped",
	"nr_file_pages",
	"nr_dirty",
	"nr_writeback",
	"nr_writeback_temp",
	"nr_shmem",
	"nr_shmem_hugepages",
	"nr_shmem_pmdmapped",
	"nr_file_hugepages",
	"nr_file_pmdmapped",
	"nr_anon_transparent_hugepages",
	"nr_vmscan_write",
	"nr_vmscan_immediate_reclaim",
	"nr_dirtied",
	"nr_written",
	"nr_kernel_misc_reclaimable",
	"nr_foll_pin_acquired",
	"nr_foll_pin_released",
	"nr_kernel_stack",
#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
	"nr_shadow_call_stack",
#endif
	"nr_page_table_pages",
#ifdef CONFIG_SWAP
	"nr_swapcached",
#endif

	/* enum writeback_stat_item counters */
	"nr_dirty_threshold",
	"nr_dirty_background_threshold",

#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
	/* enum vm_event_item counters */
	"pgpgin",
	"pgpgout",
	"pswpin",
	"pswpout",

	TEXTS_FOR_ZONES("pgalloc")
	TEXTS_FOR_ZONES("allocstall")
	TEXTS_FOR_ZONES("pgskip")

	"pgfree",
	"pgactivate",
	"pgdeactivate",
	"pglazyfree",

	"pgfault",
	"pgmajfault",
	"pglazyfreed",

	"pgrefill",
	"pgreuse",
	"pgsteal_kswapd",
	"pgsteal_direct",
	"pgscan_kswapd",
	"pgscan_direct",
	"pgscan_direct_throttle",
	"pgscan_anon",
	"pgscan_file",
	"pgsteal_anon",
	"pgsteal_file",

#ifdef CONFIG_NUMA
	"zone_reclaim_failed",
#endif
	"pginodesteal",
	"slabs_scanned",
	"kswapd_inodesteal",
	"kswapd_low_wmark_hit_quickly",
	"kswapd_high_wmark_hit_quickly",
	"pageoutrun",

	"pgrotated",

	"drop_pagecache",
	"drop_slab",
	"oom_kill",

#ifdef CONFIG_NUMA_BALANCING
	"numa_pte_updates",
	"numa_huge_pte_updates",
	"numa_hint_faults",
	"numa_hint_faults_local",
	"numa_pages_migrated",
#endif
#ifdef CONFIG_MIGRATION
	"pgmigrate_success",
	"pgmigrate_fail",
	"thp_migration_success",
	"thp_migration_fail",
	"thp_migration_split",
#endif
#ifdef CONFIG_COMPACTION
	"compact_migrate_scanned",
	"compact_free_scanned",
	"compact_isolated",
	"compact_stall",
	"compact_fail",
	"compact_success",
	"compact_daemon_wake",
	"compact_daemon_migrate_scanned",
	"compact_daemon_free_scanned",
#endif

#ifdef CONFIG_HUGETLB_PAGE
	"htlb_buddy_alloc_success",
	"htlb_buddy_alloc_fail",
#endif
#ifdef CONFIG_CMA
	"cma_alloc_success",
	"cma_alloc_fail",
#endif
	"unevictable_pgs_culled",
	"unevictable_pgs_scanned",
	"unevictable_pgs_rescued",
	"unevictable_pgs_mlocked",
	"unevictable_pgs_munlocked",
	"unevictable_pgs_cleared",
	"unevictable_pgs_stranded",

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	"thp_fault_alloc",
	"thp_fault_fallback",
	"thp_fault_fallback_charge",
	"thp_collapse_alloc",
	"thp_collapse_alloc_failed",
	"thp_file_alloc",
	"thp_file_fallback",
	"thp_file_fallback_charge",
	"thp_file_mapped",
	"thp_split_page",
	"thp_split_page_failed",
	"thp_deferred_split_page",
	"thp_split_pmd",
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
	"thp_split_pud",
#endif
	"thp_zero_page_alloc",
	"thp_zero_page_alloc_failed",
	"thp_swpout",
	"thp_swpout_fallback",
#endif
#ifdef CONFIG_MEMORY_BALLOON
	"balloon_inflate",
	"balloon_deflate",
#ifdef CONFIG_BALLOON_COMPACTION
	"balloon_migrate",
#endif
#endif /* CONFIG_MEMORY_BALLOON */
#ifdef CONFIG_DEBUG_TLBFLUSH
	"nr_tlb_remote_flush",
	"nr_tlb_remote_flush_received",
	"nr_tlb_local_flush_all",
	"nr_tlb_local_flush_one",
#endif /* CONFIG_DEBUG_TLBFLUSH */

#ifdef CONFIG_DEBUG_VM_VMACACHE
	"vmacache_find_calls",
	"vmacache_find_hits",
#endif
#ifdef CONFIG_SWAP
	"swap_ra",
	"swap_ra_hit",
#endif
#ifdef CONFIG_X86
	"direct_map_level2_splits",
	"direct_map_level3_splits",
#endif
#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
};
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */

#if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
     defined(CONFIG_PROC_FS)
static void *frag_start(struct seq_file *m, loff_t *pos)
{
	pg_data_t *pgdat;
	loff_t node = *pos;

	for (pgdat = first_online_pgdat();
	     pgdat && node;
	     pgdat = next_online_pgdat(pgdat))
		--node;

	return pgdat;
}

static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	(*pos)++;
	return next_online_pgdat(pgdat);
}

static void frag_stop(struct seq_file *m, void *arg)
{
}

/*
 * Walk zones in a node and print using a callback.
 * If @assert_populated is true, only use callback for zones that are populated.
 */
static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
		bool assert_populated, bool nolock,
		void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
{
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;
	unsigned long flags;

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
		if (assert_populated && !populated_zone(zone))
			continue;

		if (!nolock)
			spin_lock_irqsave(&zone->lock, flags);
		print(m, pgdat, zone);
		if (!nolock)
			spin_unlock_irqrestore(&zone->lock, flags);
	}
}
#endif

#ifdef CONFIG_PROC_FS
static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
						struct zone *zone)
{
	int order;

	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
	for (order = 0; order < MAX_ORDER; ++order)
		seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
	seq_putc(m, '\n');
}

/*
 * This walks the free areas for each zone.
 */
static int frag_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	walk_zones_in_node(m, pgdat, true, false, frag_show_print);
	return 0;
}

static void pagetypeinfo_showfree_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	int order, mtype;

	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
		seq_printf(m, "Node %4d, zone %8s, type %12s ",
					pgdat->node_id,
					zone->name,
					migratetype_names[mtype]);
		for (order = 0; order < MAX_ORDER; ++order) {
			unsigned long freecount = 0;
			struct free_area *area;
			struct list_head *curr;
			bool overflow = false;

			area = &(zone->free_area[order]);

			list_for_each(curr, &area->free_list[mtype]) {
				/*
				 * Cap the free_list iteration because it might
				 * be really large and we are under a spinlock
				 * so a long time spent here could trigger a
				 * hard lockup detector. Anyway this is a
				 * debugging tool so knowing there is a handful
				 * of pages of this order should be more than
				 * sufficient.
				 */
				if (++freecount >= 100000) {
					overflow = true;
					break;
				}
			}
			seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount);
			spin_unlock_irq(&zone->lock);
			cond_resched();
			spin_lock_irq(&zone->lock);
		}
		seq_putc(m, '\n');
	}
}

/* Print out the free pages at each order for each migratetype */
static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
{
	int order;
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* Print header */
	seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
	for (order = 0; order < MAX_ORDER; ++order)
		seq_printf(m, "%6d ", order);
	seq_putc(m, '\n');

	walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);

	return 0;
}

static void pagetypeinfo_showblockcount_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	int mtype;
	unsigned long pfn;
	unsigned long start_pfn = zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	unsigned long count[MIGRATE_TYPES] = { 0, };

	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
		struct page *page;

		page = pfn_to_online_page(pfn);
		if (!page)
			continue;

		if (page_zone(page) != zone)
			continue;

		mtype = get_pageblock_migratetype(page);

		if (mtype < MIGRATE_TYPES)
			count[mtype]++;
	}

	/* Print counts */
	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
		seq_printf(m, "%12lu ", count[mtype]);
	seq_putc(m, '\n');
}

/* Print out the number of pageblocks for each migratetype */
static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
{
	int mtype;
	pg_data_t *pgdat = (pg_data_t *)arg;

	seq_printf(m, "\n%-23s", "Number of blocks type ");
	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
		seq_printf(m, "%12s ", migratetype_names[mtype]);
	seq_putc(m, '\n');
	walk_zones_in_node(m, pgdat, true, false,
		pagetypeinfo_showblockcount_print);

	return 0;
}

/*
 * Print out the number of pageblocks for each migratetype that contain pages
 * of other types. This gives an indication of how well fallbacks are being
 * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
 * to determine what is going on
 */
static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
{
#ifdef CONFIG_PAGE_OWNER
	int mtype;

	if (!static_branch_unlikely(&page_owner_inited))
		return;

	drain_all_pages(NULL);

	seq_printf(m, "\n%-23s", "Number of mixed blocks ");
	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
		seq_printf(m, "%12s ", migratetype_names[mtype]);
	seq_putc(m, '\n');

	walk_zones_in_node(m, pgdat, true, true,
		pagetypeinfo_showmixedcount_print);
#endif /* CONFIG_PAGE_OWNER */
}

/*
 * This prints out statistics in relation to grouping pages by mobility.
 * It is expensive to collect so do not constantly read the file.
 */
static int pagetypeinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* check memoryless node */
	if (!node_state(pgdat->node_id, N_MEMORY))
		return 0;

	seq_printf(m, "Page block order: %d\n", pageblock_order);
	seq_printf(m, "Pages per block:  %lu\n", pageblock_nr_pages);
	seq_putc(m, '\n');
	pagetypeinfo_showfree(m, pgdat);
	pagetypeinfo_showblockcount(m, pgdat);
	pagetypeinfo_showmixedcount(m, pgdat);

	return 0;
}

static const struct seq_operations fragmentation_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= frag_show,
};

static const struct seq_operations pagetypeinfo_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= pagetypeinfo_show,
};

static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
{
	int zid;

	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
		struct zone *compare = &pgdat->node_zones[zid];

		if (populated_zone(compare))
			return zone == compare;
	}

	return false;
}

static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
							struct zone *zone)
{
	int i;
	seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
	if (is_zone_first_populated(pgdat, zone)) {
		seq_printf(m, "\n  per-node stats");
		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
			unsigned long pages = node_page_state_pages(pgdat, i);

			if (vmstat_item_print_in_thp(i))
				pages /= HPAGE_PMD_NR;
			seq_printf(m, "\n      %-12s %lu", node_stat_name(i),
				   pages);
		}
	}
	seq_printf(m,
		   "\n  pages free     %lu"
		   "\n        min      %lu"
		   "\n        low      %lu"
		   "\n        high     %lu"
		   "\n        spanned  %lu"
		   "\n        present  %lu"
		   "\n        managed  %lu"
		   "\n        cma      %lu",
		   zone_page_state(zone, NR_FREE_PAGES),
		   min_wmark_pages(zone),
		   low_wmark_pages(zone),
		   high_wmark_pages(zone),
		   zone->spanned_pages,
		   zone->present_pages,
		   zone_managed_pages(zone),
		   zone_cma_pages(zone));

	seq_printf(m,
		   "\n        protection: (%ld",
		   zone->lowmem_reserve[0]);
	for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
		seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
	seq_putc(m, ')');

	/* If unpopulated, no other information is useful */
	if (!populated_zone(zone)) {
		seq_putc(m, '\n');
		return;
	}

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		seq_printf(m, "\n      %-12s %lu", zone_stat_name(i),
			   zone_page_state(zone, i));

#ifdef CONFIG_NUMA
	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
		seq_printf(m, "\n      %-12s %lu", numa_stat_name(i),
			   zone_numa_state_snapshot(zone, i));
#endif

	seq_printf(m, "\n  pagesets");
	for_each_online_cpu(i) {
		struct per_cpu_pageset *pageset;

		pageset = per_cpu_ptr(zone->pageset, i);
		seq_printf(m,
			   "\n    cpu: %i"
			   "\n              count: %i"
			   "\n              high:  %i"
			   "\n              batch: %i",
			   i,
			   pageset->pcp.count,
			   pageset->pcp.high,
			   pageset->pcp.batch);
#ifdef CONFIG_SMP
		seq_printf(m, "\n  vm stats threshold: %d",
				pageset->stat_threshold);
#endif
	}
	seq_printf(m,
		   "\n  node_unreclaimable:  %u"
		   "\n  start_pfn:           %lu",
		   pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
		   zone->zone_start_pfn);
	seq_putc(m, '\n');
}

/*
 * Output information about zones in @pgdat.  All zones are printed regardless
 * of whether they are populated or not: lowmem_reserve_ratio operates on the
 * set of all zones and userspace would not be aware of such zones if they are
 * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
 */
static int zoneinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print);
	return 0;
}

static const struct seq_operations zoneinfo_op = {
	.start	= frag_start, /* iterate over all zones. The same as in
			       * fragmentation. */
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= zoneinfo_show,
};

#define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \
			 NR_VM_NUMA_STAT_ITEMS + \
			 NR_VM_NODE_STAT_ITEMS + \
			 NR_VM_WRITEBACK_STAT_ITEMS + \
			 (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \
			  NR_VM_EVENT_ITEMS : 0))

static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
	unsigned long *v;
	int i;

	if (*pos >= NR_VMSTAT_ITEMS)
		return NULL;

	BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
	v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
	m->private = v;
	if (!v)
		return ERR_PTR(-ENOMEM);
	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		v[i] = global_zone_page_state(i);
	v += NR_VM_ZONE_STAT_ITEMS;

#ifdef CONFIG_NUMA
	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
		v[i] = global_numa_state(i);
	v += NR_VM_NUMA_STAT_ITEMS;
#endif

	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
		v[i] = global_node_page_state_pages(i);
		if (vmstat_item_print_in_thp(i))
			v[i] /= HPAGE_PMD_NR;
	}
	v += NR_VM_NODE_STAT_ITEMS;

	global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
			    v + NR_DIRTY_THRESHOLD);
	v += NR_VM_WRITEBACK_STAT_ITEMS;

#ifdef CONFIG_VM_EVENT_COUNTERS
	all_vm_events(v);
	v[PGPGIN] /= 2;		/* sectors -> kbytes */
	v[PGPGOUT] /= 2;
#endif
	return (unsigned long *)m->private + *pos;
}

static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
	(*pos)++;
	if (*pos >= NR_VMSTAT_ITEMS)
		return NULL;
	return (unsigned long *)m->private + *pos;
}

static int vmstat_show(struct seq_file *m, void *arg)
{
	unsigned long *l = arg;
	unsigned long off = l - (unsigned long *)m->private;

	seq_puts(m, vmstat_text[off]);
	seq_put_decimal_ull(m, " ", *l);
	seq_putc(m, '\n');

	if (off == NR_VMSTAT_ITEMS - 1) {
		/*
		 * We've come to the end - add any deprecated counters to avoid
		 * breaking userspace which might depend on them being present.
		 */
		seq_puts(m, "nr_unstable 0\n");
	}
	return 0;
}
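
/*
 * Each /proc/vmstat line therefore looks like "<name> <value>", e.g.
 * "nr_free_pages 123456", with the deprecated nr_unstable appended as a
 * constant 0 for backwards compatibility.
 */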
1817
1818static void vmstat_stop(struct seq_file *m, void *arg)
1819{
1820 kfree(m->private);
1821 m->private = NULL;
1822}
1823
b6aa44ab 1824static const struct seq_operations vmstat_op = {
f6ac2354
CL
1825 .start = vmstat_start,
1826 .next = vmstat_next,
1827 .stop = vmstat_stop,
1828 .show = vmstat_show,
1829};
f6ac2354
CL
1830#endif /* CONFIG_PROC_FS */

#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;

#ifdef CONFIG_PROC_FS
static void refresh_vm_stats(struct work_struct *work)
{
        refresh_cpu_vm_stats(true);
}

int vmstat_refresh(struct ctl_table *table, int write,
                   void *buffer, size_t *lenp, loff_t *ppos)
{
        long val;
        int err;
        int i;

        /*
         * The regular update, every sysctl_stat_interval, may come later
         * than expected: leaving a significant amount in per_cpu buckets.
         * This is particularly misleading when checking a quantity of HUGE
         * pages, immediately after running a test. /proc/sys/vm/stat_refresh,
         * which can equally be echo'ed to or cat'ted from (by root),
         * can be used to update the stats just before reading them.
         *
         * Oh, and since global_zone_page_state() etc. are so careful to hide
         * transiently negative values, report an error here if any of
         * the stats is negative, so we know to go looking for imbalance.
         */
        err = schedule_on_each_cpu(refresh_vm_stats);
        if (err)
                return err;
        for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
                /*
                 * Skip checking stats known to go negative occasionally.
                 */
                switch (i) {
                case NR_ZONE_WRITE_PENDING:
                case NR_FREE_CMA_PAGES:
                        continue;
                }
                val = atomic_long_read(&vm_zone_stat[i]);
                if (val < 0) {
                        pr_warn("%s: %s %ld\n",
                                __func__, zone_stat_name(i), val);
                }
        }
        for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
                /*
                 * Skip checking stats known to go negative occasionally.
                 */
                switch (i) {
                case NR_WRITEBACK:
                        continue;
                }
                val = atomic_long_read(&vm_node_stat[i]);
                if (val < 0) {
                        pr_warn("%s: %s %ld\n",
                                __func__, node_stat_name(i), val);
                }
        }
        if (write)
                *ppos += *lenp;
        else
                *lenp = 0;
        return 0;
}
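
/*
 * Illustrative userspace sketch (not part of mm/vmstat.c): the comment in
 * vmstat_refresh() notes that /proc/sys/vm/stat_refresh can be written (by
 * root) to fold the per-cpu diffs just before reading the counters. A
 * hedged example of that sequence from a C program:
 */
#include <fcntl.h>
#include <unistd.h>

static void stat_refresh_before_read(void)
{
        int fd = open("/proc/sys/vm/stat_refresh", O_WRONLY);

        if (fd >= 0) {
                /* any write invokes vmstat_refresh() and folds the diffs */
                (void)write(fd, "1", 1);
                close(fd);
        }
        /* a /proc/vmstat read after this point reflects the folded totals */
}
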
#endif /* CONFIG_PROC_FS */

static void vmstat_update(struct work_struct *w)
{
        if (refresh_cpu_vm_stats(true)) {
                /*
                 * Counters were updated so we expect more updates
                 * to occur in the future. Keep on running the
                 * update worker thread.
                 */
                queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
                        this_cpu_ptr(&vmstat_work),
                        round_jiffies_relative(sysctl_stat_interval));
        }
}

/*
 * Check if the diffs for a certain cpu indicate that
 * an update is needed.
 */
static bool need_update(int cpu)
{
        pg_data_t *last_pgdat = NULL;
        struct zone *zone;

        for_each_populated_zone(zone) {
                struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
                struct per_cpu_nodestat *n;
                /*
                 * The fast way of checking if there are any vmstat diffs.
                 */
                if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS *
                               sizeof(p->vm_stat_diff[0])))
                        return true;
#ifdef CONFIG_NUMA
                if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS *
                               sizeof(p->vm_numa_stat_diff[0])))
                        return true;
#endif
                if (last_pgdat == zone->zone_pgdat)
                        continue;
                last_pgdat = zone->zone_pgdat;
                n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu);
                if (memchr_inv(n->vm_node_stat_diff, 0, NR_VM_NODE_STAT_ITEMS *
                               sizeof(n->vm_node_stat_diff[0])))
                        return true;
        }
        return false;
}

/*
 * Switch off vmstat processing and then fold all the remaining differentials
 * until the diffs stay at zero. The function is used by NOHZ and can only be
 * invoked when tick processing is not active.
 */
void quiet_vmstat(void)
{
        if (system_state != SYSTEM_RUNNING)
                return;

        if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
                return;

        if (!need_update(smp_processor_id()))
                return;

        /*
         * Just refresh counters and do not care about the pending delayed
         * vmstat_update. It doesn't fire that often to matter and canceling
         * it would be too expensive from this path.
         * vmstat_shepherd will take care about that for us.
         */
        refresh_cpu_vm_stats(false);
}

/*
 * Shepherd worker thread that checks the
 * differentials of processors that have their worker
 * threads for vm statistics updates disabled because of
 * inactivity.
 */
static void vmstat_shepherd(struct work_struct *w);

static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);

static void vmstat_shepherd(struct work_struct *w)
{
        int cpu;

        get_online_cpus();
        /* Check processors whose vmstat worker threads have been disabled */
        for_each_online_cpu(cpu) {
                struct delayed_work *dw = &per_cpu(vmstat_work, cpu);

                if (!delayed_work_pending(dw) && need_update(cpu))
                        queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);

                cond_resched();
        }
        put_online_cpus();

        schedule_delayed_work(&shepherd,
                round_jiffies_relative(sysctl_stat_interval));
}

static void __init start_shepherd_timer(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
                        vmstat_update);

        schedule_delayed_work(&shepherd,
                round_jiffies_relative(sysctl_stat_interval));
}

static void __init init_cpu_node_state(void)
{
        int node;

        for_each_online_node(node) {
                if (cpumask_weight(cpumask_of_node(node)) > 0)
                        node_set_state(node, N_CPU);
        }
}

static int vmstat_cpu_online(unsigned int cpu)
{
        refresh_zone_stat_thresholds();
        node_set_state(cpu_to_node(cpu), N_CPU);
        return 0;
}

static int vmstat_cpu_down_prep(unsigned int cpu)
{
        cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
        return 0;
}

static int vmstat_cpu_dead(unsigned int cpu)
{
        const struct cpumask *node_cpus;
        int node;

        node = cpu_to_node(cpu);

        refresh_zone_stat_thresholds();
        node_cpus = cpumask_of_node(node);
        if (cpumask_weight(node_cpus) > 0)
                return 0;

        node_clear_state(node, N_CPU);
        return 0;
}

#endif

struct workqueue_struct *mm_percpu_wq;

void __init init_mm_internals(void)
{
        int ret __maybe_unused;

        mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0);

#ifdef CONFIG_SMP
        ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
                                        NULL, vmstat_cpu_dead);
        if (ret < 0)
                pr_err("vmstat: failed to register 'dead' hotplug state\n");

        ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online",
                                        vmstat_cpu_online,
                                        vmstat_cpu_down_prep);
        if (ret < 0)
                pr_err("vmstat: failed to register 'online' hotplug state\n");

        get_online_cpus();
        init_cpu_node_state();
        put_online_cpus();

        start_shepherd_timer();
#endif
#ifdef CONFIG_PROC_FS
        proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
        proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
        proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
        proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
#endif
}

#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)

/*
 * Return an index indicating how much of the available free memory is
 * unusable for an allocation of the requested size.
 */
static int unusable_free_index(unsigned int order,
                                struct contig_page_info *info)
{
        /* No free memory is interpreted as all free memory is unusable */
        if (info->free_pages == 0)
                return 1000;

        /*
         * Index should be a value between 0 and 1. Return a value to 3
         * decimal places.
         *
         * 0 => no fragmentation
         * 1 => high fragmentation
         */
        return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
}
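
/*
 * Worked example with hypothetical numbers: if a zone has 1000 free pages
 * and fill_contig_page_info() reports free_blocks_suitable = 200 for an
 * order-2 request, then 200 << 2 = 800 of those pages sit in blocks large
 * enough for the request, and the index is
 * (1000 - 800) * 1000 / 1000 = 200, printed below as "0.200": roughly 20%
 * of the free memory is unusable for order-2 allocations.
 */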

static void unusable_show_print(struct seq_file *m,
                                        pg_data_t *pgdat, struct zone *zone)
{
        unsigned int order;
        int index;
        struct contig_page_info info;

        seq_printf(m, "Node %d, zone %8s ",
                                pgdat->node_id,
                                zone->name);
        for (order = 0; order < MAX_ORDER; ++order) {
                fill_contig_page_info(zone, order, &info);
                index = unusable_free_index(order, &info);
                seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
        }

        seq_putc(m, '\n');
}
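
/*
 * With the seq_printf() format above, each line of the unusable_index file
 * (typically /sys/kernel/debug/extfrag/unusable_index, see
 * extfrag_debug_init() below) carries one value per order from 0 to
 * MAX_ORDER - 1. The values in this sample are made up:
 *
 *      Node 0, zone   Normal 0.000 0.010 0.034 0.089 0.201 ...
 */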

/*
 * Display unusable free space index
 *
 * The unusable free space index measures how much of the available free
 * memory cannot be used to satisfy an allocation of a given size and is a
 * value between 0 and 1. The higher the value, the more of free memory is
 * unusable and by implication, the worse the external fragmentation is. This
 * can be expressed as a percentage by multiplying by 100.
 */
static int unusable_show(struct seq_file *m, void *arg)
{
        pg_data_t *pgdat = (pg_data_t *)arg;

        /* check memoryless node */
        if (!node_state(pgdat->node_id, N_MEMORY))
                return 0;

        walk_zones_in_node(m, pgdat, true, false, unusable_show_print);

        return 0;
}

static const struct seq_operations unusable_sops = {
        .start  = frag_start,
        .next   = frag_next,
        .stop   = frag_stop,
        .show   = unusable_show,
};

DEFINE_SEQ_ATTRIBUTE(unusable);

static void extfrag_show_print(struct seq_file *m,
                                        pg_data_t *pgdat, struct zone *zone)
{
        unsigned int order;
        int index;

        /* Alloc on stack as interrupts are disabled for zone walk */
        struct contig_page_info info;

        seq_printf(m, "Node %d, zone %8s ",
                                pgdat->node_id,
                                zone->name);
        for (order = 0; order < MAX_ORDER; ++order) {
                fill_contig_page_info(zone, order, &info);
                index = __fragmentation_index(order, &info);
                seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
        }

        seq_putc(m, '\n');
}

/*
 * Display fragmentation index for orders that allocations would fail for
 */
static int extfrag_show(struct seq_file *m, void *arg)
{
        pg_data_t *pgdat = (pg_data_t *)arg;

        walk_zones_in_node(m, pgdat, true, false, extfrag_show_print);

        return 0;
}

static const struct seq_operations extfrag_sops = {
        .start  = frag_start,
        .next   = frag_next,
        .stop   = frag_stop,
        .show   = extfrag_show,
};

DEFINE_SEQ_ATTRIBUTE(extfrag);

static int __init extfrag_debug_init(void)
{
        struct dentry *extfrag_debug_root;

        extfrag_debug_root = debugfs_create_dir("extfrag", NULL);

        debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL,
                            &unusable_fops);

        debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL,
                            &extfrag_fops);

        return 0;
}

module_init(extfrag_debug_init);
#endif