// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/vmstat.c
 *
 *  Manages VM statistics
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  zoned VM statistics
 *  Copyright (C) 2006 Silicon Graphics, Inc.,
 *		Christoph Lameter <christoph@lameter.com>
 *  Copyright (C) 2008-2014 Christoph Lameter
 */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/vmstat.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/sched.h>
#include <linux/math64.h>
#include <linux/writeback.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
#include <linux/page_ext.h>
#include <linux/page_owner.h>

#include "internal.h"

#define NUMA_STATS_THRESHOLD (U16_MAX - 2)

#ifdef CONFIG_NUMA
int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;

/* zero numa counters within a zone */
static void zero_zone_numa_counters(struct zone *zone)
{
	int item, cpu;

	for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) {
		atomic_long_set(&zone->vm_numa_stat[item], 0);
		for_each_online_cpu(cpu)
			per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]
						= 0;
	}
}

/* zero numa counters of all the populated zones */
static void zero_zones_numa_counters(void)
{
	struct zone *zone;

	for_each_populated_zone(zone)
		zero_zone_numa_counters(zone);
}

/* zero global numa counters */
static void zero_global_numa_counters(void)
{
	int item;

	for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++)
		atomic_long_set(&vm_numa_stat[item], 0);
}

static void invalid_numa_statistics(void)
{
	zero_zones_numa_counters();
	zero_global_numa_counters();
}

static DEFINE_MUTEX(vm_numa_stat_lock);

int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
		void *buffer, size_t *length, loff_t *ppos)
{
	int ret, oldval;

	mutex_lock(&vm_numa_stat_lock);
	if (write)
		oldval = sysctl_vm_numa_stat;
	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (ret || !write)
		goto out;

	if (oldval == sysctl_vm_numa_stat)
		goto out;
	else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
		static_branch_enable(&vm_numa_stat_key);
		pr_info("enable numa statistics\n");
	} else {
		static_branch_disable(&vm_numa_stat_key);
		invalid_numa_statistics();
		pr_info("disable numa statistics, and clear numa counters\n");
	}

out:
	mutex_unlock(&vm_numa_stat_lock);
	return ret;
}
#endif

#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);

static void sum_vm_events(unsigned long *ret)
{
	int cpu;
	int i;

	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));

	for_each_online_cpu(cpu) {
		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);

		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
			ret[i] += this->event[i];
	}
}

/*
 * Accumulate the vm event counters across all CPUs.
 * The result is unavoidably approximate - it can change
 * during and after execution of this function.
 */
void all_vm_events(unsigned long *ret)
{
	get_online_cpus();
	sum_vm_events(ret);
	put_online_cpus();
}
EXPORT_SYMBOL_GPL(all_vm_events);

/*
 * Fold the foreign cpu events into our own.
 *
 * This is adding to the events on one processor
 * but keeps the global counts constant.
 */
void vm_events_fold_cpu(int cpu)
{
	struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
	int i;

	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
		count_vm_events(i, fold_state->event[i]);
		fold_state->event[i] = 0;
	}
}

#endif /* CONFIG_VM_EVENT_COUNTERS */

/*
 * Manage combined zone based / global counters
 *
 * vm_stat contains the global counters
 */
atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS] __cacheline_aligned_in_smp;
atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
EXPORT_SYMBOL(vm_zone_stat);
EXPORT_SYMBOL(vm_numa_stat);
EXPORT_SYMBOL(vm_node_stat);

#ifdef CONFIG_SMP

int calculate_pressure_threshold(struct zone *zone)
{
	int threshold;
	int watermark_distance;

	/*
	 * As vmstats are not up to date, there is drift between the estimated
	 * and real values. For high thresholds and a high number of CPUs, it
	 * is possible for the min watermark to be breached while the estimated
	 * value looks fine. The pressure threshold is a reduced value such
	 * that even the maximum amount of drift will not accidentally breach
	 * the min watermark
	 */
	watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
	threshold = max(1, (int)(watermark_distance / num_online_cpus()));

	/*
	 * Maximum threshold is 125
	 */
	threshold = min(125, threshold);

	return threshold;
}
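
/*
 * Illustrative example (editorial addition, not part of the original
 * source): with a low-to-min watermark gap of 1024 pages and 8 online
 * CPUs, the pressure threshold is min(125, max(1, 1024 / 8)) = 125,
 * so the worst-case combined per-cpu drift (8 * 125 = 1000 pages)
 * stays below the watermark gap.
 */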

int calculate_normal_threshold(struct zone *zone)
{
	int threshold;
	int mem;	/* memory in 128 MB units */

	/*
	 * The threshold scales with the number of processors and the amount
	 * of memory per zone. More memory means that we can defer updates for
	 * longer, more processors could lead to more contention.
	 * fls() is used to have a cheap way of logarithmic scaling.
	 *
	 * Some sample thresholds:
	 *
	 * Threshold	Processors	(fls)	Zonesize	fls(mem)+1
	 * ------------------------------------------------------------------
	 * 8		1		1	0.9-1 GB	4
	 * 16		2		2	0.9-1 GB	4
	 * 20		2		2	1-2 GB		5
	 * 24		2		2	2-4 GB		6
	 * 28		2		2	4-8 GB		7
	 * 32		2		2	8-16 GB		8
	 * 4		2		2	<128M		1
	 * 30		4		3	2-4 GB		5
	 * 48		4		3	8-16 GB		8
	 * 32		8		4	1-2 GB		4
	 * 32		8		4	0.9-1GB		4
	 * 10		16		5	<128M		1
	 * 40		16		5	900M		4
	 * 70		64		7	2-4 GB		5
	 * 84		64		7	4-8 GB		6
	 * 108		512		9	4-8 GB		6
	 * 125		1024		10	8-16 GB		8
	 * 125		1024		10	16-32 GB	9
	 */

	mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT);

	threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));

	/*
	 * Maximum threshold is 125
	 */
	threshold = min(125, threshold);

	return threshold;
}

/*
 * Refresh the thresholds for each zone.
 */
void refresh_zone_stat_thresholds(void)
{
	struct pglist_data *pgdat;
	struct zone *zone;
	int cpu;
	int threshold;

	/* Zero current pgdat thresholds */
	for_each_online_pgdat(pgdat) {
		for_each_online_cpu(cpu) {
			per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
		}
	}

	for_each_populated_zone(zone) {
		struct pglist_data *pgdat = zone->zone_pgdat;
		unsigned long max_drift, tolerate_drift;

		threshold = calculate_normal_threshold(zone);

		for_each_online_cpu(cpu) {
			int pgdat_threshold;

			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
							= threshold;

			/* Base nodestat threshold on the largest populated zone. */
			pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
			per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
				= max(threshold, pgdat_threshold);
		}

		/*
		 * Only set percpu_drift_mark if there is a danger that
		 * NR_FREE_PAGES reports the low watermark is ok when in fact
		 * the min watermark could be breached by an allocation
		 */
		tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
		max_drift = num_online_cpus() * threshold;
		if (max_drift > tolerate_drift)
			zone->percpu_drift_mark = high_wmark_pages(zone) +
					max_drift;
	}
}

void set_pgdat_percpu_threshold(pg_data_t *pgdat,
				int (*calculate_pressure)(struct zone *))
{
	struct zone *zone;
	int cpu;
	int threshold;
	int i;

	for (i = 0; i < pgdat->nr_zones; i++) {
		zone = &pgdat->node_zones[i];
		if (!zone->percpu_drift_mark)
			continue;

		threshold = (*calculate_pressure)(zone);
		for_each_online_cpu(cpu)
			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
							= threshold;
	}
}

/*
 * For use when we know that interrupts are disabled,
 * or when we know that preemption is disabled and that
 * particular counter cannot be updated from interrupt context.
 */
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
			   long delta)
{
	struct per_cpu_pageset __percpu *pcp = zone->pageset;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	long x;
	long t;

	x = delta + __this_cpu_read(*p);

	t = __this_cpu_read(pcp->stat_threshold);

	if (unlikely(abs(x) > t)) {
		zone_page_state_add(x, zone, item);
		x = 0;
	}
	__this_cpu_write(*p, x);
}
EXPORT_SYMBOL(__mod_zone_page_state);

void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
				long delta)
{
	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
	s8 __percpu *p = pcp->vm_node_stat_diff + item;
	long x;
	long t;

	if (vmstat_item_in_bytes(item)) {
		/*
		 * Only cgroups use subpage accounting right now; at
		 * the global level, these items still change in
		 * multiples of whole pages. Store them as pages
		 * internally to keep the per-cpu counters compact.
		 */
		VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
		delta >>= PAGE_SHIFT;
	}

	x = delta + __this_cpu_read(*p);

	t = __this_cpu_read(pcp->stat_threshold);

	if (unlikely(abs(x) > t)) {
		node_page_state_add(x, pgdat, item);
		x = 0;
	}
	__this_cpu_write(*p, x);
}
EXPORT_SYMBOL(__mod_node_page_state);

/*
 * Optimized increment and decrement functions.
 *
 * These are only for a single page and therefore can take a struct page *
 * argument instead of struct zone *. This allows the inclusion of the code
 * generated for page_zone(page) into the optimized functions.
 *
 * No overflow check is necessary and therefore the differential can be
 * incremented or decremented in place which may allow the compilers to
 * generate better code.
 * The increment or decrement is known and therefore one boundary check can
 * be omitted.
 *
 * NOTE: These functions are very performance sensitive. Change only
 * with care.
 *
 * Some processors have inc/dec instructions that are atomic vs an interrupt.
 * However, the code must first determine the differential location in a zone
 * based on the processor number and then inc/dec the counter. There is no
 * guarantee without disabling preemption that the processor will not change
 * in between and therefore the atomicity vs. interrupt cannot be exploited
 * in a useful way here.
 */
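/*
 * Illustrative example (editorial addition, not part of the original
 * source): with a stat_threshold of 32, the per-cpu diff may reach 33
 * on an increment; __inc_zone_state() then folds 33 + 16 into the zone
 * counter and resets the diff to -16, so roughly the next 48 increments
 * stay entirely per-cpu before another fold is needed.
 */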
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_pageset __percpu *pcp = zone->pageset;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	s8 v, t;

	v = __this_cpu_inc_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v > t)) {
		s8 overstep = t >> 1;

		zone_page_state_add(v + overstep, zone, item);
		__this_cpu_write(*p, -overstep);
	}
}

void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
	s8 __percpu *p = pcp->vm_node_stat_diff + item;
	s8 v, t;

	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));

	v = __this_cpu_inc_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v > t)) {
		s8 overstep = t >> 1;

		node_page_state_add(v + overstep, pgdat, item);
		__this_cpu_write(*p, -overstep);
	}
}

void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__inc_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__inc_zone_page_state);

void __inc_node_page_state(struct page *page, enum node_stat_item item)
{
	__inc_node_state(page_pgdat(page), item);
}
EXPORT_SYMBOL(__inc_node_page_state);

void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_pageset __percpu *pcp = zone->pageset;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	s8 v, t;

	v = __this_cpu_dec_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v < - t)) {
		s8 overstep = t >> 1;

		zone_page_state_add(v - overstep, zone, item);
		__this_cpu_write(*p, overstep);
	}
}

void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
	s8 __percpu *p = pcp->vm_node_stat_diff + item;
	s8 v, t;

	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));

	v = __this_cpu_dec_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v < - t)) {
		s8 overstep = t >> 1;

		node_page_state_add(v - overstep, pgdat, item);
		__this_cpu_write(*p, overstep);
	}
}

void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__dec_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__dec_zone_page_state);

void __dec_node_page_state(struct page *page, enum node_stat_item item)
{
	__dec_node_state(page_pgdat(page), item);
}
EXPORT_SYMBOL(__dec_node_page_state);

#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
/*
 * If we have cmpxchg_local support then we do not need to incur the overhead
 * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
 *
 * mod_state() modifies the zone counter state through atomic per cpu
 * operations.
 *
 * Overstep mode specifies how overstep should be handled:
 *     0       No overstepping
 *     1       Overstepping half of threshold
 *     -1      Overstepping minus half of threshold
 */
static inline void mod_zone_state(struct zone *zone,
	enum zone_stat_item item, long delta, int overstep_mode)
{
	struct per_cpu_pageset __percpu *pcp = zone->pageset;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	long o, n, t, z;

	do {
		z = 0;  /* overflow to zone counters */

		/*
		 * The fetching of the stat_threshold is racy. We may apply
		 * a counter threshold to the wrong cpu if we get
		 * rescheduled while executing here. However, the next
		 * counter update will apply the threshold again and
		 * therefore bring the counter under the threshold again.
		 *
		 * Most of the time the thresholds are the same anyways
		 * for all cpus in a zone.
		 */
		t = this_cpu_read(pcp->stat_threshold);

		o = this_cpu_read(*p);
		n = delta + o;

		if (abs(n) > t) {
			int os = overstep_mode * (t >> 1);

			/* Overflow must be added to zone counters */
			z = n + os;
			n = -os;
		}
	} while (this_cpu_cmpxchg(*p, o, n) != o);

	if (z)
		zone_page_state_add(z, zone, item);
}

void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
			 long delta)
{
	mod_zone_state(zone, item, delta, 0);
}
EXPORT_SYMBOL(mod_zone_page_state);

void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	mod_zone_state(page_zone(page), item, 1, 1);
}
EXPORT_SYMBOL(inc_zone_page_state);

void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	mod_zone_state(page_zone(page), item, -1, -1);
}
EXPORT_SYMBOL(dec_zone_page_state);

static inline void mod_node_state(struct pglist_data *pgdat,
	enum node_stat_item item, int delta, int overstep_mode)
{
	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
	s8 __percpu *p = pcp->vm_node_stat_diff + item;
	long o, n, t, z;

	if (vmstat_item_in_bytes(item)) {
		/*
		 * Only cgroups use subpage accounting right now; at
		 * the global level, these items still change in
		 * multiples of whole pages. Store them as pages
		 * internally to keep the per-cpu counters compact.
		 */
		VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
		delta >>= PAGE_SHIFT;
	}

	do {
		z = 0;  /* overflow to node counters */

		/*
		 * The fetching of the stat_threshold is racy. We may apply
		 * a counter threshold to the wrong cpu if we get
		 * rescheduled while executing here. However, the next
		 * counter update will apply the threshold again and
		 * therefore bring the counter under the threshold again.
		 *
		 * Most of the time the thresholds are the same anyways
		 * for all cpus in a node.
		 */
		t = this_cpu_read(pcp->stat_threshold);

		o = this_cpu_read(*p);
		n = delta + o;

		if (abs(n) > t) {
			int os = overstep_mode * (t >> 1);

			/* Overflow must be added to node counters */
			z = n + os;
			n = -os;
		}
	} while (this_cpu_cmpxchg(*p, o, n) != o);

	if (z)
		node_page_state_add(z, pgdat, item);
}

void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
					long delta)
{
	mod_node_state(pgdat, item, delta, 0);
}
EXPORT_SYMBOL(mod_node_page_state);

void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
	mod_node_state(pgdat, item, 1, 1);
}

void inc_node_page_state(struct page *page, enum node_stat_item item)
{
	mod_node_state(page_pgdat(page), item, 1, 1);
}
EXPORT_SYMBOL(inc_node_page_state);

void dec_node_page_state(struct page *page, enum node_stat_item item)
{
	mod_node_state(page_pgdat(page), item, -1, -1);
}
EXPORT_SYMBOL(dec_node_page_state);
#else
/*
 * Use interrupt disable to serialize counter updates
 */
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
			 long delta)
{
	unsigned long flags;

	local_irq_save(flags);
	__mod_zone_page_state(zone, item, delta);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_zone_page_state);

void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;
	struct zone *zone;

	zone = page_zone(page);
	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_zone_page_state);

void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__dec_zone_page_state(page, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);

void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__inc_node_state(pgdat, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_node_state);

void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
					long delta)
{
	unsigned long flags;

	local_irq_save(flags);
	__mod_node_page_state(pgdat, item, delta);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_node_page_state);

void inc_node_page_state(struct page *page, enum node_stat_item item)
{
	unsigned long flags;
	struct pglist_data *pgdat;

	pgdat = page_pgdat(page);
	local_irq_save(flags);
	__inc_node_state(pgdat, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_node_page_state);

void dec_node_page_state(struct page *page, enum node_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__dec_node_page_state(page, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_node_page_state);
#endif

/*
 * Fold a differential into the global counters.
 * Returns the number of counters updated.
 */
#ifdef CONFIG_NUMA
static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff)
{
	int i;
	int changes = 0;

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		if (zone_diff[i]) {
			atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
			changes++;
	}

	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
		if (numa_diff[i]) {
			atomic_long_add(numa_diff[i], &vm_numa_stat[i]);
			changes++;
	}

	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
		if (node_diff[i]) {
			atomic_long_add(node_diff[i], &vm_node_stat[i]);
			changes++;
	}
	return changes;
}
#else
static int fold_diff(int *zone_diff, int *node_diff)
{
	int i;
	int changes = 0;

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		if (zone_diff[i]) {
			atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
			changes++;
	}

	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
		if (node_diff[i]) {
			atomic_long_add(node_diff[i], &vm_node_stat[i]);
			changes++;
	}
	return changes;
}
#endif /* CONFIG_NUMA */

/*
 * Update the zone counters for the current cpu.
 *
 * Note that refresh_cpu_vm_stats strives to only access
 * node local memory. The per cpu pagesets on remote zones are placed
 * in the memory local to the processor using that pageset. So the
 * loop over all zones will access a series of cachelines local to
 * the processor.
 *
 * The call to zone_page_state_add updates the cachelines with the
 * statistics in the remote zone struct as well as the global cachelines
 * with the global counters. These could cause remote node cache line
 * bouncing and will have to be only done when necessary.
 *
 * The function returns the number of global counters updated.
 */
static int refresh_cpu_vm_stats(bool do_pagesets)
{
	struct pglist_data *pgdat;
	struct zone *zone;
	int i;
	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
#ifdef CONFIG_NUMA
	int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
#endif
	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
	int changes = 0;

	for_each_populated_zone(zone) {
		struct per_cpu_pageset __percpu *p = zone->pageset;

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
			int v;

			v = this_cpu_xchg(p->vm_stat_diff[i], 0);
			if (v) {

				atomic_long_add(v, &zone->vm_stat[i]);
				global_zone_diff[i] += v;
#ifdef CONFIG_NUMA
				/* 3 seconds idle till flush */
				__this_cpu_write(p->expire, 3);
#endif
			}
		}
#ifdef CONFIG_NUMA
		for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
			int v;

			v = this_cpu_xchg(p->vm_numa_stat_diff[i], 0);
			if (v) {

				atomic_long_add(v, &zone->vm_numa_stat[i]);
				global_numa_diff[i] += v;
				__this_cpu_write(p->expire, 3);
			}
		}

		if (do_pagesets) {
			cond_resched();
			/*
			 * Deal with draining the remote pageset of this
			 * processor
			 *
			 * Check if there are pages remaining in this pageset
			 * if not then there is nothing to expire.
			 */
			if (!__this_cpu_read(p->expire) ||
			    !__this_cpu_read(p->pcp.count))
				continue;

			/*
			 * We never drain zones local to this processor.
			 */
			if (zone_to_nid(zone) == numa_node_id()) {
				__this_cpu_write(p->expire, 0);
				continue;
			}

			if (__this_cpu_dec_return(p->expire))
				continue;

			if (__this_cpu_read(p->pcp.count)) {
				drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
				changes++;
			}
		}
#endif
	}

	for_each_online_pgdat(pgdat) {
		struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;

		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
			int v;

			v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
			if (v) {
				atomic_long_add(v, &pgdat->vm_stat[i]);
				global_node_diff[i] += v;
			}
		}
	}

#ifdef CONFIG_NUMA
	changes += fold_diff(global_zone_diff, global_numa_diff,
			     global_node_diff);
#else
	changes += fold_diff(global_zone_diff, global_node_diff);
#endif
	return changes;
}

/*
 * Fold the data for an offline cpu into the global array.
 * There cannot be any access by the offline cpu and therefore
 * synchronization is simplified.
 */
void cpu_vm_stats_fold(int cpu)
{
	struct pglist_data *pgdat;
	struct zone *zone;
	int i;
	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
#ifdef CONFIG_NUMA
	int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
#endif
	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };

	for_each_populated_zone(zone) {
		struct per_cpu_pageset *p;

		p = per_cpu_ptr(zone->pageset, cpu);

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
			if (p->vm_stat_diff[i]) {
				int v;

				v = p->vm_stat_diff[i];
				p->vm_stat_diff[i] = 0;
				atomic_long_add(v, &zone->vm_stat[i]);
				global_zone_diff[i] += v;
			}

#ifdef CONFIG_NUMA
		for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
			if (p->vm_numa_stat_diff[i]) {
				int v;

				v = p->vm_numa_stat_diff[i];
				p->vm_numa_stat_diff[i] = 0;
				atomic_long_add(v, &zone->vm_numa_stat[i]);
				global_numa_diff[i] += v;
			}
#endif
	}

	for_each_online_pgdat(pgdat) {
		struct per_cpu_nodestat *p;

		p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);

		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
			if (p->vm_node_stat_diff[i]) {
				int v;

				v = p->vm_node_stat_diff[i];
				p->vm_node_stat_diff[i] = 0;
				atomic_long_add(v, &pgdat->vm_stat[i]);
				global_node_diff[i] += v;
			}
	}

#ifdef CONFIG_NUMA
	fold_diff(global_zone_diff, global_numa_diff, global_node_diff);
#else
	fold_diff(global_zone_diff, global_node_diff);
#endif
}

/*
 * This is only called if !populated_zone(zone), which implies no other users
 * of pset->vm_stat_diff[] exist.
 */
void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
{
	int i;

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		if (pset->vm_stat_diff[i]) {
			int v = pset->vm_stat_diff[i];
			pset->vm_stat_diff[i] = 0;
			atomic_long_add(v, &zone->vm_stat[i]);
			atomic_long_add(v, &vm_zone_stat[i]);
		}

#ifdef CONFIG_NUMA
	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
		if (pset->vm_numa_stat_diff[i]) {
			int v = pset->vm_numa_stat_diff[i];

			pset->vm_numa_stat_diff[i] = 0;
			atomic_long_add(v, &zone->vm_numa_stat[i]);
			atomic_long_add(v, &vm_numa_stat[i]);
		}
#endif
}
#endif

#ifdef CONFIG_NUMA
void __inc_numa_state(struct zone *zone,
				 enum numa_stat_item item)
{
	struct per_cpu_pageset __percpu *pcp = zone->pageset;
	u16 __percpu *p = pcp->vm_numa_stat_diff + item;
	u16 v;

	v = __this_cpu_inc_return(*p);

	if (unlikely(v > NUMA_STATS_THRESHOLD)) {
		zone_numa_state_add(v, zone, item);
		__this_cpu_write(*p, 0);
	}
}

/*
 * Determine the per node value of a stat item. This function
 * is called frequently in a NUMA machine, so try to be as
 * frugal as possible.
 */
unsigned long sum_zone_node_page_state(int node,
				 enum zone_stat_item item)
{
	struct zone *zones = NODE_DATA(node)->node_zones;
	int i;
	unsigned long count = 0;

	for (i = 0; i < MAX_NR_ZONES; i++)
		count += zone_page_state(zones + i, item);

	return count;
}

/*
 * Determine the per node value of a numa stat item. To avoid deviation,
 * the per cpu stat number in vm_numa_stat_diff[] is also included.
 */
unsigned long sum_zone_numa_state(int node,
				 enum numa_stat_item item)
{
	struct zone *zones = NODE_DATA(node)->node_zones;
	int i;
	unsigned long count = 0;

	for (i = 0; i < MAX_NR_ZONES; i++)
		count += zone_numa_state_snapshot(zones + i, item);

	return count;
}

/*
 * Determine the per node value of a stat item.
 */
unsigned long node_page_state_pages(struct pglist_data *pgdat,
				    enum node_stat_item item)
{
	long x = atomic_long_read(&pgdat->vm_stat[item]);
#ifdef CONFIG_SMP
	if (x < 0)
		x = 0;
#endif
	return x;
}

unsigned long node_page_state(struct pglist_data *pgdat,
			      enum node_stat_item item)
{
	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));

	return node_page_state_pages(pgdat, item);
}
#endif

#ifdef CONFIG_COMPACTION

struct contig_page_info {
	unsigned long free_pages;
	unsigned long free_blocks_total;
	unsigned long free_blocks_suitable;
};

/*
 * Calculate the number of free pages in a zone, how many contiguous
 * pages are free and how many are large enough to satisfy an allocation of
 * the target size. Note that this function makes no attempt to estimate
 * how many suitable free blocks there *might* be if MOVABLE pages were
 * migrated. Calculating that is possible, but expensive and can be
 * figured out from userspace
 */
static void fill_contig_page_info(struct zone *zone,
				unsigned int suitable_order,
				struct contig_page_info *info)
{
	unsigned int order;

	info->free_pages = 0;
	info->free_blocks_total = 0;
	info->free_blocks_suitable = 0;

	for (order = 0; order < MAX_ORDER; order++) {
		unsigned long blocks;

		/* Count number of free blocks */
		blocks = zone->free_area[order].nr_free;
		info->free_blocks_total += blocks;

		/* Count free base pages */
		info->free_pages += blocks << order;

		/* Count the suitable free blocks */
		if (order >= suitable_order)
			info->free_blocks_suitable += blocks <<
						(order - suitable_order);
	}
}

/*
 * A fragmentation index only makes sense if an allocation of a requested
 * size would fail. If that is true, the fragmentation index indicates
 * whether external fragmentation or a lack of memory was the problem.
 * The value can be used to determine if page reclaim or compaction
 * should be used
 */
static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
{
	unsigned long requested = 1UL << order;

	if (WARN_ON_ONCE(order >= MAX_ORDER))
		return 0;

	if (!info->free_blocks_total)
		return 0;

	/* Fragmentation index only makes sense when a request would fail */
	if (info->free_blocks_suitable)
		return -1000;

	/*
	 * Index is between 0 and 1 so return within 3 decimal places
	 *
	 * 0 => allocation would fail due to lack of memory
	 * 1 => allocation would fail due to fragmentation
	 */
	return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
}
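
/*
 * Worked example for __fragmentation_index() (editorial addition, not
 * part of the original source): for order = 3 (requested = 8 pages),
 * free_pages = 1000, free_blocks_total = 300 and no suitable block, the
 * result is 1000 - (1000 + 1000 * 1000 / 8) / 300 = 1000 - 420 = 580,
 * i.e. an index of 0.580, leaning towards fragmentation rather than a
 * plain lack of memory.
 */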

/*
 * Calculates external fragmentation within a zone wrt the given order.
 * It is defined as the percentage of pages found in blocks of size
 * less than 1 << order. It returns values in range [0, 100].
 */
unsigned int extfrag_for_order(struct zone *zone, unsigned int order)
{
	struct contig_page_info info;

	fill_contig_page_info(zone, order, &info);
	if (info.free_pages == 0)
		return 0;

	return div_u64((info.free_pages -
			(info.free_blocks_suitable << order)) * 100,
			info.free_pages);
}
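
/*
 * Illustrative example (editorial addition, not part of the original
 * source): with order = 3, free_pages = 1000 and free_blocks_suitable = 10,
 * external fragmentation is (1000 - 10 * 8) * 100 / 1000 = 92%.
 */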

/* Same as __fragmentation index but allocs contig_page_info on stack */
int fragmentation_index(struct zone *zone, unsigned int order)
{
	struct contig_page_info info;

	fill_contig_page_info(zone, order, &info);
	return __fragmentation_index(order, &info);
}
#endif

#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \
    defined(CONFIG_NUMA) || defined(CONFIG_MEMCG)
#ifdef CONFIG_ZONE_DMA
#define TEXT_FOR_DMA(xx) xx "_dma",
#else
#define TEXT_FOR_DMA(xx)
#endif

#ifdef CONFIG_ZONE_DMA32
#define TEXT_FOR_DMA32(xx) xx "_dma32",
#else
#define TEXT_FOR_DMA32(xx)
#endif

#ifdef CONFIG_HIGHMEM
#define TEXT_FOR_HIGHMEM(xx) xx "_high",
#else
#define TEXT_FOR_HIGHMEM(xx)
#endif

#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
					TEXT_FOR_HIGHMEM(xx) xx "_movable",

const char * const vmstat_text[] = {
	/* enum zone_stat_item counters */
	"nr_free_pages",
	"nr_zone_inactive_anon",
	"nr_zone_active_anon",
	"nr_zone_inactive_file",
	"nr_zone_active_file",
	"nr_zone_unevictable",
	"nr_zone_write_pending",
	"nr_mlock",
	"nr_bounce",
#if IS_ENABLED(CONFIG_ZSMALLOC)
	"nr_zspages",
#endif
	"nr_free_cma",

	/* enum numa_stat_item counters */
#ifdef CONFIG_NUMA
	"numa_hit",
	"numa_miss",
	"numa_foreign",
	"numa_interleave",
	"numa_local",
	"numa_other",
#endif

	/* enum node_stat_item counters */
	"nr_inactive_anon",
	"nr_active_anon",
	"nr_inactive_file",
	"nr_active_file",
	"nr_unevictable",
	"nr_slab_reclaimable",
	"nr_slab_unreclaimable",
	"nr_isolated_anon",
	"nr_isolated_file",
	"workingset_nodes",
	"workingset_refault_anon",
	"workingset_refault_file",
	"workingset_activate_anon",
	"workingset_activate_file",
	"workingset_restore_anon",
	"workingset_restore_file",
	"workingset_nodereclaim",
	"nr_anon_pages",
	"nr_mapped",
	"nr_file_pages",
	"nr_dirty",
	"nr_writeback",
	"nr_writeback_temp",
	"nr_shmem",
	"nr_shmem_hugepages",
	"nr_shmem_pmdmapped",
	"nr_file_hugepages",
	"nr_file_pmdmapped",
	"nr_anon_transparent_hugepages",
	"nr_vmscan_write",
	"nr_vmscan_immediate_reclaim",
	"nr_dirtied",
	"nr_written",
	"nr_kernel_misc_reclaimable",
	"nr_foll_pin_acquired",
	"nr_foll_pin_released",
	"nr_kernel_stack",
#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
	"nr_shadow_call_stack",
#endif
	"nr_page_table_pages",
#ifdef CONFIG_SWAP
	"nr_swapcached",
#endif

	/* enum writeback_stat_item counters */
	"nr_dirty_threshold",
	"nr_dirty_background_threshold",

#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
	/* enum vm_event_item counters */
	"pgpgin",
	"pgpgout",
	"pswpin",
	"pswpout",

	TEXTS_FOR_ZONES("pgalloc")
	TEXTS_FOR_ZONES("allocstall")
	TEXTS_FOR_ZONES("pgskip")

	"pgfree",
	"pgactivate",
	"pgdeactivate",
	"pglazyfree",

	"pgfault",
	"pgmajfault",
	"pglazyfreed",

	"pgrefill",
	"pgreuse",
	"pgsteal_kswapd",
	"pgsteal_direct",
	"pgscan_kswapd",
	"pgscan_direct",
	"pgscan_direct_throttle",
	"pgscan_anon",
	"pgscan_file",
	"pgsteal_anon",
	"pgsteal_file",

#ifdef CONFIG_NUMA
	"zone_reclaim_failed",
#endif
	"pginodesteal",
	"slabs_scanned",
	"kswapd_inodesteal",
	"kswapd_low_wmark_hit_quickly",
	"kswapd_high_wmark_hit_quickly",
	"pageoutrun",

	"pgrotated",

	"drop_pagecache",
	"drop_slab",
	"oom_kill",

#ifdef CONFIG_NUMA_BALANCING
	"numa_pte_updates",
	"numa_huge_pte_updates",
	"numa_hint_faults",
	"numa_hint_faults_local",
	"numa_pages_migrated",
#endif
#ifdef CONFIG_MIGRATION
	"pgmigrate_success",
	"pgmigrate_fail",
	"thp_migration_success",
	"thp_migration_fail",
	"thp_migration_split",
#endif
#ifdef CONFIG_COMPACTION
	"compact_migrate_scanned",
	"compact_free_scanned",
	"compact_isolated",
	"compact_stall",
	"compact_fail",
	"compact_success",
	"compact_daemon_wake",
	"compact_daemon_migrate_scanned",
	"compact_daemon_free_scanned",
#endif

#ifdef CONFIG_HUGETLB_PAGE
	"htlb_buddy_alloc_success",
	"htlb_buddy_alloc_fail",
#endif
	"unevictable_pgs_culled",
	"unevictable_pgs_scanned",
	"unevictable_pgs_rescued",
	"unevictable_pgs_mlocked",
	"unevictable_pgs_munlocked",
	"unevictable_pgs_cleared",
	"unevictable_pgs_stranded",

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	"thp_fault_alloc",
	"thp_fault_fallback",
	"thp_fault_fallback_charge",
	"thp_collapse_alloc",
	"thp_collapse_alloc_failed",
	"thp_file_alloc",
	"thp_file_fallback",
	"thp_file_fallback_charge",
	"thp_file_mapped",
	"thp_split_page",
	"thp_split_page_failed",
	"thp_deferred_split_page",
	"thp_split_pmd",
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
	"thp_split_pud",
#endif
	"thp_zero_page_alloc",
	"thp_zero_page_alloc_failed",
	"thp_swpout",
	"thp_swpout_fallback",
#endif
#ifdef CONFIG_MEMORY_BALLOON
	"balloon_inflate",
	"balloon_deflate",
#ifdef CONFIG_BALLOON_COMPACTION
	"balloon_migrate",
#endif
#endif /* CONFIG_MEMORY_BALLOON */
#ifdef CONFIG_DEBUG_TLBFLUSH
	"nr_tlb_remote_flush",
	"nr_tlb_remote_flush_received",
	"nr_tlb_local_flush_all",
	"nr_tlb_local_flush_one",
#endif /* CONFIG_DEBUG_TLBFLUSH */

#ifdef CONFIG_DEBUG_VM_VMACACHE
	"vmacache_find_calls",
	"vmacache_find_hits",
#endif
#ifdef CONFIG_SWAP
	"swap_ra",
	"swap_ra_hit",
#endif
#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
};
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */

#if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
     defined(CONFIG_PROC_FS)
static void *frag_start(struct seq_file *m, loff_t *pos)
{
	pg_data_t *pgdat;
	loff_t node = *pos;

	for (pgdat = first_online_pgdat();
	     pgdat && node;
	     pgdat = next_online_pgdat(pgdat))
		--node;

	return pgdat;
}

static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	(*pos)++;
	return next_online_pgdat(pgdat);
}

static void frag_stop(struct seq_file *m, void *arg)
{
}

/*
 * Walk zones in a node and print using a callback.
 * If @assert_populated is true, only use callback for zones that are populated.
 */
static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
		bool assert_populated, bool nolock,
		void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
{
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;
	unsigned long flags;

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
		if (assert_populated && !populated_zone(zone))
			continue;

		if (!nolock)
			spin_lock_irqsave(&zone->lock, flags);
		print(m, pgdat, zone);
		if (!nolock)
			spin_unlock_irqrestore(&zone->lock, flags);
	}
}
#endif

#ifdef CONFIG_PROC_FS
static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
						struct zone *zone)
{
	int order;

	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
	for (order = 0; order < MAX_ORDER; ++order)
		seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
	seq_putc(m, '\n');
}

/*
 * This walks the free areas for each zone.
 */
static int frag_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	walk_zones_in_node(m, pgdat, true, false, frag_show_print);
	return 0;
}

static void pagetypeinfo_showfree_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	int order, mtype;

	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
		seq_printf(m, "Node %4d, zone %8s, type %12s ",
					pgdat->node_id,
					zone->name,
					migratetype_names[mtype]);
		for (order = 0; order < MAX_ORDER; ++order) {
			unsigned long freecount = 0;
			struct free_area *area;
			struct list_head *curr;
			bool overflow = false;

			area = &(zone->free_area[order]);

			list_for_each(curr, &area->free_list[mtype]) {
				/*
				 * Cap the free_list iteration because it might
				 * be really large and we are under a spinlock
				 * so a long time spent here could trigger a
				 * hard lockup detector. Anyway this is a
				 * debugging tool so knowing there is a handful
				 * of pages of this order should be more than
				 * sufficient.
				 */
				if (++freecount >= 100000) {
					overflow = true;
					break;
				}
			}
			seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount);
			spin_unlock_irq(&zone->lock);
			cond_resched();
			spin_lock_irq(&zone->lock);
		}
		seq_putc(m, '\n');
	}
}

/* Print out the free pages at each order for each migratetype */
static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
{
	int order;
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* Print header */
	seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
	for (order = 0; order < MAX_ORDER; ++order)
		seq_printf(m, "%6d ", order);
	seq_putc(m, '\n');

	walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);

	return 0;
}

static void pagetypeinfo_showblockcount_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	int mtype;
	unsigned long pfn;
	unsigned long start_pfn = zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	unsigned long count[MIGRATE_TYPES] = { 0, };

	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
		struct page *page;

		page = pfn_to_online_page(pfn);
		if (!page)
			continue;

		if (page_zone(page) != zone)
			continue;

		mtype = get_pageblock_migratetype(page);

		if (mtype < MIGRATE_TYPES)
			count[mtype]++;
	}

	/* Print counts */
	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
		seq_printf(m, "%12lu ", count[mtype]);
	seq_putc(m, '\n');
}

/* Print out the number of pageblocks for each migratetype */
static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
{
	int mtype;
	pg_data_t *pgdat = (pg_data_t *)arg;

	seq_printf(m, "\n%-23s", "Number of blocks type ");
	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
		seq_printf(m, "%12s ", migratetype_names[mtype]);
	seq_putc(m, '\n');
	walk_zones_in_node(m, pgdat, true, false,
		pagetypeinfo_showblockcount_print);

	return 0;
}

/*
 * Print out the number of pageblocks for each migratetype that contain pages
 * of other types. This gives an indication of how well fallbacks are being
 * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
 * to determine what is going on
 */
static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
{
#ifdef CONFIG_PAGE_OWNER
	int mtype;

	if (!static_branch_unlikely(&page_owner_inited))
		return;

	drain_all_pages(NULL);

	seq_printf(m, "\n%-23s", "Number of mixed blocks ");
	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
		seq_printf(m, "%12s ", migratetype_names[mtype]);
	seq_putc(m, '\n');

	walk_zones_in_node(m, pgdat, true, true,
		pagetypeinfo_showmixedcount_print);
#endif /* CONFIG_PAGE_OWNER */
}

/*
 * This prints out statistics in relation to grouping pages by mobility.
 * It is expensive to collect so do not constantly read the file.
 */
static int pagetypeinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* check memoryless node */
	if (!node_state(pgdat->node_id, N_MEMORY))
		return 0;

	seq_printf(m, "Page block order: %d\n", pageblock_order);
	seq_printf(m, "Pages per block:  %lu\n", pageblock_nr_pages);
	seq_putc(m, '\n');
	pagetypeinfo_showfree(m, pgdat);
	pagetypeinfo_showblockcount(m, pgdat);
	pagetypeinfo_showmixedcount(m, pgdat);

	return 0;
}

static const struct seq_operations fragmentation_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= frag_show,
};

static const struct seq_operations pagetypeinfo_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= pagetypeinfo_show,
};

static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
{
	int zid;

	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
		struct zone *compare = &pgdat->node_zones[zid];

		if (populated_zone(compare))
			return zone == compare;
	}

	return false;
}

static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
							struct zone *zone)
{
	int i;
	seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
	if (is_zone_first_populated(pgdat, zone)) {
		seq_printf(m, "\n  per-node stats");
		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
			unsigned long pages = node_page_state_pages(pgdat, i);

			if (vmstat_item_print_in_thp(i))
				pages /= HPAGE_PMD_NR;
			seq_printf(m, "\n      %-12s %lu", node_stat_name(i),
				   pages);
		}
	}
	seq_printf(m,
		   "\n  pages free     %lu"
		   "\n        min      %lu"
		   "\n        low      %lu"
		   "\n        high     %lu"
		   "\n        spanned  %lu"
		   "\n        present  %lu"
		   "\n        managed  %lu"
		   "\n        cma      %lu",
		   zone_page_state(zone, NR_FREE_PAGES),
		   min_wmark_pages(zone),
		   low_wmark_pages(zone),
		   high_wmark_pages(zone),
		   zone->spanned_pages,
		   zone->present_pages,
		   zone_managed_pages(zone),
		   zone_cma_pages(zone));

	seq_printf(m,
		   "\n        protection: (%ld",
		   zone->lowmem_reserve[0]);
	for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
		seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
	seq_putc(m, ')');

	/* If unpopulated, no other information is useful */
	if (!populated_zone(zone)) {
		seq_putc(m, '\n');
		return;
	}

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		seq_printf(m, "\n      %-12s %lu", zone_stat_name(i),
			   zone_page_state(zone, i));

#ifdef CONFIG_NUMA
	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
		seq_printf(m, "\n      %-12s %lu", numa_stat_name(i),
			   zone_numa_state_snapshot(zone, i));
#endif

	seq_printf(m, "\n  pagesets");
	for_each_online_cpu(i) {
		struct per_cpu_pageset *pageset;

		pageset = per_cpu_ptr(zone->pageset, i);
		seq_printf(m,
			   "\n    cpu: %i"
			   "\n              count: %i"
			   "\n              high:  %i"
			   "\n              batch: %i",
			   i,
			   pageset->pcp.count,
			   pageset->pcp.high,
			   pageset->pcp.batch);
#ifdef CONFIG_SMP
		seq_printf(m, "\n  vm stats threshold: %d",
				pageset->stat_threshold);
#endif
	}
	seq_printf(m,
		   "\n  node_unreclaimable:  %u"
		   "\n  start_pfn:           %lu",
		   pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
		   zone->zone_start_pfn);
	seq_putc(m, '\n');
}

/*
 * Output information about zones in @pgdat.  All zones are printed regardless
 * of whether they are populated or not: lowmem_reserve_ratio operates on the
 * set of all zones and userspace would not be aware of such zones if they are
 * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
 */
static int zoneinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print);
	return 0;
}

static const struct seq_operations zoneinfo_op = {
	.start	= frag_start, /* iterate over all zones. The same as in
			       * fragmentation. */
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= zoneinfo_show,
};

#define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \
			 NR_VM_NUMA_STAT_ITEMS + \
			 NR_VM_NODE_STAT_ITEMS + \
			 NR_VM_WRITEBACK_STAT_ITEMS + \
			 (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \
			  NR_VM_EVENT_ITEMS : 0))

static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
	unsigned long *v;
	int i;

	if (*pos >= NR_VMSTAT_ITEMS)
		return NULL;

	BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
	v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
	m->private = v;
	if (!v)
		return ERR_PTR(-ENOMEM);
	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		v[i] = global_zone_page_state(i);
	v += NR_VM_ZONE_STAT_ITEMS;

#ifdef CONFIG_NUMA
	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
		v[i] = global_numa_state(i);
	v += NR_VM_NUMA_STAT_ITEMS;
#endif

	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
		v[i] = global_node_page_state_pages(i);
		if (vmstat_item_print_in_thp(i))
			v[i] /= HPAGE_PMD_NR;
	}
	v += NR_VM_NODE_STAT_ITEMS;

	global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
			    v + NR_DIRTY_THRESHOLD);
	v += NR_VM_WRITEBACK_STAT_ITEMS;

#ifdef CONFIG_VM_EVENT_COUNTERS
	all_vm_events(v);
	v[PGPGIN] /= 2;		/* sectors -> kbytes */
	v[PGPGOUT] /= 2;
#endif
	return (unsigned long *)m->private + *pos;
}

static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
	(*pos)++;
	if (*pos >= NR_VMSTAT_ITEMS)
		return NULL;
	return (unsigned long *)m->private + *pos;
}

static int vmstat_show(struct seq_file *m, void *arg)
{
	unsigned long *l = arg;
	unsigned long off = l - (unsigned long *)m->private;

	seq_puts(m, vmstat_text[off]);
	seq_put_decimal_ull(m, " ", *l);
	seq_putc(m, '\n');

	if (off == NR_VMSTAT_ITEMS - 1) {
		/*
		 * We've come to the end - add any deprecated counters to avoid
		 * breaking userspace which might depend on them being present.
		 */
		seq_puts(m, "nr_unstable 0\n");
	}
	return 0;
}

static void vmstat_stop(struct seq_file *m, void *arg)
{
	kfree(m->private);
	m->private = NULL;
}

static const struct seq_operations vmstat_op = {
	.start	= vmstat_start,
	.next	= vmstat_next,
	.stop	= vmstat_stop,
	.show	= vmstat_show,
};
#endif /* CONFIG_PROC_FS */
1823
df9ecaba 1824#ifdef CONFIG_SMP
d1187ed2 1825static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
77461ab3 1826int sysctl_stat_interval __read_mostly = HZ;
d1187ed2 1827
52b6f46b
HD
1828#ifdef CONFIG_PROC_FS
1829static void refresh_vm_stats(struct work_struct *work)
1830{
1831 refresh_cpu_vm_stats(true);
1832}
1833
1834int vmstat_refresh(struct ctl_table *table, int write,
32927393 1835 void *buffer, size_t *lenp, loff_t *ppos)
52b6f46b
HD
1836{
1837 long val;
1838 int err;
1839 int i;
1840
1841 /*
1842 * The regular update, every sysctl_stat_interval, may come later
1843 * than expected: leaving a significant amount in per_cpu buckets.
1844 * This is particularly misleading when checking a quantity of HUGE
1845 * pages, immediately after running a test. /proc/sys/vm/stat_refresh,
1846 * which can equally be echo'ed to or cat'ted from (by root),
1847 * can be used to update the stats just before reading them.
1848 *
c41f012a 1849 * Also, since global_zone_page_state() etc. are careful to hide
52b6f46b
HD
1850 * transiently negative values, report an error here if any of
1851 * the stats is negative, so we know to go looking for imbalance.
1852 */
1853 err = schedule_on_each_cpu(refresh_vm_stats);
1854 if (err)
1855 return err;
1856 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
75ef7184 1857 val = atomic_long_read(&vm_zone_stat[i]);
52b6f46b 1858 if (val < 0) {
c822f622 1859 pr_warn("%s: %s %ld\n",
9d7ea9a2 1860 __func__, zone_stat_name(i), val);
c822f622 1861 err = -EINVAL;
52b6f46b
HD
1862 }
1863 }
3a321d2a
KW
1864#ifdef CONFIG_NUMA
1865 for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
1866 val = atomic_long_read(&vm_numa_stat[i]);
1867 if (val < 0) {
1868 pr_warn("%s: %s %ld\n",
9d7ea9a2 1869 __func__, numa_stat_name(i), val);
3a321d2a
KW
1870 err = -EINVAL;
1871 }
1872 }
1873#endif
52b6f46b
HD
1874 if (err)
1875 return err;
1876 if (write)
1877 *ppos += *lenp;
1878 else
1879 *lenp = 0;
1880 return 0;
1881}
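
/*
 * Illustrative use of the interface above from userspace (root), e.g. right
 * before sampling counters in a test:
 *
 *	echo 1 > /proc/sys/vm/stat_refresh
 *	grep nr_free_pages /proc/vmstat
 *
 * Either writing to or reading from stat_refresh folds the per-cpu diffs;
 * it fails with -EINVAL if any global counter has gone negative.
 */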
1882#endif /* CONFIG_PROC_FS */
1883
d1187ed2
CL
1884static void vmstat_update(struct work_struct *w)
1885{
0eb77e98 1886 if (refresh_cpu_vm_stats(true)) {
7cc36bbd
CL
1887 /*
1888 * Counters were updated so we expect more updates
1889 * to occur in the future. Keep on running the
1890 * update worker thread.
1891 */
ce612879 1892 queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
f01f17d3
MH
1893 this_cpu_ptr(&vmstat_work),
1894 round_jiffies_relative(sysctl_stat_interval));
7cc36bbd
CL
1895 }
1896}
1897
7cc36bbd
CL
1903/*
1904 * Check if the diffs for a certain cpu indicate that
1905 * an update is needed.
1906 */
1907static bool need_update(int cpu)
1908{
2bbd00ae 1909 pg_data_t *last_pgdat = NULL;
7cc36bbd
CL
1910 struct zone *zone;
1911
1912 for_each_populated_zone(zone) {
1913 struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
2bbd00ae 1914 struct per_cpu_nodestat *n;
7cc36bbd
CL
1915 /*
1916 * The fast way of checking if there are any vmstat diffs.
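 * memchr_inv() returns NULL only when every byte in the checked range is zero.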
7cc36bbd 1917 */
13c9aaf7
JH
1918 if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS *
1919 sizeof(p->vm_stat_diff[0])))
7cc36bbd 1920 return true;
3a321d2a 1921#ifdef CONFIG_NUMA
13c9aaf7
JH
1922 if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS *
1923 sizeof(p->vm_numa_stat_diff[0])))
3a321d2a
KW
1924 return true;
1925#endif
2bbd00ae
JW
1926 if (last_pgdat == zone->zone_pgdat)
1927 continue;
1928 last_pgdat = zone->zone_pgdat;
1929 n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu);
1930 if (memchr_inv(n->vm_node_stat_diff, 0, NR_VM_NODE_STAT_ITEMS *
1931 sizeof(n->vm_node_stat_diff[0])))
1932 return true;
7cc36bbd
CL
1933 }
1934 return false;
1935}
1936
7b8da4c7
CL
1937/*
1938 * Switch off vmstat processing and then fold all the remaining differentials
1939 * until the diffs stay at zero. The function is used by NOHZ and can only be
1940 * invoked when tick processing is not active.
1941 */
f01f17d3
MH
1942void quiet_vmstat(void)
1943{
1944 if (system_state != SYSTEM_RUNNING)
1945 return;
1946
7b8da4c7 1947 if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
f01f17d3
MH
1948 return;
1949
1950 if (!need_update(smp_processor_id()))
1951 return;
1952
1953 /*
1954 * Just refresh counters and do not care about the pending delayed
 1955 * vmstat_update. It doesn't fire often enough to matter, and canceling
 1956 * it would be too expensive from this path.
 1957 * vmstat_shepherd will take care of that for us.
1958 */
1959 refresh_cpu_vm_stats(false);
1960}
1961
7cc36bbd
CL
1962/*
 1963 * Shepherd worker thread that checks the
 1964 * differentials of processors whose vmstat update
 1965 * worker threads have been disabled because of
 1966 * inactivity.
 1967 */
1968static void vmstat_shepherd(struct work_struct *w);
1969
0eb77e98 1970static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
7cc36bbd
CL
1971
1972static void vmstat_shepherd(struct work_struct *w)
1973{
1974 int cpu;
1975
1976 get_online_cpus();
1977 /* Check processors whose vmstat worker threads have been disabled */
7b8da4c7 1978 for_each_online_cpu(cpu) {
f01f17d3 1979 struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
7cc36bbd 1980
7b8da4c7 1981 if (!delayed_work_pending(dw) && need_update(cpu))
ce612879 1982 queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
fbcc8183
JB
1983
1984 cond_resched();
f01f17d3 1985 }
7cc36bbd
CL
1986 put_online_cpus();
1987
1988 schedule_delayed_work(&shepherd,
98f4ebb2 1989 round_jiffies_relative(sysctl_stat_interval));
d1187ed2
CL
1990}
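
/*
 * Illustrative flow of the update machinery above (no additional code):
 *
 *	vmstat_shepherd			runs every sysctl_stat_interval
 *	  -> need_update(cpu) and no vmstat_work pending on that cpu
 *	  -> queue vmstat_work there with zero delay
 *	vmstat_update			runs on the target cpu
 *	  -> refresh_cpu_vm_stats(true) folded something: re-arm itself
 *	     for another interval
 *	  -> nothing folded: let the work lapse; the shepherd requeues it
 *	     once diffs accumulate again
 */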
1991
7cc36bbd 1992static void __init start_shepherd_timer(void)
d1187ed2 1993{
7cc36bbd
CL
1994 int cpu;
1995
1996 for_each_possible_cpu(cpu)
ccde8bd4 1997 INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
7cc36bbd
CL
1998 vmstat_update);
1999
7cc36bbd
CL
2000 schedule_delayed_work(&shepherd,
2001 round_jiffies_relative(sysctl_stat_interval));
d1187ed2
CL
2002}
2003
03e86dba
TC
2004static void __init init_cpu_node_state(void)
2005{
4c501327 2006 int node;
03e86dba 2007
4c501327
SAS
2008 for_each_online_node(node) {
2009 if (cpumask_weight(cpumask_of_node(node)) > 0)
2010 node_set_state(node, N_CPU);
2011 }
03e86dba
TC
2012}
2013
5438da97
SAS
2014static int vmstat_cpu_online(unsigned int cpu)
2015{
2016 refresh_zone_stat_thresholds();
2017 node_set_state(cpu_to_node(cpu), N_CPU);
2018 return 0;
2019}
2020
2021static int vmstat_cpu_down_prep(unsigned int cpu)
2022{
2023 cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
2024 return 0;
2025}
2026
2027static int vmstat_cpu_dead(unsigned int cpu)
807a1bd2 2028{
4c501327 2029 const struct cpumask *node_cpus;
5438da97 2030 int node;
807a1bd2 2031
5438da97
SAS
2032 node = cpu_to_node(cpu);
2033
2034 refresh_zone_stat_thresholds();
4c501327
SAS
2035 node_cpus = cpumask_of_node(node);
2036 if (cpumask_weight(node_cpus) > 0)
5438da97 2037 return 0;
807a1bd2
TK
2038
2039 node_clear_state(node, N_CPU);
5438da97 2040 return 0;
807a1bd2
TK
2041}
2042
8f32f7e5 2043#endif
df9ecaba 2044
ce612879
MH
2045struct workqueue_struct *mm_percpu_wq;
2046
597b7305 2047void __init init_mm_internals(void)
df9ecaba 2048{
ce612879 2049 int ret __maybe_unused;
5438da97 2050
80d136e1 2051 mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0);
ce612879
MH
2052
2053#ifdef CONFIG_SMP
5438da97
SAS
2054 ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
2055 NULL, vmstat_cpu_dead);
2056 if (ret < 0)
2057 pr_err("vmstat: failed to register 'dead' hotplug state\n");
2058
2059 ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online",
2060 vmstat_cpu_online,
2061 vmstat_cpu_down_prep);
2062 if (ret < 0)
2063 pr_err("vmstat: failed to register 'online' hotplug state\n");
2064
2065 get_online_cpus();
03e86dba 2066 init_cpu_node_state();
5438da97 2067 put_online_cpus();
d1187ed2 2068
7cc36bbd 2069 start_shepherd_timer();
8f32f7e5
AD
2070#endif
2071#ifdef CONFIG_PROC_FS
fddda2b7 2072 proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
abaed011 2073 proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
fddda2b7
CH
2074 proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
2075 proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
8f32f7e5 2076#endif
df9ecaba 2077}
d7a5752c
MG
2078
2079#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
d7a5752c
MG
2080
2081/*
2082 * Return an index indicating how much of the available free memory is
2083 * unusable for an allocation of the requested size.
2084 */
2085static int unusable_free_index(unsigned int order,
2086 struct contig_page_info *info)
2087{
 2088 /* No free memory is interpreted as all free memory being unusable */
2089 if (info->free_pages == 0)
2090 return 1000;
2091
2092 /*
 2093 * The index is conceptually a value between 0 and 1; it is returned
 2094 * here as an integer scaled by 1000, i.e. to 3 decimal places.
2095 *
2096 * 0 => no fragmentation
2097 * 1 => high fragmentation
2098 */
2099 return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
2100
2101}
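
/*
 * Worked example with made-up numbers: for order = 4, free_pages = 1000 and
 * free_blocks_suitable = 10, the suitable blocks cover 10 << 4 = 160 pages,
 * so the index is (1000 - 160) * 1000 / 1000 = 840, shown as 0.840 below,
 * i.e. 84% of the free memory is unusable for an order-4 allocation.
 */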
2102
2103static void unusable_show_print(struct seq_file *m,
2104 pg_data_t *pgdat, struct zone *zone)
2105{
2106 unsigned int order;
2107 int index;
2108 struct contig_page_info info;
2109
2110 seq_printf(m, "Node %d, zone %8s ",
2111 pgdat->node_id,
2112 zone->name);
2113 for (order = 0; order < MAX_ORDER; ++order) {
2114 fill_contig_page_info(zone, order, &info);
2115 index = unusable_free_index(order, &info);
2116 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
2117 }
2118
2119 seq_putc(m, '\n');
2120}
2121
2122/*
2123 * Display unusable free space index
2124 *
2125 * The unusable free space index measures how much of the available free
2126 * memory cannot be used to satisfy an allocation of a given size and is a
 2127 * value between 0 and 1. The higher the value, the more of the free memory is
2128 * unusable and by implication, the worse the external fragmentation is. This
2129 * can be expressed as a percentage by multiplying by 100.
2130 */
2131static int unusable_show(struct seq_file *m, void *arg)
2132{
2133 pg_data_t *pgdat = (pg_data_t *)arg;
2134
2135 /* check memoryless node */
a47b53c5 2136 if (!node_state(pgdat->node_id, N_MEMORY))
d7a5752c
MG
2137 return 0;
2138
727c080f 2139 walk_zones_in_node(m, pgdat, true, false, unusable_show_print);
d7a5752c
MG
2140
2141 return 0;
2142}
2143
01a99560 2144static const struct seq_operations unusable_sops = {
d7a5752c
MG
2145 .start = frag_start,
2146 .next = frag_next,
2147 .stop = frag_stop,
2148 .show = unusable_show,
2149};
2150
01a99560 2151DEFINE_SEQ_ATTRIBUTE(unusable);
d7a5752c 2152
f1a5ab12
MG
2153static void extfrag_show_print(struct seq_file *m,
2154 pg_data_t *pgdat, struct zone *zone)
2155{
2156 unsigned int order;
2157 int index;
2158
2159 /* Alloc on stack as interrupts are disabled for zone walk */
2160 struct contig_page_info info;
2161
2162 seq_printf(m, "Node %d, zone %8s ",
2163 pgdat->node_id,
2164 zone->name);
2165 for (order = 0; order < MAX_ORDER; ++order) {
2166 fill_contig_page_info(zone, order, &info);
56de7263 2167 index = __fragmentation_index(order, &info);
f1a5ab12
MG
2168 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
2169 }
2170
2171 seq_putc(m, '\n');
2172}
2173
2174/*
 2175 * Display the fragmentation index for orders at which allocations would fail
2176 */
2177static int extfrag_show(struct seq_file *m, void *arg)
2178{
2179 pg_data_t *pgdat = (pg_data_t *)arg;
2180
727c080f 2181 walk_zones_in_node(m, pgdat, true, false, extfrag_show_print);
f1a5ab12
MG
2182
2183 return 0;
2184}
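
/*
 * Reading the values: a negative index (printed as -1.000) means an
 * allocation of that order would currently succeed, so no fragmentation
 * index is meaningful; otherwise values near 0.000 indicate failure mostly
 * due to lack of free memory and values near 1.000 indicate failure mostly
 * due to external fragmentation (per the semantics of
 * __fragmentation_index()).
 */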
2185
01a99560 2186static const struct seq_operations extfrag_sops = {
f1a5ab12
MG
2187 .start = frag_start,
2188 .next = frag_next,
2189 .stop = frag_stop,
2190 .show = extfrag_show,
2191};
2192
01a99560 2193DEFINE_SEQ_ATTRIBUTE(extfrag);
f1a5ab12 2194
d7a5752c
MG
2195static int __init extfrag_debug_init(void)
2196{
bde8bd8a
S
2197 struct dentry *extfrag_debug_root;
2198
d7a5752c 2199 extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
d7a5752c 2200
d9f7979c 2201 debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL,
01a99560 2202 &unusable_fops);
d7a5752c 2203
d9f7979c 2204 debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL,
01a99560 2205 &extfrag_fops);
f1a5ab12 2206
d7a5752c
MG
2207 return 0;
2208}
2209
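/*
 * With debugfs mounted in its usual place this exposes
 * /sys/kernel/debug/extfrag/unusable_index and
 * /sys/kernel/debug/extfrag/extfrag_index.
 */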
2210module_init(extfrag_debug_init);
2211#endif