// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/vmstat.c
 *
 *  Manages VM statistics
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  zoned VM statistics
 *  Copyright (C) 2006 Silicon Graphics, Inc.,
 *		Christoph Lameter <christoph@lameter.com>
 *  Copyright (C) 2008-2014 Christoph Lameter
 */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/vmstat.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/sched.h>
#include <linux/math64.h>
#include <linux/writeback.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
#include <linux/page_ext.h>
#include <linux/page_owner.h>
#include <linux/sched/isolation.h>

#include "internal.h"

#ifdef CONFIG_NUMA
int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;

/* zero numa counters within a zone */
static void zero_zone_numa_counters(struct zone *zone)
{
	int item, cpu;

	for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) {
		atomic_long_set(&zone->vm_numa_event[item], 0);
		for_each_online_cpu(cpu) {
			per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item]
						= 0;
		}
	}
}

/* zero numa counters of all the populated zones */
static void zero_zones_numa_counters(void)
{
	struct zone *zone;

	for_each_populated_zone(zone)
		zero_zone_numa_counters(zone);
}

/* zero global numa counters */
static void zero_global_numa_counters(void)
{
	int item;

	for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
		atomic_long_set(&vm_numa_event[item], 0);
}

static void invalid_numa_statistics(void)
{
	zero_zones_numa_counters();
	zero_global_numa_counters();
}

static DEFINE_MUTEX(vm_numa_stat_lock);

int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
		void *buffer, size_t *length, loff_t *ppos)
{
	int ret, oldval;

	mutex_lock(&vm_numa_stat_lock);
	if (write)
		oldval = sysctl_vm_numa_stat;
	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (ret || !write)
		goto out;

	if (oldval == sysctl_vm_numa_stat)
		goto out;
	else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
		static_branch_enable(&vm_numa_stat_key);
		pr_info("enable numa statistics\n");
	} else {
		static_branch_disable(&vm_numa_stat_key);
		invalid_numa_statistics();
		pr_info("disable numa statistics, and clear numa counters\n");
	}

out:
	mutex_unlock(&vm_numa_stat_lock);
	return ret;
}
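
/*
 * Example (illustrative, not part of the upstream source): the handler above
 * backs the vm.numa_stat sysctl, so NUMA event collection can be toggled at
 * runtime, e.g.:
 *
 *	echo 0 > /proc/sys/vm/numa_stat		# disable and clear counters
 *	echo 1 > /proc/sys/vm/numa_stat		# re-enable collection
 */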
#endif

#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);

static void sum_vm_events(unsigned long *ret)
{
	int cpu;
	int i;

	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));

	for_each_online_cpu(cpu) {
		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);

		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
			ret[i] += this->event[i];
	}
}

/*
 * Accumulate the vm event counters across all CPUs.
 * The result is unavoidably approximate - it can change
 * during and after execution of this function.
 */
void all_vm_events(unsigned long *ret)
{
	cpus_read_lock();
	sum_vm_events(ret);
	cpus_read_unlock();
}
EXPORT_SYMBOL_GPL(all_vm_events);

/*
 * Fold the foreign cpu events into our own.
 *
 * This is adding to the events on one processor
 * but keeps the global counts constant.
 */
void vm_events_fold_cpu(int cpu)
{
	struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
	int i;

	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
		count_vm_events(i, fold_state->event[i]);
		fold_state->event[i] = 0;
	}
}

#endif /* CONFIG_VM_EVENT_COUNTERS */

/*
 * Manage combined zone based / global counters
 *
 * vm_stat contains the global counters
 */
atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
EXPORT_SYMBOL(vm_zone_stat);
EXPORT_SYMBOL(vm_node_stat);

#ifdef CONFIG_NUMA
static void fold_vm_zone_numa_events(struct zone *zone)
{
	unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
	int cpu;
	enum numa_stat_item item;

	for_each_online_cpu(cpu) {
		struct per_cpu_zonestat *pzstats;

		pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
		for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
			zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
	}

	for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
		zone_numa_event_add(zone_numa_events[item], zone, item);
}

void fold_vm_numa_events(void)
{
	struct zone *zone;

	for_each_populated_zone(zone)
		fold_vm_zone_numa_events(zone);
}
#endif

#ifdef CONFIG_SMP

int calculate_pressure_threshold(struct zone *zone)
{
	int threshold;
	int watermark_distance;

	/*
	 * As vmstats are not up to date, there is drift between the estimated
	 * and real values. For high thresholds and a high number of CPUs, it
	 * is possible for the min watermark to be breached while the estimated
	 * value looks fine. The pressure threshold is a reduced value such
	 * that even the maximum amount of drift will not accidentally breach
	 * the min watermark
	 */
	watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
	threshold = max(1, (int)(watermark_distance / num_online_cpus()));

	/*
	 * Maximum threshold is 125
	 */
	threshold = min(125, threshold);

	return threshold;
}
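
/*
 * Worked example (illustrative): with a low-to-min watermark gap of 1024
 * pages and 8 online CPUs, watermark_distance / num_online_cpus() = 128,
 * which the cap above reduces to the maximum threshold of 125.
 */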

int calculate_normal_threshold(struct zone *zone)
{
	int threshold;
	int mem;	/* memory in 128 MB units */

	/*
	 * The threshold scales with the number of processors and the amount
	 * of memory per zone. More memory means that we can defer updates for
	 * longer, more processors could lead to more contention.
	 * fls() is used to have a cheap way of logarithmic scaling.
	 *
	 * Some sample thresholds:
	 *
	 * Threshold	Processors	(fls)	Zonesize	fls(mem)+1
	 * ------------------------------------------------------------------
	 * 8		1		1	0.9-1 GB	4
	 * 16		2		2	0.9-1 GB	4
	 * 20		2		2	1-2 GB		5
	 * 24		2		2	2-4 GB		6
	 * 28		2		2	4-8 GB		7
	 * 32		2		2	8-16 GB		8
	 * 4		2		2	<128M		1
	 * 30		4		3	2-4 GB		5
	 * 48		4		3	8-16 GB		8
	 * 32		8		4	1-2 GB		4
	 * 32		8		4	0.9-1GB		4
	 * 10		16		5	<128M		1
	 * 40		16		5	900M		4
	 * 70		64		7	2-4 GB		5
	 * 84		64		7	4-8 GB		6
	 * 108		512		9	4-8 GB		6
	 * 125		1024		10	8-16 GB		8
	 * 125		1024		10	16-32 GB	9
	 */

	mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT);

	threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));

	/*
	 * Maximum threshold is 125
	 */
	threshold = min(125, threshold);

	return threshold;
}

/*
 * Refresh the thresholds for each zone.
 */
void refresh_zone_stat_thresholds(void)
{
	struct pglist_data *pgdat;
	struct zone *zone;
	int cpu;
	int threshold;

	/* Zero current pgdat thresholds */
	for_each_online_pgdat(pgdat) {
		for_each_online_cpu(cpu) {
			per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
		}
	}

	for_each_populated_zone(zone) {
		struct pglist_data *pgdat = zone->zone_pgdat;
		unsigned long max_drift, tolerate_drift;

		threshold = calculate_normal_threshold(zone);

		for_each_online_cpu(cpu) {
			int pgdat_threshold;

			per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
							= threshold;

			/* Base nodestat threshold on the largest populated zone. */
			pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
			per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
				= max(threshold, pgdat_threshold);
		}

		/*
		 * Only set percpu_drift_mark if there is a danger that
		 * NR_FREE_PAGES reports the low watermark is ok when in fact
		 * the min watermark could be breached by an allocation
		 */
		tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
		max_drift = num_online_cpus() * threshold;
		if (max_drift > tolerate_drift)
			zone->percpu_drift_mark = high_wmark_pages(zone) +
					max_drift;
	}
}

void set_pgdat_percpu_threshold(pg_data_t *pgdat,
				int (*calculate_pressure)(struct zone *))
{
	struct zone *zone;
	int cpu;
	int threshold;
	int i;

	for (i = 0; i < pgdat->nr_zones; i++) {
		zone = &pgdat->node_zones[i];
		if (!zone->percpu_drift_mark)
			continue;

		threshold = (*calculate_pressure)(zone);
		for_each_online_cpu(cpu)
			per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
							= threshold;
	}
}

/*
 * For use when we know that interrupts are disabled,
 * or when we know that preemption is disabled and that
 * particular counter cannot be updated from interrupt context.
 */
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
			   long delta)
{
	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	long x;
	long t;

	/*
	 * Accurate vmstat updates require a RMW. On !PREEMPT_RT kernels,
	 * atomicity is provided by IRQs being disabled -- either explicitly
	 * or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables
	 * CPU migrations and preemption potentially corrupts a counter so
	 * disable preemption.
	 */
	preempt_disable_nested();

	x = delta + __this_cpu_read(*p);

	t = __this_cpu_read(pcp->stat_threshold);

	if (unlikely(abs(x) > t)) {
		zone_page_state_add(x, zone, item);
		x = 0;
	}
	__this_cpu_write(*p, x);

	preempt_enable_nested();
}
EXPORT_SYMBOL(__mod_zone_page_state);
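
/*
 * Example (illustrative): a caller that already runs with interrupts
 * disabled, such as the page allocator returning a high-order block to the
 * free lists, can batch its update into the per-cpu diff:
 *
 *	__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
 *
 * The global vm_zone_stat entry is only touched once the per-cpu delta
 * crosses stat_threshold.
 */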

void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
				long delta)
{
	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
	s8 __percpu *p = pcp->vm_node_stat_diff + item;
	long x;
	long t;

	if (vmstat_item_in_bytes(item)) {
		/*
		 * Only cgroups use subpage accounting right now; at
		 * the global level, these items still change in
		 * multiples of whole pages. Store them as pages
		 * internally to keep the per-cpu counters compact.
		 */
		VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
		delta >>= PAGE_SHIFT;
	}

	/* See __mod_node_page_state */
	preempt_disable_nested();

	x = delta + __this_cpu_read(*p);

	t = __this_cpu_read(pcp->stat_threshold);

	if (unlikely(abs(x) > t)) {
		node_page_state_add(x, pgdat, item);
		x = 0;
	}
	__this_cpu_write(*p, x);

	preempt_enable_nested();
}
EXPORT_SYMBOL(__mod_node_page_state);

/*
 * Optimized increment and decrement functions.
 *
 * These are only for a single page and therefore can take a struct page *
 * argument instead of struct zone *. This allows the inclusion of the code
 * generated for page_zone(page) into the optimized functions.
 *
 * No overflow check is necessary and therefore the differential can be
 * incremented or decremented in place which may allow the compilers to
 * generate better code.
 * The increment or decrement is known and therefore one boundary check can
 * be omitted.
 *
 * NOTE: These functions are very performance sensitive. Change only
 * with care.
 *
 * Some processors have inc/dec instructions that are atomic vs an interrupt.
 * However, the code must first determine the differential location in a zone
 * based on the processor number and then inc/dec the counter. There is no
 * guarantee without disabling preemption that the processor will not change
 * in between and therefore the atomicity vs. interrupt cannot be exploited
 * in a useful way here.
 */
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	s8 v, t;

	/* See __mod_node_page_state */
	preempt_disable_nested();

	v = __this_cpu_inc_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v > t)) {
		s8 overstep = t >> 1;

		zone_page_state_add(v + overstep, zone, item);
		__this_cpu_write(*p, -overstep);
	}

	preempt_enable_nested();
}
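
/*
 * Worked example (illustrative): with stat_threshold t = 125, the first
 * increment that drives the per-cpu diff to 126 folds 126 + 62 = 188 into
 * the zone counter and leaves the diff at -62, so roughly 188 further
 * increments can be absorbed locally before the next global update.
 */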

void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
	s8 __percpu *p = pcp->vm_node_stat_diff + item;
	s8 v, t;

	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));

	/* See __mod_node_page_state */
	preempt_disable_nested();

	v = __this_cpu_inc_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v > t)) {
		s8 overstep = t >> 1;

		node_page_state_add(v + overstep, pgdat, item);
		__this_cpu_write(*p, -overstep);
	}

	preempt_enable_nested();
}

void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__inc_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__inc_zone_page_state);

void __inc_node_page_state(struct page *page, enum node_stat_item item)
{
	__inc_node_state(page_pgdat(page), item);
}
EXPORT_SYMBOL(__inc_node_page_state);

void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	s8 v, t;

	/* See __mod_node_page_state */
	preempt_disable_nested();

	v = __this_cpu_dec_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v < - t)) {
		s8 overstep = t >> 1;

		zone_page_state_add(v - overstep, zone, item);
		__this_cpu_write(*p, overstep);
	}

	preempt_enable_nested();
}

void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
	s8 __percpu *p = pcp->vm_node_stat_diff + item;
	s8 v, t;

	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));

	/* See __mod_node_page_state */
	preempt_disable_nested();

	v = __this_cpu_dec_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v < - t)) {
		s8 overstep = t >> 1;

		node_page_state_add(v - overstep, pgdat, item);
		__this_cpu_write(*p, overstep);
	}

	preempt_enable_nested();
}

void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__dec_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__dec_zone_page_state);

void __dec_node_page_state(struct page *page, enum node_stat_item item)
{
	__dec_node_state(page_pgdat(page), item);
}
EXPORT_SYMBOL(__dec_node_page_state);

#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
/*
 * If we have cmpxchg_local support then we do not need to incur the overhead
 * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
 *
 * mod_state() modifies the zone counter state through atomic per cpu
 * operations.
 *
 * Overstep mode specifies how overstep should be handled:
 *        0       No overstepping
 *        1       Overstepping half of threshold
 *        -1      Overstepping minus half of threshold
 */
static inline void mod_zone_state(struct zone *zone,
	enum zone_stat_item item, long delta, int overstep_mode)
{
	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	long o, n, t, z;

	do {
		z = 0;  /* overflow to zone counters */

		/*
		 * The fetching of the stat_threshold is racy. We may apply
		 * a counter threshold to the wrong cpu if we get
		 * rescheduled while executing here. However, the next
		 * counter update will apply the threshold again and
		 * therefore bring the counter under the threshold again.
		 *
		 * Most of the time the thresholds are the same anyways
		 * for all cpus in a zone.
		 */
		t = this_cpu_read(pcp->stat_threshold);

		o = this_cpu_read(*p);
		n = delta + o;

		if (abs(n) > t) {
			int os = overstep_mode * (t >> 1);

			/* Overflow must be added to zone counters */
			z = n + os;
			n = -os;
		}
	} while (this_cpu_cmpxchg(*p, o, n) != o);

	if (z)
		zone_page_state_add(z, zone, item);
}

void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
			 long delta)
{
	mod_zone_state(zone, item, delta, 0);
}
EXPORT_SYMBOL(mod_zone_page_state);

void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	mod_zone_state(page_zone(page), item, 1, 1);
}
EXPORT_SYMBOL(inc_zone_page_state);

void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	mod_zone_state(page_zone(page), item, -1, -1);
}
EXPORT_SYMBOL(dec_zone_page_state);

static inline void mod_node_state(struct pglist_data *pgdat,
	enum node_stat_item item, int delta, int overstep_mode)
{
	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
	s8 __percpu *p = pcp->vm_node_stat_diff + item;
	long o, n, t, z;

	if (vmstat_item_in_bytes(item)) {
		/*
		 * Only cgroups use subpage accounting right now; at
		 * the global level, these items still change in
		 * multiples of whole pages. Store them as pages
		 * internally to keep the per-cpu counters compact.
		 */
		VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
		delta >>= PAGE_SHIFT;
	}

	do {
		z = 0;  /* overflow to node counters */

		/*
		 * The fetching of the stat_threshold is racy. We may apply
		 * a counter threshold to the wrong cpu if we get
		 * rescheduled while executing here. However, the next
		 * counter update will apply the threshold again and
		 * therefore bring the counter under the threshold again.
		 *
		 * Most of the time the thresholds are the same anyways
		 * for all cpus in a node.
		 */
		t = this_cpu_read(pcp->stat_threshold);

		o = this_cpu_read(*p);
		n = delta + o;

		if (abs(n) > t) {
			int os = overstep_mode * (t >> 1);

			/* Overflow must be added to node counters */
			z = n + os;
			n = -os;
		}
	} while (this_cpu_cmpxchg(*p, o, n) != o);

	if (z)
		node_page_state_add(z, pgdat, item);
}

void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
					long delta)
{
	mod_node_state(pgdat, item, delta, 0);
}
EXPORT_SYMBOL(mod_node_page_state);

void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
	mod_node_state(pgdat, item, 1, 1);
}

void inc_node_page_state(struct page *page, enum node_stat_item item)
{
	mod_node_state(page_pgdat(page), item, 1, 1);
}
EXPORT_SYMBOL(inc_node_page_state);

void dec_node_page_state(struct page *page, enum node_stat_item item)
{
	mod_node_state(page_pgdat(page), item, -1, -1);
}
EXPORT_SYMBOL(dec_node_page_state);
#else
/*
 * Use interrupt disable to serialize counter updates
 */
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
			 long delta)
{
	unsigned long flags;

	local_irq_save(flags);
	__mod_zone_page_state(zone, item, delta);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_zone_page_state);

void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;
	struct zone *zone;

	zone = page_zone(page);
	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_zone_page_state);

void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__dec_zone_page_state(page, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);

void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__inc_node_state(pgdat, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_node_state);

void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
					long delta)
{
	unsigned long flags;

	local_irq_save(flags);
	__mod_node_page_state(pgdat, item, delta);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_node_page_state);

void inc_node_page_state(struct page *page, enum node_stat_item item)
{
	unsigned long flags;
	struct pglist_data *pgdat;

	pgdat = page_pgdat(page);
	local_irq_save(flags);
	__inc_node_state(pgdat, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_node_page_state);

void dec_node_page_state(struct page *page, enum node_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__dec_node_page_state(page, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_node_page_state);
#endif

/*
 * Fold a differential into the global counters.
 * Returns the number of counters updated.
 */
static int fold_diff(int *zone_diff, int *node_diff)
{
	int i;
	int changes = 0;

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		if (zone_diff[i]) {
			atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
			changes++;
		}

	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
		if (node_diff[i]) {
			atomic_long_add(node_diff[i], &vm_node_stat[i]);
			changes++;
		}
	return changes;
}

/*
 * Update the zone counters for the current cpu.
 *
 * Note that refresh_cpu_vm_stats strives to only access
 * node local memory. The per cpu pagesets on remote zones are placed
 * in the memory local to the processor using that pageset. So the
 * loop over all zones will access a series of cachelines local to
 * the processor.
 *
 * The call to zone_page_state_add updates the cachelines with the
 * statistics in the remote zone struct as well as the global cachelines
 * with the global counters. These could cause remote node cache line
 * bouncing and will have to be only done when necessary.
 *
 * The function returns the number of global counters updated.
 */
static int refresh_cpu_vm_stats(bool do_pagesets)
{
	struct pglist_data *pgdat;
	struct zone *zone;
	int i;
	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
	int changes = 0;

	for_each_populated_zone(zone) {
		struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
#ifdef CONFIG_NUMA
		struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
#endif

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
			int v;

			v = this_cpu_xchg(pzstats->vm_stat_diff[i], 0);
			if (v) {
				atomic_long_add(v, &zone->vm_stat[i]);
				global_zone_diff[i] += v;
#ifdef CONFIG_NUMA
				/* 3 seconds idle till flush */
				__this_cpu_write(pcp->expire, 3);
#endif
			}
		}
#ifdef CONFIG_NUMA

		if (do_pagesets) {
			cond_resched();
			/*
			 * Deal with draining the remote pageset of this
			 * processor
			 *
			 * Check if there are pages remaining in this pageset
			 * if not then there is nothing to expire.
			 */
			if (!__this_cpu_read(pcp->expire) ||
			       !__this_cpu_read(pcp->count))
				continue;

			/*
			 * We never drain zones local to this processor.
			 */
			if (zone_to_nid(zone) == numa_node_id()) {
				__this_cpu_write(pcp->expire, 0);
				continue;
			}

			if (__this_cpu_dec_return(pcp->expire))
				continue;

			if (__this_cpu_read(pcp->count)) {
				drain_zone_pages(zone, this_cpu_ptr(pcp));
				changes++;
			}
		}
#endif
	}

	for_each_online_pgdat(pgdat) {
		struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;

		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
			int v;

			v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
			if (v) {
				atomic_long_add(v, &pgdat->vm_stat[i]);
				global_node_diff[i] += v;
			}
		}
	}

	changes += fold_diff(global_zone_diff, global_node_diff);
	return changes;
}

/*
 * Fold the data for an offline cpu into the global array.
 * There cannot be any access by the offline cpu and therefore
 * synchronization is simplified.
 */
void cpu_vm_stats_fold(int cpu)
{
	struct pglist_data *pgdat;
	struct zone *zone;
	int i;
	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };

	for_each_populated_zone(zone) {
		struct per_cpu_zonestat *pzstats;

		pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
			if (pzstats->vm_stat_diff[i]) {
				int v;

				v = pzstats->vm_stat_diff[i];
				pzstats->vm_stat_diff[i] = 0;
				atomic_long_add(v, &zone->vm_stat[i]);
				global_zone_diff[i] += v;
			}
		}
#ifdef CONFIG_NUMA
		for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
			if (pzstats->vm_numa_event[i]) {
				unsigned long v;

				v = pzstats->vm_numa_event[i];
				pzstats->vm_numa_event[i] = 0;
				zone_numa_event_add(v, zone, i);
			}
		}
#endif
	}

	for_each_online_pgdat(pgdat) {
		struct per_cpu_nodestat *p;

		p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);

		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
			if (p->vm_node_stat_diff[i]) {
				int v;

				v = p->vm_node_stat_diff[i];
				p->vm_node_stat_diff[i] = 0;
				atomic_long_add(v, &pgdat->vm_stat[i]);
				global_node_diff[i] += v;
			}
	}

	fold_diff(global_zone_diff, global_node_diff);
}

/*
 * this is only called if !populated_zone(zone), which implies no other users of
 * pset->vm_stat_diff[] exist.
 */
void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats)
{
	unsigned long v;
	int i;

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
		if (pzstats->vm_stat_diff[i]) {
			v = pzstats->vm_stat_diff[i];
			pzstats->vm_stat_diff[i] = 0;
			zone_page_state_add(v, zone, i);
		}
	}

#ifdef CONFIG_NUMA
	for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
		if (pzstats->vm_numa_event[i]) {
			v = pzstats->vm_numa_event[i];
			pzstats->vm_numa_event[i] = 0;
			zone_numa_event_add(v, zone, i);
		}
	}
#endif
}
#endif

#ifdef CONFIG_NUMA
/*
 * Determine the per node value of a stat item. This function
 * is called frequently in a NUMA machine, so try to be as
 * frugal as possible.
 */
unsigned long sum_zone_node_page_state(int node,
				 enum zone_stat_item item)
{
	struct zone *zones = NODE_DATA(node)->node_zones;
	int i;
	unsigned long count = 0;

	for (i = 0; i < MAX_NR_ZONES; i++)
		count += zone_page_state(zones + i, item);

	return count;
}

/* Determine the per node value of a numa stat item. */
unsigned long sum_zone_numa_event_state(int node,
				 enum numa_stat_item item)
{
	struct zone *zones = NODE_DATA(node)->node_zones;
	unsigned long count = 0;
	int i;

	for (i = 0; i < MAX_NR_ZONES; i++)
		count += zone_numa_event_state(zones + i, item);

	return count;
}

/*
 * Determine the per node value of a stat item.
 */
unsigned long node_page_state_pages(struct pglist_data *pgdat,
				    enum node_stat_item item)
{
	long x = atomic_long_read(&pgdat->vm_stat[item]);
#ifdef CONFIG_SMP
	if (x < 0)
		x = 0;
#endif
	return x;
}

unsigned long node_page_state(struct pglist_data *pgdat,
			      enum node_stat_item item)
{
	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));

	return node_page_state_pages(pgdat, item);
}
#endif

#ifdef CONFIG_COMPACTION

struct contig_page_info {
	unsigned long free_pages;
	unsigned long free_blocks_total;
	unsigned long free_blocks_suitable;
};

/*
 * Calculate the number of free pages in a zone, how many contiguous
 * pages are free and how many are large enough to satisfy an allocation of
 * the target size. Note that this function makes no attempt to estimate
 * how many suitable free blocks there *might* be if MOVABLE pages were
 * migrated. Calculating that is possible, but expensive and can be
 * figured out from userspace
 */
static void fill_contig_page_info(struct zone *zone,
				unsigned int suitable_order,
				struct contig_page_info *info)
{
	unsigned int order;

	info->free_pages = 0;
	info->free_blocks_total = 0;
	info->free_blocks_suitable = 0;

	for (order = 0; order <= MAX_ORDER; order++) {
		unsigned long blocks;

		/*
		 * Count number of free blocks.
		 *
		 * Access to nr_free is lockless as nr_free is used only for
		 * diagnostic purposes. Use data_race to avoid KCSAN warning.
		 */
		blocks = data_race(zone->free_area[order].nr_free);
		info->free_blocks_total += blocks;

		/* Count free base pages */
		info->free_pages += blocks << order;

		/* Count the suitable free blocks */
		if (order >= suitable_order)
			info->free_blocks_suitable += blocks <<
						(order - suitable_order);
	}
}

/*
 * A fragmentation index only makes sense if an allocation of a requested
 * size would fail. If that is true, the fragmentation index indicates
 * whether external fragmentation or a lack of memory was the problem.
 * The value can be used to determine if page reclaim or compaction
 * should be used
 */
static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
{
	unsigned long requested = 1UL << order;

	if (WARN_ON_ONCE(order > MAX_ORDER))
		return 0;

	if (!info->free_blocks_total)
		return 0;

	/* Fragmentation index only makes sense when a request would fail */
	if (info->free_blocks_suitable)
		return -1000;

	/*
	 * Index is between 0 and 1 so return within 3 decimal places
	 *
	 * 0 => allocation would fail due to lack of memory
	 * 1 => allocation would fail due to fragmentation
	 */
	return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
}
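
/*
 * Worked example (illustrative): for an order-4 request (16 pages) with
 * free_pages = 1000 spread over free_blocks_total = 500 blocks and no
 * suitable block, the expression above yields
 * 1000 - (1000 + 1000 * 1000 / 16) / 500 = 1000 - 127 = 873, i.e. an index
 * of 0.873, pointing at fragmentation rather than a lack of memory.
 */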

/*
 * Calculates external fragmentation within a zone wrt the given order.
 * It is defined as the percentage of pages found in blocks of size
 * less than 1 << order. It returns values in range [0, 100].
 */
unsigned int extfrag_for_order(struct zone *zone, unsigned int order)
{
	struct contig_page_info info;

	fill_contig_page_info(zone, order, &info);
	if (info.free_pages == 0)
		return 0;

	return div_u64((info.free_pages -
			(info.free_blocks_suitable << order)) * 100,
			info.free_pages);
}
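/*
 * Worked example (illustrative): with order = 3, free_pages = 1000 and
 * free_blocks_suitable = 100, the pages sitting in suitably large blocks
 * amount to 100 << 3 = 800, so extfrag_for_order() reports
 * (1000 - 800) * 100 / 1000 = 20 percent external fragmentation.
 */
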
/* Same as __fragmentation_index() but allocs contig_page_info on stack */
int fragmentation_index(struct zone *zone, unsigned int order)
{
	struct contig_page_info info;

	fill_contig_page_info(zone, order, &info);
	return __fragmentation_index(order, &info);
}
#endif

#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \
    defined(CONFIG_NUMA) || defined(CONFIG_MEMCG)
#ifdef CONFIG_ZONE_DMA
#define TEXT_FOR_DMA(xx) xx "_dma",
#else
#define TEXT_FOR_DMA(xx)
#endif

#ifdef CONFIG_ZONE_DMA32
#define TEXT_FOR_DMA32(xx) xx "_dma32",
#else
#define TEXT_FOR_DMA32(xx)
#endif

#ifdef CONFIG_HIGHMEM
#define TEXT_FOR_HIGHMEM(xx) xx "_high",
#else
#define TEXT_FOR_HIGHMEM(xx)
#endif

#ifdef CONFIG_ZONE_DEVICE
#define TEXT_FOR_DEVICE(xx) xx "_device",
#else
#define TEXT_FOR_DEVICE(xx)
#endif

#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
					TEXT_FOR_HIGHMEM(xx) xx "_movable", \
					TEXT_FOR_DEVICE(xx)

const char * const vmstat_text[] = {
	/* enum zone_stat_item counters */
	"nr_free_pages",
	"nr_zone_inactive_anon",
	"nr_zone_active_anon",
	"nr_zone_inactive_file",
	"nr_zone_active_file",
	"nr_zone_unevictable",
	"nr_zone_write_pending",
	"nr_mlock",
	"nr_bounce",
#if IS_ENABLED(CONFIG_ZSMALLOC)
	"nr_zspages",
#endif
	"nr_free_cma",
#ifdef CONFIG_UNACCEPTED_MEMORY
	"nr_unaccepted",
#endif

	/* enum numa_stat_item counters */
#ifdef CONFIG_NUMA
	"numa_hit",
	"numa_miss",
	"numa_foreign",
	"numa_interleave",
	"numa_local",
	"numa_other",
#endif

	/* enum node_stat_item counters */
	"nr_inactive_anon",
	"nr_active_anon",
	"nr_inactive_file",
	"nr_active_file",
	"nr_unevictable",
	"nr_slab_reclaimable",
	"nr_slab_unreclaimable",
	"nr_isolated_anon",
	"nr_isolated_file",
	"workingset_nodes",
	"workingset_refault_anon",
	"workingset_refault_file",
	"workingset_activate_anon",
	"workingset_activate_file",
	"workingset_restore_anon",
	"workingset_restore_file",
	"workingset_nodereclaim",
	"nr_anon_pages",
	"nr_mapped",
	"nr_file_pages",
	"nr_dirty",
	"nr_writeback",
	"nr_writeback_temp",
	"nr_shmem",
	"nr_shmem_hugepages",
	"nr_shmem_pmdmapped",
	"nr_file_hugepages",
	"nr_file_pmdmapped",
	"nr_anon_transparent_hugepages",
	"nr_vmscan_write",
	"nr_vmscan_immediate_reclaim",
	"nr_dirtied",
	"nr_written",
	"nr_throttled_written",
	"nr_kernel_misc_reclaimable",
	"nr_foll_pin_acquired",
	"nr_foll_pin_released",
	"nr_kernel_stack",
#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
	"nr_shadow_call_stack",
#endif
	"nr_page_table_pages",
	"nr_sec_page_table_pages",
#ifdef CONFIG_SWAP
	"nr_swapcached",
#endif
#ifdef CONFIG_NUMA_BALANCING
	"pgpromote_success",
	"pgpromote_candidate",
#endif

	/* enum writeback_stat_item counters */
	"nr_dirty_threshold",
	"nr_dirty_background_threshold",

#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
	/* enum vm_event_item counters */
	"pgpgin",
	"pgpgout",
	"pswpin",
	"pswpout",

	TEXTS_FOR_ZONES("pgalloc")
	TEXTS_FOR_ZONES("allocstall")
	TEXTS_FOR_ZONES("pgskip")

	"pgfree",
	"pgactivate",
	"pgdeactivate",
	"pglazyfree",

	"pgfault",
	"pgmajfault",
	"pglazyfreed",

	"pgrefill",
	"pgreuse",
	"pgsteal_kswapd",
	"pgsteal_direct",
	"pgsteal_khugepaged",
	"pgdemote_kswapd",
	"pgdemote_direct",
	"pgdemote_khugepaged",
	"pgscan_kswapd",
	"pgscan_direct",
	"pgscan_khugepaged",
	"pgscan_direct_throttle",
	"pgscan_anon",
	"pgscan_file",
	"pgsteal_anon",
	"pgsteal_file",

#ifdef CONFIG_NUMA
	"zone_reclaim_failed",
#endif
	"pginodesteal",
	"slabs_scanned",
	"kswapd_inodesteal",
	"kswapd_low_wmark_hit_quickly",
	"kswapd_high_wmark_hit_quickly",
	"pageoutrun",

	"pgrotated",

	"drop_pagecache",
	"drop_slab",
	"oom_kill",

#ifdef CONFIG_NUMA_BALANCING
	"numa_pte_updates",
	"numa_huge_pte_updates",
	"numa_hint_faults",
	"numa_hint_faults_local",
	"numa_pages_migrated",
#endif
#ifdef CONFIG_MIGRATION
	"pgmigrate_success",
	"pgmigrate_fail",
	"thp_migration_success",
	"thp_migration_fail",
	"thp_migration_split",
#endif
#ifdef CONFIG_COMPACTION
	"compact_migrate_scanned",
	"compact_free_scanned",
	"compact_isolated",
	"compact_stall",
	"compact_fail",
	"compact_success",
	"compact_daemon_wake",
	"compact_daemon_migrate_scanned",
	"compact_daemon_free_scanned",
#endif

#ifdef CONFIG_HUGETLB_PAGE
	"htlb_buddy_alloc_success",
	"htlb_buddy_alloc_fail",
#endif
#ifdef CONFIG_CMA
	"cma_alloc_success",
	"cma_alloc_fail",
#endif
	"unevictable_pgs_culled",
	"unevictable_pgs_scanned",
	"unevictable_pgs_rescued",
	"unevictable_pgs_mlocked",
	"unevictable_pgs_munlocked",
	"unevictable_pgs_cleared",
	"unevictable_pgs_stranded",

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	"thp_fault_alloc",
	"thp_fault_fallback",
	"thp_fault_fallback_charge",
	"thp_collapse_alloc",
	"thp_collapse_alloc_failed",
	"thp_file_alloc",
	"thp_file_fallback",
	"thp_file_fallback_charge",
	"thp_file_mapped",
	"thp_split_page",
	"thp_split_page_failed",
	"thp_deferred_split_page",
	"thp_split_pmd",
	"thp_scan_exceed_none_pte",
	"thp_scan_exceed_swap_pte",
	"thp_scan_exceed_share_pte",
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
	"thp_split_pud",
#endif
	"thp_zero_page_alloc",
	"thp_zero_page_alloc_failed",
	"thp_swpout",
	"thp_swpout_fallback",
#endif
#ifdef CONFIG_MEMORY_BALLOON
	"balloon_inflate",
	"balloon_deflate",
#ifdef CONFIG_BALLOON_COMPACTION
	"balloon_migrate",
#endif
#endif /* CONFIG_MEMORY_BALLOON */
#ifdef CONFIG_DEBUG_TLBFLUSH
	"nr_tlb_remote_flush",
	"nr_tlb_remote_flush_received",
	"nr_tlb_local_flush_all",
	"nr_tlb_local_flush_one",
#endif /* CONFIG_DEBUG_TLBFLUSH */

#ifdef CONFIG_SWAP
	"swap_ra",
	"swap_ra_hit",
#ifdef CONFIG_KSM
	"ksm_swpin_copy",
#endif
#endif
#ifdef CONFIG_KSM
	"cow_ksm",
#endif
#ifdef CONFIG_ZSWAP
	"zswpin",
	"zswpout",
#endif
#ifdef CONFIG_X86
	"direct_map_level2_splits",
	"direct_map_level3_splits",
#endif
#ifdef CONFIG_PER_VMA_LOCK_STATS
	"vma_lock_success",
	"vma_lock_abort",
	"vma_lock_retry",
	"vma_lock_miss",
#endif
#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
};
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */

#if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
     defined(CONFIG_PROC_FS)
static void *frag_start(struct seq_file *m, loff_t *pos)
{
	pg_data_t *pgdat;
	loff_t node = *pos;

	for (pgdat = first_online_pgdat();
	     pgdat && node;
	     pgdat = next_online_pgdat(pgdat))
		--node;

	return pgdat;
}

static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	(*pos)++;
	return next_online_pgdat(pgdat);
}

static void frag_stop(struct seq_file *m, void *arg)
{
}

/*
 * Walk zones in a node and print using a callback.
 * If @assert_populated is true, only use callback for zones that are populated.
 */
static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
		bool assert_populated, bool nolock,
		void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
{
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;
	unsigned long flags;

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
		if (assert_populated && !populated_zone(zone))
			continue;

		if (!nolock)
			spin_lock_irqsave(&zone->lock, flags);
		print(m, pgdat, zone);
		if (!nolock)
			spin_unlock_irqrestore(&zone->lock, flags);
	}
}
#endif

#ifdef CONFIG_PROC_FS
static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
						struct zone *zone)
{
	int order;

	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
	for (order = 0; order <= MAX_ORDER; ++order)
		/*
		 * Access to nr_free is lockless as nr_free is used only for
		 * printing purposes. Use data_race to avoid KCSAN warning.
		 */
		seq_printf(m, "%6lu ", data_race(zone->free_area[order].nr_free));
	seq_putc(m, '\n');
}

/*
 * This walks the free areas for each zone.
 */
static int frag_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	walk_zones_in_node(m, pgdat, true, false, frag_show_print);
	return 0;
}
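
/*
 * Example (illustrative): frag_show() is the seq_file backend behind
 * /proc/buddyinfo, which prints one line per populated zone with the
 * free-block count for every order, e.g.:
 *
 *	Node 0, zone   Normal    243    142     67     24     10      4 ...
 */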

static void pagetypeinfo_showfree_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	int order, mtype;

	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
		seq_printf(m, "Node %4d, zone %8s, type %12s ",
					pgdat->node_id,
					zone->name,
					migratetype_names[mtype]);
		for (order = 0; order <= MAX_ORDER; ++order) {
			unsigned long freecount = 0;
			struct free_area *area;
			struct list_head *curr;
			bool overflow = false;

			area = &(zone->free_area[order]);

			list_for_each(curr, &area->free_list[mtype]) {
				/*
				 * Cap the free_list iteration because it might
				 * be really large and we are under a spinlock
				 * so a long time spent here could trigger a
				 * hard lockup detector. Anyway this is a
				 * debugging tool so knowing there is a handful
				 * of pages of this order should be more than
				 * sufficient.
				 */
				if (++freecount >= 100000) {
					overflow = true;
					break;
				}
			}
			seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount);
			spin_unlock_irq(&zone->lock);
			cond_resched();
			spin_lock_irq(&zone->lock);
		}
		seq_putc(m, '\n');
	}
}

/* Print out the free pages at each order for each migratetype */
static void pagetypeinfo_showfree(struct seq_file *m, void *arg)
{
	int order;
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* Print header */
	seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
	for (order = 0; order <= MAX_ORDER; ++order)
		seq_printf(m, "%6d ", order);
	seq_putc(m, '\n');

	walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);
}

static void pagetypeinfo_showblockcount_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	int mtype;
	unsigned long pfn;
	unsigned long start_pfn = zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	unsigned long count[MIGRATE_TYPES] = { 0, };

	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
		struct page *page;

		page = pfn_to_online_page(pfn);
		if (!page)
			continue;

		if (page_zone(page) != zone)
			continue;

		mtype = get_pageblock_migratetype(page);

		if (mtype < MIGRATE_TYPES)
			count[mtype]++;
	}

	/* Print counts */
	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
		seq_printf(m, "%12lu ", count[mtype]);
	seq_putc(m, '\n');
}

/* Print out the number of pageblocks for each migratetype */
static void pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
{
	int mtype;
	pg_data_t *pgdat = (pg_data_t *)arg;

	seq_printf(m, "\n%-23s", "Number of blocks type ");
	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
		seq_printf(m, "%12s ", migratetype_names[mtype]);
	seq_putc(m, '\n');
	walk_zones_in_node(m, pgdat, true, false,
		pagetypeinfo_showblockcount_print);
}

/*
 * Print out the number of pageblocks for each migratetype that contain pages
 * of other types. This gives an indication of how well fallbacks are being
 * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
 * to determine what is going on
 */
static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
{
#ifdef CONFIG_PAGE_OWNER
	int mtype;

	if (!static_branch_unlikely(&page_owner_inited))
		return;

	drain_all_pages(NULL);

	seq_printf(m, "\n%-23s", "Number of mixed blocks ");
	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
		seq_printf(m, "%12s ", migratetype_names[mtype]);
	seq_putc(m, '\n');

	walk_zones_in_node(m, pgdat, true, true,
		pagetypeinfo_showmixedcount_print);
#endif /* CONFIG_PAGE_OWNER */
}

/*
 * This prints out statistics in relation to grouping pages by mobility.
 * It is expensive to collect so do not constantly read the file.
 */
static int pagetypeinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* check memoryless node */
	if (!node_state(pgdat->node_id, N_MEMORY))
		return 0;

	seq_printf(m, "Page block order: %d\n", pageblock_order);
	seq_printf(m, "Pages per block:  %lu\n", pageblock_nr_pages);
	seq_putc(m, '\n');
	pagetypeinfo_showfree(m, pgdat);
	pagetypeinfo_showblockcount(m, pgdat);
	pagetypeinfo_showmixedcount(m, pgdat);

	return 0;
}

static const struct seq_operations fragmentation_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= frag_show,
};

static const struct seq_operations pagetypeinfo_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= pagetypeinfo_show,
};

static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
{
	int zid;

	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
		struct zone *compare = &pgdat->node_zones[zid];

		if (populated_zone(compare))
			return zone == compare;
	}

	return false;
}

static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
							struct zone *zone)
{
	int i;
	seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
	if (is_zone_first_populated(pgdat, zone)) {
		seq_printf(m, "\n  per-node stats");
		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
			unsigned long pages = node_page_state_pages(pgdat, i);

			if (vmstat_item_print_in_thp(i))
				pages /= HPAGE_PMD_NR;
			seq_printf(m, "\n      %-12s %lu", node_stat_name(i),
				   pages);
		}
	}
	seq_printf(m,
		   "\n  pages free     %lu"
		   "\n        boost    %lu"
		   "\n        min      %lu"
		   "\n        low      %lu"
		   "\n        high     %lu"
		   "\n        spanned  %lu"
		   "\n        present  %lu"
		   "\n        managed  %lu"
		   "\n        cma      %lu",
		   zone_page_state(zone, NR_FREE_PAGES),
		   zone->watermark_boost,
		   min_wmark_pages(zone),
		   low_wmark_pages(zone),
		   high_wmark_pages(zone),
		   zone->spanned_pages,
		   zone->present_pages,
		   zone_managed_pages(zone),
		   zone_cma_pages(zone));

	seq_printf(m,
		   "\n        protection: (%ld",
		   zone->lowmem_reserve[0]);
	for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
		seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
	seq_putc(m, ')');

	/* If unpopulated, no other information is useful */
	if (!populated_zone(zone)) {
		seq_putc(m, '\n');
		return;
	}

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		seq_printf(m, "\n      %-12s %lu", zone_stat_name(i),
			   zone_page_state(zone, i));

#ifdef CONFIG_NUMA
	for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
		seq_printf(m, "\n      %-12s %lu", numa_stat_name(i),
			   zone_numa_event_state(zone, i));
#endif

	seq_printf(m, "\n  pagesets");
	for_each_online_cpu(i) {
		struct per_cpu_pages *pcp;
		struct per_cpu_zonestat __maybe_unused *pzstats;

		pcp = per_cpu_ptr(zone->per_cpu_pageset, i);
		seq_printf(m,
			   "\n    cpu: %i"
			   "\n              count: %i"
			   "\n              high:  %i"
			   "\n              batch: %i",
			   i,
			   pcp->count,
			   pcp->high,
			   pcp->batch);
#ifdef CONFIG_SMP
		pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i);
		seq_printf(m, "\n  vm stats threshold: %d",
			   pzstats->stat_threshold);
#endif
	}
	seq_printf(m,
		   "\n  node_unreclaimable:  %u"
		   "\n  start_pfn:           %lu",
		   pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
		   zone->zone_start_pfn);
	seq_putc(m, '\n');
}

/*
 * Output information about zones in @pgdat.  All zones are printed regardless
 * of whether they are populated or not: lowmem_reserve_ratio operates on the
 * set of all zones and userspace would not be aware of such zones if they are
 * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
 */
static int zoneinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print);
	return 0;
}

static const struct seq_operations zoneinfo_op = {
	.start	= frag_start, /* iterate over all zones. The same as in
			       * fragmentation. */
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= zoneinfo_show,
};

#define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \
			 NR_VM_NUMA_EVENT_ITEMS + \
			 NR_VM_NODE_STAT_ITEMS + \
			 NR_VM_WRITEBACK_STAT_ITEMS + \
			 (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \
			  NR_VM_EVENT_ITEMS : 0))

f6ac2354
CL
1789static void *vmstat_start(struct seq_file *m, loff_t *pos)
1790{
2244b95a 1791 unsigned long *v;
9d7ea9a2 1792 int i;
f6ac2354 1793
9d7ea9a2 1794 if (*pos >= NR_VMSTAT_ITEMS)
f6ac2354 1795 return NULL;
79da826a 1796
9d7ea9a2 1797 BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
f19298b9 1798 fold_vm_numa_events();
9d7ea9a2 1799 v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
2244b95a
CL
1800 m->private = v;
1801 if (!v)
f6ac2354 1802 return ERR_PTR(-ENOMEM);
2244b95a 1803 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
c41f012a 1804 v[i] = global_zone_page_state(i);
79da826a
MR
1805 v += NR_VM_ZONE_STAT_ITEMS;
1806
3a321d2a 1807#ifdef CONFIG_NUMA
f19298b9
MG
1808 for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
1809 v[i] = global_numa_event_state(i);
1810 v += NR_VM_NUMA_EVENT_ITEMS;
3a321d2a
KW
1811#endif
1812
69473e5d 1813 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
ea426c2a 1814 v[i] = global_node_page_state_pages(i);
69473e5d
MS
1815 if (vmstat_item_print_in_thp(i))
1816 v[i] /= HPAGE_PMD_NR;
1817 }
75ef7184
MG
1818 v += NR_VM_NODE_STAT_ITEMS;
1819
79da826a
MR
1820 global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
1821 v + NR_DIRTY_THRESHOLD);
1822 v += NR_VM_WRITEBACK_STAT_ITEMS;
1823
f8891e5e 1824#ifdef CONFIG_VM_EVENT_COUNTERS
79da826a
MR
1825 all_vm_events(v);
1826 v[PGPGIN] /= 2; /* sectors -> kbytes */
1827 v[PGPGOUT] /= 2;
f8891e5e 1828#endif
ff8b16d7 1829 return (unsigned long *)m->private + *pos;
f6ac2354
CL
1830}
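
The array built by vmstat_start() is what /proc/vmstat walks: zone stats, then (with CONFIG_NUMA) the numa event counters, node stats, the two writeback thresholds, and finally the VM event counters, with pgpgin/pgpgout already converted from sectors to kilobytes. A minimal userspace sketch (editor's illustration, assuming CONFIG_VM_EVENT_COUNTERS so that pgpgin/pgpgout are present) that parses the resulting name/value pairs:

/* Minimal sketch: parse "name value" pairs from /proc/vmstat. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char name[128];
	unsigned long long val;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}
	while (fscanf(f, "%127s %llu", name, &val) == 2) {
		/* pgpgin/pgpgout are reported in kilobytes (sectors / 2) */
		if (!strcmp(name, "pgpgin") || !strcmp(name, "pgpgout"))
			printf("%s = %llu kB\n", name, val);
	}
	fclose(f);
	return 0;
}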
1831
1832static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
1833{
1834 (*pos)++;
9d7ea9a2 1835 if (*pos >= NR_VMSTAT_ITEMS)
f6ac2354
CL
1836 return NULL;
1837 return (unsigned long *)m->private + *pos;
1838}
1839
1840static int vmstat_show(struct seq_file *m, void *arg)
1841{
1842 unsigned long *l = arg;
1843 unsigned long off = l - (unsigned long *)m->private;
68ba0326
AD
1844
1845 seq_puts(m, vmstat_text[off]);
75ba1d07 1846 seq_put_decimal_ull(m, " ", *l);
68ba0326 1847 seq_putc(m, '\n');
8d92890b
N
1848
1849 if (off == NR_VMSTAT_ITEMS - 1) {
1850 /*
1851 * We've come to the end - add any deprecated counters to avoid
1852 * breaking userspace which might depend on them being present.
1853 */
1854 seq_puts(m, "nr_unstable 0\n");
1855 }
f6ac2354
CL
1856 return 0;
1857}
1858
1859static void vmstat_stop(struct seq_file *m, void *arg)
1860{
1861 kfree(m->private);
1862 m->private = NULL;
1863}
1864
b6aa44ab 1865static const struct seq_operations vmstat_op = {
f6ac2354
CL
1866 .start = vmstat_start,
1867 .next = vmstat_next,
1868 .stop = vmstat_stop,
1869 .show = vmstat_show,
1870};
f6ac2354
CL
1871#endif /* CONFIG_PROC_FS */
1872
df9ecaba 1873#ifdef CONFIG_SMP
d1187ed2 1874static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
77461ab3 1875int sysctl_stat_interval __read_mostly = HZ;
d1187ed2 1876
52b6f46b
HD
1877#ifdef CONFIG_PROC_FS
1878static void refresh_vm_stats(struct work_struct *work)
1879{
1880 refresh_cpu_vm_stats(true);
1881}
1882
1883int vmstat_refresh(struct ctl_table *table, int write,
32927393 1884 void *buffer, size_t *lenp, loff_t *ppos)
52b6f46b
HD
1885{
1886 long val;
1887 int err;
1888 int i;
1889
1890 /*
1891 * The regular update, every sysctl_stat_interval, may come later
1892 * than expected, leaving a significant amount in the per_cpu buckets.
1893 * This is particularly misleading when checking a quantity of HUGE
1894 * pages, immediately after running a test. /proc/sys/vm/stat_refresh,
1895 * which can equally be echo'ed to or cat'ted from (by root),
1896 * can be used to update the stats just before reading them.
1897 *
c41f012a 1898 * Oh, and since global_zone_page_state() etc. are so careful to hide
52b6f46b
HD
1899 * transiently negative values, report an error here if any of
1900 * the stats is negative, so we know to go looking for imbalance.
1901 */
1902 err = schedule_on_each_cpu(refresh_vm_stats);
1903 if (err)
1904 return err;
1905 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
75083aae
HD
1906 /*
1907 * Skip checking stats known to go negative occasionally.
1908 */
1909 switch (i) {
1910 case NR_ZONE_WRITE_PENDING:
1911 case NR_FREE_CMA_PAGES:
1912 continue;
1913 }
75ef7184 1914 val = atomic_long_read(&vm_zone_stat[i]);
52b6f46b 1915 if (val < 0) {
c822f622 1916 pr_warn("%s: %s %ld\n",
9d7ea9a2 1917 __func__, zone_stat_name(i), val);
52b6f46b
HD
1918 }
1919 }
76d8cc3c 1920 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
75083aae
HD
1921 /*
1922 * Skip checking stats known to go negative occasionally.
1923 */
1924 switch (i) {
1925 case NR_WRITEBACK:
1926 continue;
1927 }
76d8cc3c
HD
1928 val = atomic_long_read(&vm_node_stat[i]);
1929 if (val < 0) {
1930 pr_warn("%s: %s %ld\n",
1931 __func__, node_stat_name(i), val);
76d8cc3c
HD
1932 }
1933 }
52b6f46b
HD
1934 if (write)
1935 *ppos += *lenp;
1936 else
1937 *lenp = 0;
1938 return 0;
1939}
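
As the comment in vmstat_refresh() notes, /proc/sys/vm/stat_refresh can be written to (by root) to force the per-CPU diffs to be folded just before reading the counters; the written value itself is not interpreted. A hedged usage sketch, kept as a small userspace C helper rather than a shell one-liner:

/* Minimal sketch: poke /proc/sys/vm/stat_refresh, then read /proc/vmstat. */
#include <stdio.h>

int main(void)
{
	FILE *refresh = fopen("/proc/sys/vm/stat_refresh", "w");
	FILE *vmstat;
	char line[256];

	if (refresh) {		/* requires root; ignore failure otherwise */
		fputs("1\n", refresh);	/* any write triggers the refresh */
		fclose(refresh);
	}
	vmstat = fopen("/proc/vmstat", "r");
	if (!vmstat) {
		perror("/proc/vmstat");
		return 1;
	}
	while (fgets(line, sizeof(line), vmstat))
		fputs(line, stdout);
	fclose(vmstat);
	return 0;
}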
1940#endif /* CONFIG_PROC_FS */
1941
d1187ed2
CL
1942static void vmstat_update(struct work_struct *w)
1943{
0eb77e98 1944 if (refresh_cpu_vm_stats(true)) {
7cc36bbd
CL
1945 /*
1946 * Counters were updated so we expect more updates
1947 * to occur in the future. Keep on running the
1948 * update worker thread.
1949 */
ce612879 1950 queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
f01f17d3
MH
1951 this_cpu_ptr(&vmstat_work),
1952 round_jiffies_relative(sysctl_stat_interval));
7cc36bbd
CL
1953 }
1954}
1955
1956/*
1957 * Check if the diffs for a certain cpu indicate that
1958 * an update is needed.
1959 */
1960static bool need_update(int cpu)
1961{
2bbd00ae 1962 pg_data_t *last_pgdat = NULL;
7cc36bbd
CL
1963 struct zone *zone;
1964
1965 for_each_populated_zone(zone) {
28f836b6 1966 struct per_cpu_zonestat *pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
2bbd00ae 1967 struct per_cpu_nodestat *n;
28f836b6 1968
7cc36bbd
CL
1969 /*
1970 * The fast way of checking if there are any vmstat diffs.
7cc36bbd 1971 */
64632fd3 1972 if (memchr_inv(pzstats->vm_stat_diff, 0, sizeof(pzstats->vm_stat_diff)))
7cc36bbd 1973 return true;
f19298b9 1974
2bbd00ae
JW
1975 if (last_pgdat == zone->zone_pgdat)
1976 continue;
1977 last_pgdat = zone->zone_pgdat;
1978 n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu);
64632fd3
ML
1979 if (memchr_inv(n->vm_node_stat_diff, 0, sizeof(n->vm_node_stat_diff)))
1980 return true;
7cc36bbd
CL
1981 }
1982 return false;
1983}
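
need_update() relies on memchr_inv() to test whether a whole per-CPU diff array is still zero without looping item by item. memchr_inv() is kernel-internal; a minimal userspace analogue of that "is this buffer all zeroes?" check (editor's sketch, names are illustrative) could look like:

/* Minimal sketch: userspace analogue of the memchr_inv()-based zero check. */
#include <stdio.h>

/* Return 1 if all 'len' bytes of 'buf' are zero, mimicking
 * !memchr_inv(buf, 0, len) in the kernel. */
static int all_zero(const void *buf, size_t len)
{
	const unsigned char *p = buf;
	size_t i;

	for (i = 0; i < len; i++)
		if (p[i])
			return 0;
	return 1;
}

int main(void)
{
	signed char vm_stat_diff[16] = { 0 };

	printf("needs update: %s\n",
	       all_zero(vm_stat_diff, sizeof(vm_stat_diff)) ? "no" : "yes");
	vm_stat_diff[3] = 5;	/* pretend a counter was touched on this CPU */
	printf("needs update: %s\n",
	       all_zero(vm_stat_diff, sizeof(vm_stat_diff)) ? "no" : "yes");
	return 0;
}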
1984
7b8da4c7
CL
1985/*
1986 * Switch off vmstat processing and then fold all the remaining differentials
1987 * until the diffs stay at zero. The function is used by NOHZ and can only be
1988 * invoked when tick processing is not active.
1989 */
f01f17d3
MH
1990void quiet_vmstat(void)
1991{
1992 if (system_state != SYSTEM_RUNNING)
1993 return;
1994
7b8da4c7 1995 if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
f01f17d3
MH
1996 return;
1997
1998 if (!need_update(smp_processor_id()))
1999 return;
2000
2001	/*
2002	 * Just refresh the counters and do not care about the pending delayed
2003	 * vmstat_update. It does not fire often enough to matter, and cancelling
2004	 * it would be too expensive from this path.
2005	 * vmstat_shepherd will take care of it for us.
2006	 */
2007 refresh_cpu_vm_stats(false);
2008}
2009
7cc36bbd
CL
2010/*
2011 * Shepherd work item that checks the per-CPU differentials of
2012 * processors whose vmstat update workers have been disabled
2013 * because of inactivity, and re-queues the worker for any CPU
2014 * that still has pending updates.
2015 */
2016static void vmstat_shepherd(struct work_struct *w);
2017
0eb77e98 2018static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
7cc36bbd
CL
2019
2020static void vmstat_shepherd(struct work_struct *w)
2021{
2022 int cpu;
2023
7625eccd 2024 cpus_read_lock();
7cc36bbd 2025 /* Check processors whose vmstat worker threads have been disabled */
7b8da4c7 2026 for_each_online_cpu(cpu) {
f01f17d3 2027 struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
7cc36bbd 2028
be5e015d
MT
2029		/*
2030		 * In-kernel users of vmstat counters either need the precise value, in which
2031		 * case they use the zone_page_state_snapshot interface, or they can live with
2032		 * some imprecision, because the regular flushing can happen at an arbitrary
2033		 * time and the cumulative error can grow (see calculate_normal_threshold).
2034		 *
2035		 * From that point of view the regular flushing can be postponed for isolated
2036		 * CPUs without critical infrastructure ever noticing. Skip regular flushing
2037		 * from vmstat_shepherd for all isolated CPUs to avoid interference with the
2038		 * isolated workload.
2039		 */
2040 if (cpu_is_isolated(cpu))
2041 continue;
2042
7b8da4c7 2043 if (!delayed_work_pending(dw) && need_update(cpu))
ce612879 2044 queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
fbcc8183
JB
2045
2046 cond_resched();
f01f17d3 2047 }
7625eccd 2048 cpus_read_unlock();
7cc36bbd
CL
2049
2050 schedule_delayed_work(&shepherd,
98f4ebb2 2051 round_jiffies_relative(sysctl_stat_interval));
d1187ed2
CL
2052}
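
The shepherd pattern above — one periodic deferrable work item that only re-arms per-CPU workers where diffs are pending — can be illustrated with a tiny single-threaded simulation (editor's sketch; NCPUS and the diff arrays are invented for the example, and the real code queues delayed work instead of folding the diffs itself):

/* Minimal sketch: simulate the shepherd deciding which CPUs need an update. */
#include <stdio.h>

#define NCPUS	4
#define NITEMS	8

static int diffs[NCPUS][NITEMS];	/* stand-in for per-CPU stat_diff arrays */

static int cpu_needs_update(int cpu)
{
	int i;

	for (i = 0; i < NITEMS; i++)
		if (diffs[cpu][i])
			return 1;
	return 0;
}

static void shepherd_pass(void)
{
	int cpu, i;

	for (cpu = 0; cpu < NCPUS; cpu++) {
		if (!cpu_needs_update(cpu))
			continue;	/* idle CPU: leave its worker off */
		printf("cpu %d: queueing vmstat update\n", cpu);
		for (i = 0; i < NITEMS; i++)
			diffs[cpu][i] = 0;	/* fold diffs into the globals */
	}
}

int main(void)
{
	diffs[1][2] = 3;	/* pretend CPU 1 touched a counter */
	shepherd_pass();	/* only CPU 1 gets its worker queued */
	shepherd_pass();	/* nothing pending any more */
	return 0;
}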
2053
7cc36bbd 2054static void __init start_shepherd_timer(void)
d1187ed2 2055{
7cc36bbd
CL
2056 int cpu;
2057
2058 for_each_possible_cpu(cpu)
ccde8bd4 2059 INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
7cc36bbd
CL
2060 vmstat_update);
2061
7cc36bbd
CL
2062 schedule_delayed_work(&shepherd,
2063 round_jiffies_relative(sysctl_stat_interval));
d1187ed2
CL
2064}
2065
03e86dba
TC
2066static void __init init_cpu_node_state(void)
2067{
4c501327 2068 int node;
03e86dba 2069
4c501327 2070 for_each_online_node(node) {
b55032f1 2071 if (!cpumask_empty(cpumask_of_node(node)))
4c501327
SAS
2072 node_set_state(node, N_CPU);
2073 }
03e86dba
TC
2074}
2075
5438da97
SAS
2076static int vmstat_cpu_online(unsigned int cpu)
2077{
2078 refresh_zone_stat_thresholds();
734c1570
OS
2079
2080 if (!node_state(cpu_to_node(cpu), N_CPU)) {
2081 node_set_state(cpu_to_node(cpu), N_CPU);
734c1570
OS
2082 }
2083
5438da97
SAS
2084 return 0;
2085}
2086
2087static int vmstat_cpu_down_prep(unsigned int cpu)
2088{
2089 cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
2090 return 0;
2091}
2092
2093static int vmstat_cpu_dead(unsigned int cpu)
807a1bd2 2094{
4c501327 2095 const struct cpumask *node_cpus;
5438da97 2096 int node;
807a1bd2 2097
5438da97
SAS
2098 node = cpu_to_node(cpu);
2099
2100 refresh_zone_stat_thresholds();
4c501327 2101 node_cpus = cpumask_of_node(node);
b55032f1 2102 if (!cpumask_empty(node_cpus))
5438da97 2103 return 0;
807a1bd2
TK
2104
2105 node_clear_state(node, N_CPU);
734c1570 2106
5438da97 2107 return 0;
807a1bd2
TK
2108}
2109
8f32f7e5 2110#endif
df9ecaba 2111
ce612879
MH
2112struct workqueue_struct *mm_percpu_wq;
2113
597b7305 2114void __init init_mm_internals(void)
df9ecaba 2115{
ce612879 2116 int ret __maybe_unused;
5438da97 2117
80d136e1 2118 mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0);
ce612879
MH
2119
2120#ifdef CONFIG_SMP
5438da97
SAS
2121 ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
2122 NULL, vmstat_cpu_dead);
2123 if (ret < 0)
2124 pr_err("vmstat: failed to register 'dead' hotplug state\n");
2125
2126 ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online",
2127 vmstat_cpu_online,
2128 vmstat_cpu_down_prep);
2129 if (ret < 0)
2130 pr_err("vmstat: failed to register 'online' hotplug state\n");
2131
7625eccd 2132 cpus_read_lock();
03e86dba 2133 init_cpu_node_state();
7625eccd 2134 cpus_read_unlock();
d1187ed2 2135
7cc36bbd 2136 start_shepherd_timer();
8f32f7e5
AD
2137#endif
2138#ifdef CONFIG_PROC_FS
fddda2b7 2139 proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
abaed011 2140 proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
fddda2b7
CH
2141 proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
2142 proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
8f32f7e5 2143#endif
df9ecaba 2144}
d7a5752c
MG
2145
2146#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
d7a5752c
MG
2147
2148/*
2149 * Return an index indicating how much of the available free memory is
2150 * unusable for an allocation of the requested size.
2151 */
2152static int unusable_free_index(unsigned int order,
2153 struct contig_page_info *info)
2154{
2155 /* No free memory is interpreted as all free memory is unusable */
2156 if (info->free_pages == 0)
2157 return 1000;
2158
2159 /*
2160 * The index is a value between 0 and 1, returned to 3 decimal
2161 * places (i.e. scaled by 1000).
2162 *
2163 * 0 => no fragmentation
2164 * 1 => high fragmentation
2165 */
2166 return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
2167
2168}
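
A worked example of the arithmetic above (the numbers are invented for illustration): with free_pages = 1024 free pages in the zone, of which free_blocks_suitable = 16 blocks can satisfy an order-4 request, the suitable blocks cover 16 << 4 = 256 pages, so the index is (1024 - 256) * 1000 / 1024 = 750, displayed as 0.750. A standalone C version of the same calculation:

/* Minimal sketch: the unusable_free_index() arithmetic with made-up numbers. */
#include <stdio.h>

static int unusable_index(unsigned int order,
			  unsigned long free_pages,
			  unsigned long free_blocks_suitable)
{
	if (free_pages == 0)
		return 1000;	/* no free memory: all of it counts as unusable */
	return (int)((free_pages - (free_blocks_suitable << order)) * 1000ULL /
		     free_pages);
}

int main(void)
{
	int index = unusable_index(4, 1024, 16);

	/* Prints "0.750": 75% of the free memory cannot satisfy an order-4 request */
	printf("%d.%03d\n", index / 1000, index % 1000);
	return 0;
}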
2169
2170static void unusable_show_print(struct seq_file *m,
2171 pg_data_t *pgdat, struct zone *zone)
2172{
2173 unsigned int order;
2174 int index;
2175 struct contig_page_info info;
2176
2177 seq_printf(m, "Node %d, zone %8s ",
2178 pgdat->node_id,
2179 zone->name);
23baf831 2180 for (order = 0; order <= MAX_ORDER; ++order) {
d7a5752c
MG
2181 fill_contig_page_info(zone, order, &info);
2182 index = unusable_free_index(order, &info);
2183 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
2184 }
2185
2186 seq_putc(m, '\n');
2187}
2188
2189/*
2190 * Display unusable free space index
2191 *
2192 * The unusable free space index measures how much of the available free
2193 * memory cannot be used to satisfy an allocation of a given size and is a
2194 * value between 0 and 1. The higher the value, the more of the free memory is
2195 * unusable and, by implication, the worse the external fragmentation is. This
2196 * can be expressed as a percentage by multiplying by 100.
2197 */
2198static int unusable_show(struct seq_file *m, void *arg)
2199{
2200 pg_data_t *pgdat = (pg_data_t *)arg;
2201
2202 /* check memoryless node */
a47b53c5 2203 if (!node_state(pgdat->node_id, N_MEMORY))
d7a5752c
MG
2204 return 0;
2205
727c080f 2206 walk_zones_in_node(m, pgdat, true, false, unusable_show_print);
d7a5752c
MG
2207
2208 return 0;
2209}
2210
01a99560 2211static const struct seq_operations unusable_sops = {
d7a5752c
MG
2212 .start = frag_start,
2213 .next = frag_next,
2214 .stop = frag_stop,
2215 .show = unusable_show,
2216};
2217
01a99560 2218DEFINE_SEQ_ATTRIBUTE(unusable);
d7a5752c 2219
f1a5ab12
MG
2220static void extfrag_show_print(struct seq_file *m,
2221 pg_data_t *pgdat, struct zone *zone)
2222{
2223 unsigned int order;
2224 int index;
2225
2226 /* Alloc on stack as interrupts are disabled for zone walk */
2227 struct contig_page_info info;
2228
2229 seq_printf(m, "Node %d, zone %8s ",
2230 pgdat->node_id,
2231 zone->name);
23baf831 2232 for (order = 0; order <= MAX_ORDER; ++order) {
f1a5ab12 2233 fill_contig_page_info(zone, order, &info);
56de7263 2234 index = __fragmentation_index(order, &info);
a9970586 2235 seq_printf(m, "%2d.%03d ", index / 1000, index % 1000);
f1a5ab12
MG
2236 }
2237
2238 seq_putc(m, '\n');
2239}
2240
2241/*
2242 * Display the fragmentation index for orders at which allocations would fail
2243 */
2244static int extfrag_show(struct seq_file *m, void *arg)
2245{
2246 pg_data_t *pgdat = (pg_data_t *)arg;
2247
727c080f 2248 walk_zones_in_node(m, pgdat, true, false, extfrag_show_print);
f1a5ab12
MG
2249
2250 return 0;
2251}
2252
01a99560 2253static const struct seq_operations extfrag_sops = {
f1a5ab12
MG
2254 .start = frag_start,
2255 .next = frag_next,
2256 .stop = frag_stop,
2257 .show = extfrag_show,
2258};
2259
01a99560 2260DEFINE_SEQ_ATTRIBUTE(extfrag);
f1a5ab12 2261
d7a5752c
MG
2262static int __init extfrag_debug_init(void)
2263{
bde8bd8a
S
2264 struct dentry *extfrag_debug_root;
2265
d7a5752c 2266 extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
d7a5752c 2267
d9f7979c 2268 debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL,
01a99560 2269 &unusable_fops);
d7a5752c 2270
d9f7979c 2271 debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL,
01a99560 2272 &extfrag_fops);
f1a5ab12 2273
d7a5752c
MG
2274 return 0;
2275}
2276
2277module_init(extfrag_debug_init);
2278#endif
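
With CONFIG_DEBUG_FS and CONFIG_COMPACTION enabled, the two files created above normally show up under the debugfs mount, typically /sys/kernel/debug/extfrag/. A hedged userspace sketch that simply dumps them (the mount point is the conventional default, not guaranteed):

/* Minimal sketch: dump the extfrag debugfs files created above. */
#include <stdio.h>

static void dump(const char *path)
{
	char line[512];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);	/* debugfs not mounted, or no permission */
		return;
	}
	printf("== %s ==\n", path);
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
}

int main(void)
{
	dump("/sys/kernel/debug/extfrag/unusable_index");
	dump("/sys/kernel/debug/extfrag/extfrag_index");
	return 0;
}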