1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  linux/mm/vmstat.c
4  *
5  *  Manages VM statistics
6  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
7  *
8  *  zoned VM statistics
9  *  Copyright (C) 2006 Silicon Graphics, Inc.,
10  *              Christoph Lameter <cl@gentwo.org>
11  *  Copyright (C) 2008-2014 Christoph Lameter
12  */
13 #include <linux/fs.h>
14 #include <linux/mm.h>
15 #include <linux/err.h>
16 #include <linux/module.h>
17 #include <linux/slab.h>
18 #include <linux/cpu.h>
19 #include <linux/cpumask.h>
20 #include <linux/vmstat.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/debugfs.h>
24 #include <linux/sched.h>
25 #include <linux/math64.h>
26 #include <linux/writeback.h>
27 #include <linux/compaction.h>
28 #include <linux/mm_inline.h>
29 #include <linux/page_owner.h>
30 #include <linux/sched/isolation.h>
31
32 #include "internal.h"
33
34 #ifdef CONFIG_PROC_FS
35 #ifdef CONFIG_NUMA
36 #define ENABLE_NUMA_STAT 1
37 static int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
38
39 /* zero numa counters within a zone */
40 static void zero_zone_numa_counters(struct zone *zone)
41 {
42         int item, cpu;
43
44         for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) {
45                 atomic_long_set(&zone->vm_numa_event[item], 0);
46                 for_each_online_cpu(cpu) {
47                         per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item]
48                                                 = 0;
49                 }
50         }
51 }
52
53 /* zero numa counters of all the populated zones */
54 static void zero_zones_numa_counters(void)
55 {
56         struct zone *zone;
57
58         for_each_populated_zone(zone)
59                 zero_zone_numa_counters(zone);
60 }
61
62 /* zero global numa counters */
63 static void zero_global_numa_counters(void)
64 {
65         int item;
66
67         for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
68                 atomic_long_set(&vm_numa_event[item], 0);
69 }
70
71 static void invalid_numa_statistics(void)
72 {
73         zero_zones_numa_counters();
74         zero_global_numa_counters();
75 }
76
77 static DEFINE_MUTEX(vm_numa_stat_lock);
78
79 static int sysctl_vm_numa_stat_handler(const struct ctl_table *table, int write,
80                 void *buffer, size_t *length, loff_t *ppos)
81 {
82         int ret, oldval;
83
84         mutex_lock(&vm_numa_stat_lock);
85         if (write)
86                 oldval = sysctl_vm_numa_stat;
87         ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
88         if (ret || !write)
89                 goto out;
90
91         if (oldval == sysctl_vm_numa_stat)
92                 goto out;
93         else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
94                 static_branch_enable(&vm_numa_stat_key);
95                 pr_info("enable numa statistics\n");
96         } else {
97                 static_branch_disable(&vm_numa_stat_key);
98                 invalid_numa_statistics();
99                 pr_info("disable numa statistics, and clear numa counters\n");
100         }
101
102 out:
103         mutex_unlock(&vm_numa_stat_lock);
104         return ret;
105 }
106 #endif
107 #endif /* CONFIG_PROC_FS */
108
109 #ifdef CONFIG_VM_EVENT_COUNTERS
110 DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
111 EXPORT_PER_CPU_SYMBOL(vm_event_states);
112
113 static void sum_vm_events(unsigned long *ret)
114 {
115         int cpu;
116         int i;
117
118         memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
119
120         for_each_online_cpu(cpu) {
121                 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
122
123                 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
124                         ret[i] += this->event[i];
125         }
126 }
127
128 /*
129  * Accumulate the vm event counters across all CPUs.
130  * The result is unavoidably approximate - it can change
131  * during and after execution of this function.
132  */
133 void all_vm_events(unsigned long *ret)
134 {
135         cpus_read_lock();
136         sum_vm_events(ret);
137         cpus_read_unlock();
138 }
139 EXPORT_SYMBOL_GPL(all_vm_events);
140
141 /*
142  * Fold the foreign cpu events into our own.
143  *
144  * This is adding to the events on one processor
145  * but keeps the global counts constant.
146  */
147 void vm_events_fold_cpu(int cpu)
148 {
149         struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
150         int i;
151
152         for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
153                 count_vm_events(i, fold_state->event[i]);
154                 fold_state->event[i] = 0;
155         }
156 }
157
158 #endif /* CONFIG_VM_EVENT_COUNTERS */
159
160 /*
161  * Manage combined zone based / global counters
162  *
163  * vm_stat contains the global counters
164  */
165 atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
166 atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
167 atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
168 EXPORT_SYMBOL(vm_zone_stat);
169 EXPORT_SYMBOL(vm_node_stat);
170
171 #ifdef CONFIG_NUMA
172 static void fold_vm_zone_numa_events(struct zone *zone)
173 {
174         unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
175         int cpu;
176         enum numa_stat_item item;
177
178         for_each_online_cpu(cpu) {
179                 struct per_cpu_zonestat *pzstats;
180
181                 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
182                 for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
183                         zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
184         }
185
186         for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
187                 zone_numa_event_add(zone_numa_events[item], zone, item);
188 }
189
190 void fold_vm_numa_events(void)
191 {
192         struct zone *zone;
193
194         for_each_populated_zone(zone)
195                 fold_vm_zone_numa_events(zone);
196 }
197 #endif
198
199 #ifdef CONFIG_SMP
200
201 int calculate_pressure_threshold(struct zone *zone)
202 {
203         int threshold;
204         int watermark_distance;
205
206         /*
207          * As vmstats are not up to date, there is drift between the estimated
208          * and real values. For high thresholds and a high number of CPUs, it
209          * is possible for the min watermark to be breached while the estimated
210          * value looks fine. The pressure threshold is a reduced value such
211          * that even the maximum amount of drift will not accidentally breach
212          * the min watermark
213          */
214         watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
215         threshold = max(1, (int)(watermark_distance / num_online_cpus()));
216
217         /*
218          * Maximum threshold is 125
219          */
220         threshold = min(125, threshold);
221
222         return threshold;
223 }
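
/*
 * Worked example with hypothetical numbers: for a zone with
 * min_wmark_pages() == 1024 and low_wmark_pages() == 1280 on a 16 CPU
 * machine, watermark_distance is 256 and the pressure threshold becomes
 * max(1, 256 / 16) == 16.  The worst case unaccounted drift is then
 * 16 CPUs * 16 pages == 256 pages, no more than the low-to-min gap, so an
 * estimate that still looks above the low watermark cannot hide a true
 * free count that has dropped below the min watermark.
 */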
224
225 int calculate_normal_threshold(struct zone *zone)
226 {
227         int threshold;
228         int mem;        /* memory in 128 MB units */
229
230         /*
231          * The threshold scales with the number of processors and the amount
232          * of memory per zone. More memory means that we can defer updates for
233          * longer; more processors could lead to more contention.
234          * fls() is used to have a cheap way of logarithmic scaling.
235          *
236          * Some sample thresholds:
237          *
238          * Threshold    Processors      (fls)   Zonesize        fls(mem)+1
239          * ------------------------------------------------------------------
240          * 8            1               1       0.9-1 GB        4
241          * 16           2               2       0.9-1 GB        4
242          * 20           2               2       1-2 GB          5
243          * 24           2               2       2-4 GB          6
244          * 28           2               2       4-8 GB          7
245          * 32           2               2       8-16 GB         8
246          * 4            2               2       <128M           1
247          * 30           4               3       2-4 GB          5
248          * 48           4               3       8-16 GB         8
249          * 32           8               4       1-2 GB          4
250          * 32           8               4       0.9-1GB         4
251          * 10           16              5       <128M           1
252          * 40           16              5       900M            4
253          * 70           64              7       2-4 GB          5
254          * 84           64              7       4-8 GB          6
255          * 108          512             9       4-8 GB          6
256          * 125          1024            10      8-16 GB         8
257          * 125          1024            10      16-32 GB        9
258          */
259
260         mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT);
261
262         threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
263
264         /*
265          * Maximum threshold is 125
266          */
267         threshold = min(125, threshold);
268
269         return threshold;
270 }
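
/*
 * Worked example of the formula above for a hypothetical zone: 2GB of
 * managed memory is mem == 16 units of 128MB, so with 4 online CPUs the
 * threshold is 2 * fls(4) * (1 + fls(16)) == 2 * 3 * 6 == 36, well under
 * the cap of 125.
 */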
271
272 /*
273  * Refresh the thresholds for each zone.
274  */
275 void refresh_zone_stat_thresholds(void)
276 {
277         struct pglist_data *pgdat;
278         struct zone *zone;
279         int cpu;
280         int threshold;
281
282         /* Zero current pgdat thresholds */
283         for_each_online_pgdat(pgdat) {
284                 for_each_online_cpu(cpu) {
285                         per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
286                 }
287         }
288
289         for_each_populated_zone(zone) {
290                 struct pglist_data *pgdat = zone->zone_pgdat;
291                 unsigned long max_drift, tolerate_drift;
292
293                 threshold = calculate_normal_threshold(zone);
294
295                 for_each_online_cpu(cpu) {
296                         int pgdat_threshold;
297
298                         per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
299                                                         = threshold;
300
301                         /* Base nodestat threshold on the largest populated zone. */
302                         pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
303                         per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
304                                 = max(threshold, pgdat_threshold);
305                 }
306
307                 /*
308                  * Only set percpu_drift_mark if there is a danger that
309                  * NR_FREE_PAGES reports the low watermark is ok when in fact
310                  * the min watermark could be breached by an allocation
311                  */
312                 tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
313                 max_drift = num_online_cpus() * threshold;
314                 if (max_drift > tolerate_drift)
315                         zone->percpu_drift_mark = high_wmark_pages(zone) +
316                                         max_drift;
317         }
318 }
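
/*
 * Continuing the hypothetical example above: with 4 online CPUs and a
 * normal threshold of 36, max_drift is 144 pages.  If the zone's
 * low-to-min watermark gap is only 128 pages, the estimated NR_FREE_PAGES
 * could look healthy while the true value is already below the min
 * watermark, so percpu_drift_mark is set to high_wmark_pages(zone) + 144
 * and zone_watermark_ok_safe() switches to the precise (and more
 * expensive) zone_page_state_snapshot() once free pages fall below that
 * mark.
 */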
319
320 void set_pgdat_percpu_threshold(pg_data_t *pgdat,
321                                 int (*calculate_pressure)(struct zone *))
322 {
323         struct zone *zone;
324         int cpu;
325         int threshold;
326         int i;
327
328         for (i = 0; i < pgdat->nr_zones; i++) {
329                 zone = &pgdat->node_zones[i];
330                 if (!zone->percpu_drift_mark)
331                         continue;
332
333                 threshold = (*calculate_pressure)(zone);
334                 for_each_online_cpu(cpu)
335                         per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
336                                                         = threshold;
337         }
338 }
339
340 /*
341  * For use when we know that interrupts are disabled,
342  * or when we know that preemption is disabled and that
343  * particular counter cannot be updated from interrupt context.
344  */
345 void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
346                            long delta)
347 {
348         struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
349         s8 __percpu *p = pcp->vm_stat_diff + item;
350         long x;
351         long t;
352
353         /*
354          * Accurate vmstat updates require a RMW. On !PREEMPT_RT kernels,
355          * atomicity is provided by IRQs being disabled -- either explicitly
356          * or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables
357          * CPU migrations and preemption potentially corrupts a counter so
358          * disable preemption.
359          */
360         preempt_disable_nested();
361
362         x = delta + __this_cpu_read(*p);
363
364         t = __this_cpu_read(pcp->stat_threshold);
365
366         if (unlikely(abs(x) > t)) {
367                 zone_page_state_add(x, zone, item);
368                 x = 0;
369         }
370         __this_cpu_write(*p, x);
371
372         preempt_enable_nested();
373 }
374 EXPORT_SYMBOL(__mod_zone_page_state);
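
/*
 * Example of the batching behaviour with hypothetical values: with a
 * stat_threshold of 32 and a per-cpu diff currently at 30, a call such as
 *
 *	__mod_zone_page_state(zone, NR_FREE_PAGES, 8);
 *
 * sees x == 38 > 32, folds the full 38 into the zone and global atomics
 * via zone_page_state_add() and resets the per-cpu diff to 0.  A later
 * delta of -5 merely leaves -5 in the per-cpu diff without touching any
 * shared cacheline.
 */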
375
376 void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
377                                 long delta)
378 {
379         struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
380         s8 __percpu *p = pcp->vm_node_stat_diff + item;
381         long x;
382         long t;
383
384         if (vmstat_item_in_bytes(item)) {
385                 /*
386                  * Only cgroups use subpage accounting right now; at
387                  * the global level, these items still change in
388                  * multiples of whole pages. Store them as pages
389                  * internally to keep the per-cpu counters compact.
390                  */
391                 VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
392                 delta >>= PAGE_SHIFT;
393         }
394
395         /* See __mod_zone_page_state */
396         preempt_disable_nested();
397
398         x = delta + __this_cpu_read(*p);
399
400         t = __this_cpu_read(pcp->stat_threshold);
401
402         if (unlikely(abs(x) > t)) {
403                 node_page_state_add(x, pgdat, item);
404                 x = 0;
405         }
406         __this_cpu_write(*p, x);
407
408         preempt_enable_nested();
409 }
410 EXPORT_SYMBOL(__mod_node_page_state);
411
412 /*
413  * Optimized increment and decrement functions.
414  *
415  * These are only for a single page and therefore can take a struct page *
416  * argument instead of struct zone *. This allows the inclusion of the code
417  * generated for page_zone(page) into the optimized functions.
418  *
419  * No overflow check is necessary and therefore the differential can be
420  * incremented or decremented in place which may allow the compilers to
421  * generate better code.
422  * The increment or decrement is known and therefore one boundary check can
423  * be omitted.
424  *
425  * NOTE: These functions are very performance sensitive. Change only
426  * with care.
427  *
428  * Some processors have inc/dec instructions that are atomic vs an interrupt.
429  * However, the code must first determine the differential location in a zone
430  * based on the processor number and then inc/dec the counter. There is no
431  * guarantee without disabling preemption that the processor will not change
432  * in between and therefore the atomicity vs. interrupt cannot be exploited
433  * in a useful way here.
434  */
435 void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
436 {
437         struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
438         s8 __percpu *p = pcp->vm_stat_diff + item;
439         s8 v, t;
440
441         /* See __mod_node_page_state */
442         preempt_disable_nested();
443
444         v = __this_cpu_inc_return(*p);
445         t = __this_cpu_read(pcp->stat_threshold);
446         if (unlikely(v > t)) {
447                 s8 overstep = t >> 1;
448
449                 zone_page_state_add(v + overstep, zone, item);
450                 __this_cpu_write(*p, -overstep);
451         }
452
453         preempt_enable_nested();
454 }
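
/*
 * Example of the overstep with hypothetical values: with a threshold of 32
 * and the per-cpu counter already at 32, the next increment yields
 * v == 33 > t, so 33 + 16 is folded into the zone counter and the per-cpu
 * counter is rewound to -16.  The following 48 increments are then pure
 * per-cpu updates, spacing the folds further apart than a reset to 0
 * would.
 */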
455
456 void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
457 {
458         struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
459         s8 __percpu *p = pcp->vm_node_stat_diff + item;
460         s8 v, t;
461
462         VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
463
464         /* See __mod_node_page_state */
465         preempt_disable_nested();
466
467         v = __this_cpu_inc_return(*p);
468         t = __this_cpu_read(pcp->stat_threshold);
469         if (unlikely(v > t)) {
470                 s8 overstep = t >> 1;
471
472                 node_page_state_add(v + overstep, pgdat, item);
473                 __this_cpu_write(*p, -overstep);
474         }
475
476         preempt_enable_nested();
477 }
478
479 void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
480 {
481         __inc_zone_state(page_zone(page), item);
482 }
483 EXPORT_SYMBOL(__inc_zone_page_state);
484
485 void __inc_node_page_state(struct page *page, enum node_stat_item item)
486 {
487         __inc_node_state(page_pgdat(page), item);
488 }
489 EXPORT_SYMBOL(__inc_node_page_state);
490
491 void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
492 {
493         struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
494         s8 __percpu *p = pcp->vm_stat_diff + item;
495         s8 v, t;
496
497         /* See __mod_node_page_state */
498         preempt_disable_nested();
499
500         v = __this_cpu_dec_return(*p);
501         t = __this_cpu_read(pcp->stat_threshold);
502         if (unlikely(v < -t)) {
503                 s8 overstep = t >> 1;
504
505                 zone_page_state_add(v - overstep, zone, item);
506                 __this_cpu_write(*p, overstep);
507         }
508
509         preempt_enable_nested();
510 }
511
512 void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
513 {
514         struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
515         s8 __percpu *p = pcp->vm_node_stat_diff + item;
516         s8 v, t;
517
518         VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
519
520         /* See __mod_node_page_state */
521         preempt_disable_nested();
522
523         v = __this_cpu_dec_return(*p);
524         t = __this_cpu_read(pcp->stat_threshold);
525         if (unlikely(v < -t)) {
526                 s8 overstep = t >> 1;
527
528                 node_page_state_add(v - overstep, pgdat, item);
529                 __this_cpu_write(*p, overstep);
530         }
531
532         preempt_enable_nested();
533 }
534
535 void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
536 {
537         __dec_zone_state(page_zone(page), item);
538 }
539 EXPORT_SYMBOL(__dec_zone_page_state);
540
541 void __dec_node_page_state(struct page *page, enum node_stat_item item)
542 {
543         __dec_node_state(page_pgdat(page), item);
544 }
545 EXPORT_SYMBOL(__dec_node_page_state);
546
547 #ifdef CONFIG_HAVE_CMPXCHG_LOCAL
548 /*
549  * If we have cmpxchg_local support then we do not need to incur the overhead
550  * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
551  *
552  * mod_state() modifies the zone counter state through atomic per cpu
553  * operations.
554  *
555  * Overstep mode specifies how overstep should be handled:
556  *     0       No overstepping
557  *     1       Overstepping half of threshold
558  *     -1      Overstepping minus half of threshold
559  */
560 static inline void mod_zone_state(struct zone *zone,
561        enum zone_stat_item item, long delta, int overstep_mode)
562 {
563         struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
564         s8 __percpu *p = pcp->vm_stat_diff + item;
565         long n, t, z;
566         s8 o;
567
568         o = this_cpu_read(*p);
569         do {
570                 z = 0;  /* overflow to zone counters */
571
572                 /*
573                  * The fetching of the stat_threshold is racy. We may apply
574                  * a counter threshold to the wrong cpu if we get
575                  * rescheduled while executing here. However, the next
576                  * counter update will apply the threshold again and
577                  * therefore bring the counter under the threshold again.
578                  *
579                  * Most of the time the thresholds are the same anyway
580                  * for all cpus in a zone.
581                  */
582                 t = this_cpu_read(pcp->stat_threshold);
583
584                 n = delta + (long)o;
585
586                 if (abs(n) > t) {
587                         int os = overstep_mode * (t >> 1);
588
589                         /* Overflow must be added to zone counters */
590                         z = n + os;
591                         n = -os;
592                 }
593         } while (!this_cpu_try_cmpxchg(*p, &o, n));
594
595         if (z)
596                 zone_page_state_add(z, zone, item);
597 }
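
/*
 * The try_cmpxchg loop above is what makes mod_zone_state() safe without
 * disabling interrupts: if an interrupt (or, on PREEMPT_RT, a preempting
 * task) updates the per-cpu diff between the this_cpu_read() and the
 * this_cpu_try_cmpxchg(), the cmpxchg fails, "o" is refreshed with the
 * current value and "n" is simply recomputed on the next pass.  Only the
 * overflow "z", if any, ends up touching the shared atomic counters.
 */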
598
599 void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
600                          long delta)
601 {
602         mod_zone_state(zone, item, delta, 0);
603 }
604 EXPORT_SYMBOL(mod_zone_page_state);
605
606 void inc_zone_page_state(struct page *page, enum zone_stat_item item)
607 {
608         mod_zone_state(page_zone(page), item, 1, 1);
609 }
610 EXPORT_SYMBOL(inc_zone_page_state);
611
612 void dec_zone_page_state(struct page *page, enum zone_stat_item item)
613 {
614         mod_zone_state(page_zone(page), item, -1, -1);
615 }
616 EXPORT_SYMBOL(dec_zone_page_state);
617
618 static inline void mod_node_state(struct pglist_data *pgdat,
619        enum node_stat_item item, int delta, int overstep_mode)
620 {
621         struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
622         s8 __percpu *p = pcp->vm_node_stat_diff + item;
623         long n, t, z;
624         s8 o;
625
626         if (vmstat_item_in_bytes(item)) {
627                 /*
628                  * Only cgroups use subpage accounting right now; at
629                  * the global level, these items still change in
630                  * multiples of whole pages. Store them as pages
631                  * internally to keep the per-cpu counters compact.
632                  */
633                 VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
634                 delta >>= PAGE_SHIFT;
635         }
636
637         o = this_cpu_read(*p);
638         do {
639                 z = 0;  /* overflow to node counters */
640
641                 /*
642                  * The fetching of the stat_threshold is racy. We may apply
643                  * a counter threshold to the wrong cpu if we get
644                  * rescheduled while executing here. However, the next
645                  * counter update will apply the threshold again and
646                  * therefore bring the counter under the threshold again.
647                  *
648                  * Most of the time the thresholds are the same anyway
649                  * for all cpus in a node.
650                  */
651                 t = this_cpu_read(pcp->stat_threshold);
652
653                 n = delta + (long)o;
654
655                 if (abs(n) > t) {
656                         int os = overstep_mode * (t >> 1);
657
658                         /* Overflow must be added to node counters */
659                         z = n + os;
660                         n = -os;
661                 }
662         } while (!this_cpu_try_cmpxchg(*p, &o, n));
663
664         if (z)
665                 node_page_state_add(z, pgdat, item);
666 }
667
668 void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
669                                         long delta)
670 {
671         mod_node_state(pgdat, item, delta, 0);
672 }
673 EXPORT_SYMBOL(mod_node_page_state);
674
675 void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
676 {
677         mod_node_state(pgdat, item, 1, 1);
678 }
679
680 void inc_node_page_state(struct page *page, enum node_stat_item item)
681 {
682         mod_node_state(page_pgdat(page), item, 1, 1);
683 }
684 EXPORT_SYMBOL(inc_node_page_state);
685
686 void dec_node_page_state(struct page *page, enum node_stat_item item)
687 {
688         mod_node_state(page_pgdat(page), item, -1, -1);
689 }
690 EXPORT_SYMBOL(dec_node_page_state);
691 #else
692 /*
693  * Use interrupt disable to serialize counter updates
694  */
695 void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
696                          long delta)
697 {
698         unsigned long flags;
699
700         local_irq_save(flags);
701         __mod_zone_page_state(zone, item, delta);
702         local_irq_restore(flags);
703 }
704 EXPORT_SYMBOL(mod_zone_page_state);
705
706 void inc_zone_page_state(struct page *page, enum zone_stat_item item)
707 {
708         unsigned long flags;
709         struct zone *zone;
710
711         zone = page_zone(page);
712         local_irq_save(flags);
713         __inc_zone_state(zone, item);
714         local_irq_restore(flags);
715 }
716 EXPORT_SYMBOL(inc_zone_page_state);
717
718 void dec_zone_page_state(struct page *page, enum zone_stat_item item)
719 {
720         unsigned long flags;
721
722         local_irq_save(flags);
723         __dec_zone_page_state(page, item);
724         local_irq_restore(flags);
725 }
726 EXPORT_SYMBOL(dec_zone_page_state);
727
728 void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
729 {
730         unsigned long flags;
731
732         local_irq_save(flags);
733         __inc_node_state(pgdat, item);
734         local_irq_restore(flags);
735 }
736 EXPORT_SYMBOL(inc_node_state);
737
738 void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
739                                         long delta)
740 {
741         unsigned long flags;
742
743         local_irq_save(flags);
744         __mod_node_page_state(pgdat, item, delta);
745         local_irq_restore(flags);
746 }
747 EXPORT_SYMBOL(mod_node_page_state);
748
749 void inc_node_page_state(struct page *page, enum node_stat_item item)
750 {
751         unsigned long flags;
752         struct pglist_data *pgdat;
753
754         pgdat = page_pgdat(page);
755         local_irq_save(flags);
756         __inc_node_state(pgdat, item);
757         local_irq_restore(flags);
758 }
759 EXPORT_SYMBOL(inc_node_page_state);
760
761 void dec_node_page_state(struct page *page, enum node_stat_item item)
762 {
763         unsigned long flags;
764
765         local_irq_save(flags);
766         __dec_node_page_state(page, item);
767         local_irq_restore(flags);
768 }
769 EXPORT_SYMBOL(dec_node_page_state);
770 #endif
771
772 /*
773  * Fold a differential into the global counters.
774  * Returns the number of counters updated.
775  */
776 static int fold_diff(int *zone_diff, int *node_diff)
777 {
778         int i;
779         int changes = 0;
780
781         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
782                 if (zone_diff[i]) {
783                         atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
784                         changes++;
785                 }
786
787         for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
788                 if (node_diff[i]) {
789                         atomic_long_add(node_diff[i], &vm_node_stat[i]);
790                         changes++;
791                 }
792         return changes;
793 }
794
795 /*
796  * Update the zone counters for the current cpu.
797  *
798  * Note that refresh_cpu_vm_stats strives to only access
799  * node local memory. The per cpu pagesets on remote zones are placed
800  * in the memory local to the processor using that pageset. So the
801  * loop over all zones will access a series of cachelines local to
802  * the processor.
803  *
804  * The call to zone_page_state_add updates the cachelines with the
805  * statistics in the remote zone struct as well as the global cachelines
806  * with the global counters. These updates could cause remote node cache line
807  * bouncing and should therefore only be done when necessary.
808  *
809  * The function returns the number of global counters updated.
810  */
811 static int refresh_cpu_vm_stats(bool do_pagesets)
812 {
813         struct pglist_data *pgdat;
814         struct zone *zone;
815         int i;
816         int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
817         int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
818         int changes = 0;
819
820         for_each_populated_zone(zone) {
821                 struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
822                 struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
823
824                 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
825                         int v;
826
827                         v = this_cpu_xchg(pzstats->vm_stat_diff[i], 0);
828                         if (v) {
829
830                                 atomic_long_add(v, &zone->vm_stat[i]);
831                                 global_zone_diff[i] += v;
832 #ifdef CONFIG_NUMA
833                                 /* 3 seconds idle till flush */
834                                 __this_cpu_write(pcp->expire, 3);
835 #endif
836                         }
837                 }
838
839                 if (do_pagesets) {
840                         cond_resched();
841
842                         changes += decay_pcp_high(zone, this_cpu_ptr(pcp));
843 #ifdef CONFIG_NUMA
844                         /*
845                          * Deal with draining the remote pageset of this
846                          * processor.
847                          *
848                          * Check if there are pages remaining in this pageset;
849                          * if not, there is nothing to expire.
850                          */
851                         if (!__this_cpu_read(pcp->expire) ||
852                                !__this_cpu_read(pcp->count))
853                                 continue;
854
855                         /*
856                          * We never drain zones local to this processor.
857                          */
858                         if (zone_to_nid(zone) == numa_node_id()) {
859                                 __this_cpu_write(pcp->expire, 0);
860                                 continue;
861                         }
862
863                         if (__this_cpu_dec_return(pcp->expire)) {
864                                 changes++;
865                                 continue;
866                         }
867
868                         if (__this_cpu_read(pcp->count)) {
869                                 drain_zone_pages(zone, this_cpu_ptr(pcp));
870                                 changes++;
871                         }
872 #endif
873                 }
874         }
875
876         for_each_online_pgdat(pgdat) {
877                 struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
878
879                 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
880                         int v;
881
882                         v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
883                         if (v) {
884                                 atomic_long_add(v, &pgdat->vm_stat[i]);
885                                 global_node_diff[i] += v;
886                         }
887                 }
888         }
889
890         changes += fold_diff(global_zone_diff, global_node_diff);
891         return changes;
892 }
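
/*
 * Example of the pcp->expire countdown above: refresh_cpu_vm_stats(true)
 * is run from the vmstat_update worker roughly once per stat interval
 * (one second by default).  A zone whose per-cpu counters saw updates has
 * its expire value reset to 3, and for zones remote to this CPU that value
 * is decremented once per pass, so pages sitting in a remote zone's pcp
 * lists are only drained back via drain_zone_pages() after roughly three
 * idle passes, i.e. the "3 seconds idle till flush" noted above.
 */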
893
894 /*
895  * Fold the data for an offline cpu into the global array.
896  * There cannot be any access by the offline cpu and therefore
897  * synchronization is simplified.
898  */
899 void cpu_vm_stats_fold(int cpu)
900 {
901         struct pglist_data *pgdat;
902         struct zone *zone;
903         int i;
904         int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
905         int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
906
907         for_each_populated_zone(zone) {
908                 struct per_cpu_zonestat *pzstats;
909
910                 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
911
912                 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
913                         if (pzstats->vm_stat_diff[i]) {
914                                 int v;
915
916                                 v = pzstats->vm_stat_diff[i];
917                                 pzstats->vm_stat_diff[i] = 0;
918                                 atomic_long_add(v, &zone->vm_stat[i]);
919                                 global_zone_diff[i] += v;
920                         }
921                 }
922 #ifdef CONFIG_NUMA
923                 for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
924                         if (pzstats->vm_numa_event[i]) {
925                                 unsigned long v;
926
927                                 v = pzstats->vm_numa_event[i];
928                                 pzstats->vm_numa_event[i] = 0;
929                                 zone_numa_event_add(v, zone, i);
930                         }
931                 }
932 #endif
933         }
934
935         for_each_online_pgdat(pgdat) {
936                 struct per_cpu_nodestat *p;
937
938                 p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
939
940                 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
941                         if (p->vm_node_stat_diff[i]) {
942                                 int v;
943
944                                 v = p->vm_node_stat_diff[i];
945                                 p->vm_node_stat_diff[i] = 0;
946                                 atomic_long_add(v, &pgdat->vm_stat[i]);
947                                 global_node_diff[i] += v;
948                         }
949         }
950
951         fold_diff(global_zone_diff, global_node_diff);
952 }
953
954 /*
955  * This is only called if !populated_zone(zone), which implies no other users of
956  * pzstats->vm_stat_diff[] exist.
957  */
958 void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats)
959 {
960         unsigned long v;
961         int i;
962
963         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
964                 if (pzstats->vm_stat_diff[i]) {
965                         v = pzstats->vm_stat_diff[i];
966                         pzstats->vm_stat_diff[i] = 0;
967                         zone_page_state_add(v, zone, i);
968                 }
969         }
970
971 #ifdef CONFIG_NUMA
972         for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
973                 if (pzstats->vm_numa_event[i]) {
974                         v = pzstats->vm_numa_event[i];
975                         pzstats->vm_numa_event[i] = 0;
976                         zone_numa_event_add(v, zone, i);
977                 }
978         }
979 #endif
980 }
981 #endif
982
983 #ifdef CONFIG_NUMA
984 /*
985  * Determine the per node value of a stat item. This function
986  * is called frequently in a NUMA machine, so try to be as
987  * frugal as possible.
988  */
989 unsigned long sum_zone_node_page_state(int node,
990                                  enum zone_stat_item item)
991 {
992         struct zone *zones = NODE_DATA(node)->node_zones;
993         int i;
994         unsigned long count = 0;
995
996         for (i = 0; i < MAX_NR_ZONES; i++)
997                 count += zone_page_state(zones + i, item);
998
999         return count;
1000 }
1001
1002 /* Determine the per node value of a numa stat item. */
1003 unsigned long sum_zone_numa_event_state(int node,
1004                                  enum numa_stat_item item)
1005 {
1006         struct zone *zones = NODE_DATA(node)->node_zones;
1007         unsigned long count = 0;
1008         int i;
1009
1010         for (i = 0; i < MAX_NR_ZONES; i++)
1011                 count += zone_numa_event_state(zones + i, item);
1012
1013         return count;
1014 }
1015
1016 /*
1017  * Determine the per node value of a stat item.
1018  */
1019 unsigned long node_page_state_pages(struct pglist_data *pgdat,
1020                                     enum node_stat_item item)
1021 {
1022         long x = atomic_long_read(&pgdat->vm_stat[item]);
1023 #ifdef CONFIG_SMP
1024         if (x < 0)
1025                 x = 0;
1026 #endif
1027         return x;
1028 }
1029
1030 unsigned long node_page_state(struct pglist_data *pgdat,
1031                               enum node_stat_item item)
1032 {
1033         VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
1034
1035         return node_page_state_pages(pgdat, item);
1036 }
1037 #endif
1038
1039 /*
1040  * Count the number of pages that "struct page" and "struct page_ext" consume.
1041  * nr_memmap_boot_pages: # of pages allocated by boot allocator
1042  * nr_memmap_pages: # of pages that were allocated by buddy allocator
1043  */
1044 static atomic_long_t nr_memmap_boot_pages = ATOMIC_LONG_INIT(0);
1045 static atomic_long_t nr_memmap_pages = ATOMIC_LONG_INIT(0);
1046
1047 void memmap_boot_pages_add(long delta)
1048 {
1049         atomic_long_add(delta, &nr_memmap_boot_pages);
1050 }
1051
1052 void memmap_pages_add(long delta)
1053 {
1054         atomic_long_add(delta, &nr_memmap_pages);
1055 }
1056
1057 #ifdef CONFIG_COMPACTION
1058
1059 struct contig_page_info {
1060         unsigned long free_pages;
1061         unsigned long free_blocks_total;
1062         unsigned long free_blocks_suitable;
1063 };
1064
1065 /*
1066  * Calculate the number of free pages in a zone, how many contiguous
1067  * pages are free and how many are large enough to satisfy an allocation of
1068  * the target size. Note that this function makes no attempt to estimate
1069  * how many suitable free blocks there *might* be if MOVABLE pages were
1070  * migrated. Calculating that is possible, but expensive and can be
1071  * figured out from userspace
1072  */
1073 static void fill_contig_page_info(struct zone *zone,
1074                                 unsigned int suitable_order,
1075                                 struct contig_page_info *info)
1076 {
1077         unsigned int order;
1078
1079         info->free_pages = 0;
1080         info->free_blocks_total = 0;
1081         info->free_blocks_suitable = 0;
1082
1083         for (order = 0; order < NR_PAGE_ORDERS; order++) {
1084                 unsigned long blocks;
1085
1086                 /*
1087                  * Count number of free blocks.
1088                  *
1089                  * Access to nr_free is lockless as nr_free is used only for
1090                  * diagnostic purposes. Use data_race to avoid KCSAN warning.
1091                  */
1092                 blocks = data_race(zone->free_area[order].nr_free);
1093                 info->free_blocks_total += blocks;
1094
1095                 /* Count free base pages */
1096                 info->free_pages += blocks << order;
1097
1098                 /* Count the suitable free blocks */
1099                 if (order >= suitable_order)
1100                         info->free_blocks_suitable += blocks <<
1101                                                 (order - suitable_order);
1102         }
1103 }
1104
1105 /*
1106  * A fragmentation index only makes sense if an allocation of a requested
1107  * size would fail. If that is true, the fragmentation index indicates
1108  * whether external fragmentation or a lack of memory was the problem.
1109  * The value can be used to determine if page reclaim or compaction
1110  * should be used
1111  */
1112 static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
1113 {
1114         unsigned long requested = 1UL << order;
1115
1116         if (WARN_ON_ONCE(order > MAX_PAGE_ORDER))
1117                 return 0;
1118
1119         if (!info->free_blocks_total)
1120                 return 0;
1121
1122         /* Fragmentation index only makes sense when a request would fail */
1123         if (info->free_blocks_suitable)
1124                 return -1000;
1125
1126         /*
1127          * Index is between 0 and 1 so return within 3 decimal places
1128          *
1129          * 0 => allocation would fail due to lack of memory
1130          * 1 => allocation would fail due to fragmentation
1131          */
1132         return 1000 - div_u64((1000 + div_u64(info->free_pages * 1000ULL, requested)), info->free_blocks_total);
1133 }
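
/*
 * Worked example for a hypothetical zone: an order-4 request (requested ==
 * 16) against free_pages == 1000 scattered over free_blocks_total == 1000
 * order-0 blocks, with no suitable block, gives
 * 1000 - (1000 + 1000 * 1000 / 16) / 1000 == 937, i.e. roughly 0.937:
 * plenty of free memory, just too fragmented for the request.  Small (or
 * negative) values, which occur when only a handful of free blocks exist,
 * instead point at a plain lack of memory.
 */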
1134
1135 /*
1136  * Calculates external fragmentation within a zone wrt the given order.
1137  * It is defined as the percentage of pages found in blocks of size
1138  * less than 1 << order. It returns values in range [0, 100].
1139  */
1140 unsigned int extfrag_for_order(struct zone *zone, unsigned int order)
1141 {
1142         struct contig_page_info info;
1143
1144         fill_contig_page_info(zone, order, &info);
1145         if (info.free_pages == 0)
1146                 return 0;
1147
1148         return div_u64((info.free_pages -
1149                         (info.free_blocks_suitable << order)) * 100,
1150                         info.free_pages);
1151 }
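
/*
 * Worked example for a hypothetical zone: at order == 3, a zone holding
 * one free order-3 block (8 pages) plus 8 free order-0 pages has
 * info.free_pages == 16 and info.free_blocks_suitable == 1, so the
 * external fragmentation is (16 - (1 << 3)) * 100 / 16 == 50, i.e. half
 * of the free memory sits in blocks smaller than 1 << 3 pages.
 */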
1152
1153 /* Same as __fragmentation_index() but allocates contig_page_info on the stack */
1154 int fragmentation_index(struct zone *zone, unsigned int order)
1155 {
1156         struct contig_page_info info;
1157
1158         fill_contig_page_info(zone, order, &info);
1159         return __fragmentation_index(order, &info);
1160 }
1161 #endif
1162
1163 #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \
1164     defined(CONFIG_NUMA) || defined(CONFIG_MEMCG)
1165 #ifdef CONFIG_ZONE_DMA
1166 #define TEXT_FOR_DMA(xx) xx "_dma",
1167 #else
1168 #define TEXT_FOR_DMA(xx)
1169 #endif
1170
1171 #ifdef CONFIG_ZONE_DMA32
1172 #define TEXT_FOR_DMA32(xx) xx "_dma32",
1173 #else
1174 #define TEXT_FOR_DMA32(xx)
1175 #endif
1176
1177 #ifdef CONFIG_HIGHMEM
1178 #define TEXT_FOR_HIGHMEM(xx) xx "_high",
1179 #else
1180 #define TEXT_FOR_HIGHMEM(xx)
1181 #endif
1182
1183 #ifdef CONFIG_ZONE_DEVICE
1184 #define TEXT_FOR_DEVICE(xx) xx "_device",
1185 #else
1186 #define TEXT_FOR_DEVICE(xx)
1187 #endif
1188
1189 #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
1190                                         TEXT_FOR_HIGHMEM(xx) xx "_movable", \
1191                                         TEXT_FOR_DEVICE(xx)
1192
1193 const char * const vmstat_text[] = {
1194         /* enum zone_stat_item counters */
1195         "nr_free_pages",
1196         "nr_free_pages_blocks",
1197         "nr_zone_inactive_anon",
1198         "nr_zone_active_anon",
1199         "nr_zone_inactive_file",
1200         "nr_zone_active_file",
1201         "nr_zone_unevictable",
1202         "nr_zone_write_pending",
1203         "nr_mlock",
1204 #if IS_ENABLED(CONFIG_ZSMALLOC)
1205         "nr_zspages",
1206 #endif
1207         "nr_free_cma",
1208 #ifdef CONFIG_UNACCEPTED_MEMORY
1209         "nr_unaccepted",
1210 #endif
1211
1212         /* enum numa_stat_item counters */
1213 #ifdef CONFIG_NUMA
1214         "numa_hit",
1215         "numa_miss",
1216         "numa_foreign",
1217         "numa_interleave",
1218         "numa_local",
1219         "numa_other",
1220 #endif
1221
1222         /* enum node_stat_item counters */
1223         "nr_inactive_anon",
1224         "nr_active_anon",
1225         "nr_inactive_file",
1226         "nr_active_file",
1227         "nr_unevictable",
1228         "nr_slab_reclaimable",
1229         "nr_slab_unreclaimable",
1230         "nr_isolated_anon",
1231         "nr_isolated_file",
1232         "workingset_nodes",
1233         "workingset_refault_anon",
1234         "workingset_refault_file",
1235         "workingset_activate_anon",
1236         "workingset_activate_file",
1237         "workingset_restore_anon",
1238         "workingset_restore_file",
1239         "workingset_nodereclaim",
1240         "nr_anon_pages",
1241         "nr_mapped",
1242         "nr_file_pages",
1243         "nr_dirty",
1244         "nr_writeback",
1245         "nr_writeback_temp",
1246         "nr_shmem",
1247         "nr_shmem_hugepages",
1248         "nr_shmem_pmdmapped",
1249         "nr_file_hugepages",
1250         "nr_file_pmdmapped",
1251         "nr_anon_transparent_hugepages",
1252         "nr_vmscan_write",
1253         "nr_vmscan_immediate_reclaim",
1254         "nr_dirtied",
1255         "nr_written",
1256         "nr_throttled_written",
1257         "nr_kernel_misc_reclaimable",
1258         "nr_foll_pin_acquired",
1259         "nr_foll_pin_released",
1260         "nr_kernel_stack",
1261 #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
1262         "nr_shadow_call_stack",
1263 #endif
1264         "nr_page_table_pages",
1265         "nr_sec_page_table_pages",
1266 #ifdef CONFIG_IOMMU_SUPPORT
1267         "nr_iommu_pages",
1268 #endif
1269 #ifdef CONFIG_SWAP
1270         "nr_swapcached",
1271 #endif
1272 #ifdef CONFIG_NUMA_BALANCING
1273         "pgpromote_success",
1274         "pgpromote_candidate",
1275 #endif
1276         "pgdemote_kswapd",
1277         "pgdemote_direct",
1278         "pgdemote_khugepaged",
1279         "pgdemote_proactive",
1280 #ifdef CONFIG_HUGETLB_PAGE
1281         "nr_hugetlb",
1282 #endif
1283         "nr_balloon_pages",
1284         /* system-wide enum vm_stat_item counters */
1285         "nr_dirty_threshold",
1286         "nr_dirty_background_threshold",
1287         "nr_memmap_pages",
1288         "nr_memmap_boot_pages",
1289
1290 #if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
1291         /* enum vm_event_item counters */
1292         "pgpgin",
1293         "pgpgout",
1294         "pswpin",
1295         "pswpout",
1296
1297         TEXTS_FOR_ZONES("pgalloc")
1298         TEXTS_FOR_ZONES("allocstall")
1299         TEXTS_FOR_ZONES("pgskip")
1300
1301         "pgfree",
1302         "pgactivate",
1303         "pgdeactivate",
1304         "pglazyfree",
1305
1306         "pgfault",
1307         "pgmajfault",
1308         "pglazyfreed",
1309
1310         "pgrefill",
1311         "pgreuse",
1312         "pgsteal_kswapd",
1313         "pgsteal_direct",
1314         "pgsteal_khugepaged",
1315         "pgsteal_proactive",
1316         "pgscan_kswapd",
1317         "pgscan_direct",
1318         "pgscan_khugepaged",
1319         "pgscan_proactive",
1320         "pgscan_direct_throttle",
1321         "pgscan_anon",
1322         "pgscan_file",
1323         "pgsteal_anon",
1324         "pgsteal_file",
1325
1326 #ifdef CONFIG_NUMA
1327         "zone_reclaim_success",
1328         "zone_reclaim_failed",
1329 #endif
1330         "pginodesteal",
1331         "slabs_scanned",
1332         "kswapd_inodesteal",
1333         "kswapd_low_wmark_hit_quickly",
1334         "kswapd_high_wmark_hit_quickly",
1335         "pageoutrun",
1336
1337         "pgrotated",
1338
1339         "drop_pagecache",
1340         "drop_slab",
1341         "oom_kill",
1342
1343 #ifdef CONFIG_NUMA_BALANCING
1344         "numa_pte_updates",
1345         "numa_huge_pte_updates",
1346         "numa_hint_faults",
1347         "numa_hint_faults_local",
1348         "numa_pages_migrated",
1349 #endif
1350 #ifdef CONFIG_MIGRATION
1351         "pgmigrate_success",
1352         "pgmigrate_fail",
1353         "thp_migration_success",
1354         "thp_migration_fail",
1355         "thp_migration_split",
1356 #endif
1357 #ifdef CONFIG_COMPACTION
1358         "compact_migrate_scanned",
1359         "compact_free_scanned",
1360         "compact_isolated",
1361         "compact_stall",
1362         "compact_fail",
1363         "compact_success",
1364         "compact_daemon_wake",
1365         "compact_daemon_migrate_scanned",
1366         "compact_daemon_free_scanned",
1367 #endif
1368
1369 #ifdef CONFIG_HUGETLB_PAGE
1370         "htlb_buddy_alloc_success",
1371         "htlb_buddy_alloc_fail",
1372 #endif
1373 #ifdef CONFIG_CMA
1374         "cma_alloc_success",
1375         "cma_alloc_fail",
1376 #endif
1377         "unevictable_pgs_culled",
1378         "unevictable_pgs_scanned",
1379         "unevictable_pgs_rescued",
1380         "unevictable_pgs_mlocked",
1381         "unevictable_pgs_munlocked",
1382         "unevictable_pgs_cleared",
1383         "unevictable_pgs_stranded",
1384
1385 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1386         "thp_fault_alloc",
1387         "thp_fault_fallback",
1388         "thp_fault_fallback_charge",
1389         "thp_collapse_alloc",
1390         "thp_collapse_alloc_failed",
1391         "thp_file_alloc",
1392         "thp_file_fallback",
1393         "thp_file_fallback_charge",
1394         "thp_file_mapped",
1395         "thp_split_page",
1396         "thp_split_page_failed",
1397         "thp_deferred_split_page",
1398         "thp_underused_split_page",
1399         "thp_split_pmd",
1400         "thp_scan_exceed_none_pte",
1401         "thp_scan_exceed_swap_pte",
1402         "thp_scan_exceed_share_pte",
1403 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1404         "thp_split_pud",
1405 #endif
1406         "thp_zero_page_alloc",
1407         "thp_zero_page_alloc_failed",
1408         "thp_swpout",
1409         "thp_swpout_fallback",
1410 #endif
1411 #ifdef CONFIG_MEMORY_BALLOON
1412         "balloon_inflate",
1413         "balloon_deflate",
1414 #ifdef CONFIG_BALLOON_COMPACTION
1415         "balloon_migrate",
1416 #endif
1417 #endif /* CONFIG_MEMORY_BALLOON */
1418 #ifdef CONFIG_DEBUG_TLBFLUSH
1419         "nr_tlb_remote_flush",
1420         "nr_tlb_remote_flush_received",
1421         "nr_tlb_local_flush_all",
1422         "nr_tlb_local_flush_one",
1423 #endif /* CONFIG_DEBUG_TLBFLUSH */
1424
1425 #ifdef CONFIG_SWAP
1426         "swap_ra",
1427         "swap_ra_hit",
1428         "swpin_zero",
1429         "swpout_zero",
1430 #ifdef CONFIG_KSM
1431         "ksm_swpin_copy",
1432 #endif
1433 #endif
1434 #ifdef CONFIG_KSM
1435         "cow_ksm",
1436 #endif
1437 #ifdef CONFIG_ZSWAP
1438         "zswpin",
1439         "zswpout",
1440         "zswpwb",
1441 #endif
1442 #ifdef CONFIG_X86
1443         "direct_map_level2_splits",
1444         "direct_map_level3_splits",
1445         "direct_map_level2_collapses",
1446         "direct_map_level3_collapses",
1447 #endif
1448 #ifdef CONFIG_PER_VMA_LOCK_STATS
1449         "vma_lock_success",
1450         "vma_lock_abort",
1451         "vma_lock_retry",
1452         "vma_lock_miss",
1453 #endif
1454 #ifdef CONFIG_DEBUG_STACK_USAGE
1455         "kstack_1k",
1456 #if THREAD_SIZE > 1024
1457         "kstack_2k",
1458 #endif
1459 #if THREAD_SIZE > 2048
1460         "kstack_4k",
1461 #endif
1462 #if THREAD_SIZE > 4096
1463         "kstack_8k",
1464 #endif
1465 #if THREAD_SIZE > 8192
1466         "kstack_16k",
1467 #endif
1468 #if THREAD_SIZE > 16384
1469         "kstack_32k",
1470 #endif
1471 #if THREAD_SIZE > 32768
1472         "kstack_64k",
1473 #endif
1474 #if THREAD_SIZE > 65536
1475         "kstack_rest",
1476 #endif
1477 #endif
1478 #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
1479 };
1480 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
1481
1482 #if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
1483      defined(CONFIG_PROC_FS)
1484 static void *frag_start(struct seq_file *m, loff_t *pos)
1485 {
1486         pg_data_t *pgdat;
1487         loff_t node = *pos;
1488
1489         for (pgdat = first_online_pgdat();
1490              pgdat && node;
1491              pgdat = next_online_pgdat(pgdat))
1492                 --node;
1493
1494         return pgdat;
1495 }
1496
1497 static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
1498 {
1499         pg_data_t *pgdat = (pg_data_t *)arg;
1500
1501         (*pos)++;
1502         return next_online_pgdat(pgdat);
1503 }
1504
1505 static void frag_stop(struct seq_file *m, void *arg)
1506 {
1507 }
1508
1509 /*
1510  * Walk zones in a node and print using a callback.
1511  * If @assert_populated is true, only use callback for zones that are populated.
1512  */
1513 static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
1514                 bool assert_populated, bool nolock,
1515                 void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
1516 {
1517         struct zone *zone;
1518         struct zone *node_zones = pgdat->node_zones;
1519         unsigned long flags;
1520
1521         for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
1522                 if (assert_populated && !populated_zone(zone))
1523                         continue;
1524
1525                 if (!nolock)
1526                         spin_lock_irqsave(&zone->lock, flags);
1527                 print(m, pgdat, zone);
1528                 if (!nolock)
1529                         spin_unlock_irqrestore(&zone->lock, flags);
1530         }
1531 }
1532 #endif
1533
1534 #ifdef CONFIG_PROC_FS
1535 static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
1536                                                 struct zone *zone)
1537 {
1538         int order;
1539
1540         seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1541         for (order = 0; order < NR_PAGE_ORDERS; ++order)
1542                 /*
1543                  * Access to nr_free is lockless as nr_free is used only for
1544                  * printing purposes. Use data_race to avoid KCSAN warning.
1545                  */
1546                 seq_printf(m, "%6lu ", data_race(zone->free_area[order].nr_free));
1547         seq_putc(m, '\n');
1548 }
1549
1550 /*
1551  * This walks the free areas for each zone.
1552  */
1553 static int frag_show(struct seq_file *m, void *arg)
1554 {
1555         pg_data_t *pgdat = (pg_data_t *)arg;
1556         walk_zones_in_node(m, pgdat, true, false, frag_show_print);
1557         return 0;
1558 }
1559
1560 static void pagetypeinfo_showfree_print(struct seq_file *m,
1561                                         pg_data_t *pgdat, struct zone *zone)
1562 {
1563         int order, mtype;
1564
1565         for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
1566                 seq_printf(m, "Node %4d, zone %8s, type %12s ",
1567                                         pgdat->node_id,
1568                                         zone->name,
1569                                         migratetype_names[mtype]);
1570                 for (order = 0; order < NR_PAGE_ORDERS; ++order) {
1571                         unsigned long freecount = 0;
1572                         struct free_area *area;
1573                         struct list_head *curr;
1574                         bool overflow = false;
1575
1576                         area = &(zone->free_area[order]);
1577
1578                         list_for_each(curr, &area->free_list[mtype]) {
1579                                 /*
1580                                  * Cap the free_list iteration because it might
1581                                  * be really large and we are under a spinlock
1582                                  * so a long time spent here could trigger a
1583                                  * hard lockup detector. Anyway this is a
1584                                  * debugging tool so knowing there is a handful
1585                                  * of pages of this order should be more than
1586                                  * sufficient.
1587                                  */
1588                                 if (++freecount >= 100000) {
1589                                         overflow = true;
1590                                         break;
1591                                 }
1592                         }
1593                         seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount);
1594                         spin_unlock_irq(&zone->lock);
1595                         cond_resched();
1596                         spin_lock_irq(&zone->lock);
1597                 }
1598                 seq_putc(m, '\n');
1599         }
1600 }
1601
1602 /* Print out the free pages at each order for each migratetype */
1603 static void pagetypeinfo_showfree(struct seq_file *m, void *arg)
1604 {
1605         int order;
1606         pg_data_t *pgdat = (pg_data_t *)arg;
1607
1608         /* Print header */
1609         seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
1610         for (order = 0; order < NR_PAGE_ORDERS; ++order)
1611                 seq_printf(m, "%6d ", order);
1612         seq_putc(m, '\n');
1613
1614         walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);
1615 }
1616
1617 static void pagetypeinfo_showblockcount_print(struct seq_file *m,
1618                                         pg_data_t *pgdat, struct zone *zone)
1619 {
1620         int mtype;
1621         unsigned long pfn;
1622         unsigned long start_pfn = zone->zone_start_pfn;
1623         unsigned long end_pfn = zone_end_pfn(zone);
1624         unsigned long count[MIGRATE_TYPES] = { 0, };
1625
1626         for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
1627                 struct page *page;
1628
1629                 page = pfn_to_online_page(pfn);
1630                 if (!page)
1631                         continue;
1632
1633                 if (page_zone(page) != zone)
1634                         continue;
1635
1636                 mtype = get_pageblock_migratetype(page);
1637
1638                 if (mtype < MIGRATE_TYPES)
1639                         count[mtype]++;
1640         }
1641
1642         /* Print counts */
1643         seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1644         for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1645                 seq_printf(m, "%12lu ", count[mtype]);
1646         seq_putc(m, '\n');
1647 }
1648
1649 /* Print out the number of pageblocks for each migratetype */
1650 static void pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
1651 {
1652         int mtype;
1653         pg_data_t *pgdat = (pg_data_t *)arg;
1654
1655         seq_printf(m, "\n%-23s", "Number of blocks type ");
1656         for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1657                 seq_printf(m, "%12s ", migratetype_names[mtype]);
1658         seq_putc(m, '\n');
1659         walk_zones_in_node(m, pgdat, true, false,
1660                 pagetypeinfo_showblockcount_print);
1661 }
1662
1663 /*
1664  * Print out the number of pageblocks for each migratetype that contain pages
1665  * of other types. This gives an indication of how well fallbacks are being
1666  * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
1667  * to determine what is going on.
1668  */
1669 static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
1670 {
1671 #ifdef CONFIG_PAGE_OWNER
1672         int mtype;
1673
1674         if (!static_branch_unlikely(&page_owner_inited))
1675                 return;
1676
1677         drain_all_pages(NULL);
1678
1679         seq_printf(m, "\n%-23s", "Number of mixed blocks ");
1680         for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1681                 seq_printf(m, "%12s ", migratetype_names[mtype]);
1682         seq_putc(m, '\n');
1683
1684         walk_zones_in_node(m, pgdat, true, true,
1685                 pagetypeinfo_showmixedcount_print);
1686 #endif /* CONFIG_PAGE_OWNER */
1687 }
1688
1689 /*
1690  * This prints out statistics in relation to grouping pages by mobility.
1691  * It is expensive to collect so do not constantly read the file.
1692  */
1693 static int pagetypeinfo_show(struct seq_file *m, void *arg)
1694 {
1695         pg_data_t *pgdat = (pg_data_t *)arg;
1696
1697         /* check memoryless node */
1698         if (!node_state(pgdat->node_id, N_MEMORY))
1699                 return 0;
1700
1701         seq_printf(m, "Page block order: %d\n", pageblock_order);
1702         seq_printf(m, "Pages per block:  %lu\n", pageblock_nr_pages);
1703         seq_putc(m, '\n');
1704         pagetypeinfo_showfree(m, pgdat);
1705         pagetypeinfo_showblockcount(m, pgdat);
1706         pagetypeinfo_showmixedcount(m, pgdat);
1707
1708         return 0;
1709 }
1710
1711 static const struct seq_operations fragmentation_op = {
1712         .start  = frag_start,
1713         .next   = frag_next,
1714         .stop   = frag_stop,
1715         .show   = frag_show,
1716 };
1717
1718 static const struct seq_operations pagetypeinfo_op = {
1719         .start  = frag_start,
1720         .next   = frag_next,
1721         .stop   = frag_stop,
1722         .show   = pagetypeinfo_show,
1723 };
1724
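/*
 * Return true if @zone is the first populated zone of @pgdat. Used so that
 * the per-node stats block in /proc/zoneinfo is printed only once per node.
 */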
1725 static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
1726 {
1727         int zid;
1728
1729         for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1730                 struct zone *compare = &pgdat->node_zones[zid];
1731
1732                 if (populated_zone(compare))
1733                         return zone == compare;
1734         }
1735
1736         return false;
1737 }
1738
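/*
 * Emit one zone's /proc/zoneinfo entry: the per-node stats (only for the
 * node's first populated zone), the watermarks, the lowmem protection, the
 * per-zone counters and the per-cpu pageset state.
 */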
1739 static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1740                                                         struct zone *zone)
1741 {
1742         int i;
1743         seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
1744         if (is_zone_first_populated(pgdat, zone)) {
1745                 seq_printf(m, "\n  per-node stats");
1746                 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
1747                         unsigned long pages = node_page_state_pages(pgdat, i);
1748
1749                         if (vmstat_item_print_in_thp(i))
1750                                 pages /= HPAGE_PMD_NR;
1751                         seq_printf(m, "\n      %-12s %lu", node_stat_name(i),
1752                                    pages);
1753                 }
1754         }
1755         seq_printf(m,
1756                    "\n  pages free     %lu"
1757                    "\n        boost    %lu"
1758                    "\n        min      %lu"
1759                    "\n        low      %lu"
1760                    "\n        high     %lu"
1761                    "\n        promo    %lu"
1762                    "\n        spanned  %lu"
1763                    "\n        present  %lu"
1764                    "\n        managed  %lu"
1765                    "\n        cma      %lu",
1766                    zone_page_state(zone, NR_FREE_PAGES),
1767                    zone->watermark_boost,
1768                    min_wmark_pages(zone),
1769                    low_wmark_pages(zone),
1770                    high_wmark_pages(zone),
1771                    promo_wmark_pages(zone),
1772                    zone->spanned_pages,
1773                    zone->present_pages,
1774                    zone_managed_pages(zone),
1775                    zone_cma_pages(zone));
1776
1777         seq_printf(m,
1778                    "\n        protection: (%ld",
1779                    zone->lowmem_reserve[0]);
1780         for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
1781                 seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
1782         seq_putc(m, ')');
1783
1784         /* If unpopulated, no other information is useful */
1785         if (!populated_zone(zone)) {
1786                 seq_putc(m, '\n');
1787                 return;
1788         }
1789
1790         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
1791                 seq_printf(m, "\n      %-12s %lu", zone_stat_name(i),
1792                            zone_page_state(zone, i));
1793
1794 #ifdef CONFIG_NUMA
1795         fold_vm_zone_numa_events(zone);
1796         for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
1797                 seq_printf(m, "\n      %-12s %lu", numa_stat_name(i),
1798                            zone_numa_event_state(zone, i));
1799 #endif
1800
1801         seq_printf(m, "\n  pagesets");
1802         for_each_online_cpu(i) {
1803                 struct per_cpu_pages *pcp;
1804                 struct per_cpu_zonestat __maybe_unused *pzstats;
1805
1806                 pcp = per_cpu_ptr(zone->per_cpu_pageset, i);
1807                 seq_printf(m,
1808                            "\n    cpu: %i"
1809                            "\n              count:    %i"
1810                            "\n              high:     %i"
1811                            "\n              batch:    %i"
1812                            "\n              high_min: %i"
1813                            "\n              high_max: %i",
1814                            i,
1815                            pcp->count,
1816                            pcp->high,
1817                            pcp->batch,
1818                            pcp->high_min,
1819                            pcp->high_max);
1820 #ifdef CONFIG_SMP
1821                 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i);
1822                 seq_printf(m, "\n  vm stats threshold: %d",
1823                                 pzstats->stat_threshold);
1824 #endif
1825         }
1826         seq_printf(m,
1827                    "\n  node_unreclaimable:  %u"
1828                    "\n  start_pfn:           %lu",
1829                    pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
1830                    zone->zone_start_pfn);
1831         seq_putc(m, '\n');
1832 }
1833
1834 /*
1835  * Output information about zones in @pgdat.  All zones are printed regardless
1836  * of whether they are populated or not: lowmem_reserve_ratio operates on the
1837  * set of all zones and userspace would not be aware of such zones if they are
1838  * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
1839  */
1840 static int zoneinfo_show(struct seq_file *m, void *arg)
1841 {
1842         pg_data_t *pgdat = (pg_data_t *)arg;
1843         walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print);
1844         return 0;
1845 }
1846
1847 static const struct seq_operations zoneinfo_op = {
1848         .start  = frag_start, /* iterate over all zones. The same as in
1849                                * fragmentation. */
1850         .next   = frag_next,
1851         .stop   = frag_stop,
1852         .show   = zoneinfo_show,
1853 };
1854
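/*
 * Number of items exported through /proc/vmstat; must not exceed
 * ARRAY_SIZE(vmstat_text), see the BUILD_BUG_ON() in vmstat_start().
 */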
1855 #define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \
1856                          NR_VM_NUMA_EVENT_ITEMS + \
1857                          NR_VM_NODE_STAT_ITEMS + \
1858                          NR_VM_STAT_ITEMS + \
1859                          (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \
1860                           NR_VM_EVENT_ITEMS : 0))
1861
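/*
 * Snapshot all counters into a kmalloc'ed array at the start of a
 * /proc/vmstat read; vmstat_next()/vmstat_show() walk this snapshot and
 * vmstat_stop() frees it again.
 */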
1862 static void *vmstat_start(struct seq_file *m, loff_t *pos)
1863 {
1864         unsigned long *v;
1865         int i;
1866
1867         if (*pos >= NR_VMSTAT_ITEMS)
1868                 return NULL;
1869
1870         BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
1871         fold_vm_numa_events();
1872         v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
1873         m->private = v;
1874         if (!v)
1875                 return ERR_PTR(-ENOMEM);
1876         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
1877                 v[i] = global_zone_page_state(i);
1878         v += NR_VM_ZONE_STAT_ITEMS;
1879
1880 #ifdef CONFIG_NUMA
1881         for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
1882                 v[i] = global_numa_event_state(i);
1883         v += NR_VM_NUMA_EVENT_ITEMS;
1884 #endif
1885
1886         for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
1887                 v[i] = global_node_page_state_pages(i);
1888                 if (vmstat_item_print_in_thp(i))
1889                         v[i] /= HPAGE_PMD_NR;
1890         }
1891         v += NR_VM_NODE_STAT_ITEMS;
1892
1893         global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
1894                             v + NR_DIRTY_THRESHOLD);
1895         v[NR_MEMMAP_PAGES] = atomic_long_read(&nr_memmap_pages);
1896         v[NR_MEMMAP_BOOT_PAGES] = atomic_long_read(&nr_memmap_boot_pages);
1897         v += NR_VM_STAT_ITEMS;
1898
1899 #ifdef CONFIG_VM_EVENT_COUNTERS
1900         all_vm_events(v);
1901         v[PGPGIN] /= 2;         /* sectors -> kbytes */
1902         v[PGPGOUT] /= 2;
1903 #endif
1904         return (unsigned long *)m->private + *pos;
1905 }
1906
1907 static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
1908 {
1909         (*pos)++;
1910         if (*pos >= NR_VMSTAT_ITEMS)
1911                 return NULL;
1912         return (unsigned long *)m->private + *pos;
1913 }
1914
1915 static int vmstat_show(struct seq_file *m, void *arg)
1916 {
1917         unsigned long *l = arg;
1918         unsigned long off = l - (unsigned long *)m->private;
1919
1920         seq_puts(m, vmstat_text[off]);
1921         seq_put_decimal_ull(m, " ", *l);
1922         seq_putc(m, '\n');
1923
1924         if (off == NR_VMSTAT_ITEMS - 1) {
1925                 /*
1926                  * We've come to the end - add any deprecated counters to avoid
1927                  * breaking userspace which might depend on them being present.
1928                  */
1929                 seq_puts(m, "nr_unstable 0\n");
1930         }
1931         return 0;
1932 }
1933
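/* Release the snapshot allocated by vmstat_start(). */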
1934 static void vmstat_stop(struct seq_file *m, void *arg)
1935 {
1936         kfree(m->private);
1937         m->private = NULL;
1938 }
1939
1940 static const struct seq_operations vmstat_op = {
1941         .start  = vmstat_start,
1942         .next   = vmstat_next,
1943         .stop   = vmstat_stop,
1944         .show   = vmstat_show,
1945 };
1946 #endif /* CONFIG_PROC_FS */
1947
1948 #ifdef CONFIG_SMP
1949 static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
1950 static int sysctl_stat_interval __read_mostly = HZ;
1951 static int vmstat_late_init_done;
1952
1953 #ifdef CONFIG_PROC_FS
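/*
 * Fold this CPU's per-cpu vmstat deltas into the global counters; run on
 * every CPU via schedule_on_each_cpu() from vmstat_refresh().
 */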
1954 static void refresh_vm_stats(struct work_struct *work)
1955 {
1956         refresh_cpu_vm_stats(true);
1957 }
1958
1959 static int vmstat_refresh(const struct ctl_table *table, int write,
1960                    void *buffer, size_t *lenp, loff_t *ppos)
1961 {
1962         long val;
1963         int err;
1964         int i;
1965
1966         /*
1967          * The regular update, every sysctl_stat_interval, may come later
1968          * than expected, leaving a significant amount in per_cpu buckets.
1969          * This is particularly misleading when checking a quantity of HUGE
1970          * pages, immediately after running a test.  /proc/sys/vm/stat_refresh,
1971          * which can equally be echo'ed to or cat'ted from (by root),
1972          * can be used to update the stats just before reading them.
1973          *
1974          * Oh, and since global_zone_page_state() etc. are so careful to hide
1975          * transiently negative values, warn here if any of the stats is
1976          * negative, so we know to go looking for imbalance.
1977          */
1978         err = schedule_on_each_cpu(refresh_vm_stats);
1979         if (err)
1980                 return err;
1981         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
1982                 /*
1983                  * Skip checking stats known to go negative occasionally.
1984                  */
1985                 switch (i) {
1986                 case NR_ZONE_WRITE_PENDING:
1987                 case NR_FREE_CMA_PAGES:
1988                         continue;
1989                 }
1990                 val = atomic_long_read(&vm_zone_stat[i]);
1991                 if (val < 0) {
1992                         pr_warn("%s: %s %ld\n",
1993                                 __func__, zone_stat_name(i), val);
1994                 }
1995         }
1996         for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
1997                 /*
1998                  * Skip checking stats known to go negative occasionally.
1999                  */
2000                 switch (i) {
2001                 case NR_WRITEBACK:
2002                         continue;
2003                 }
2004                 val = atomic_long_read(&vm_node_stat[i]);
2005                 if (val < 0) {
2006                         pr_warn("%s: %s %ld\n",
2007                                 __func__, node_stat_name(i), val);
2008                 }
2009         }
2010         if (write)
2011                 *ppos += *lenp;
2012         else
2013                 *lenp = 0;
2014         return 0;
2015 }
2016 #endif /* CONFIG_PROC_FS */
2017
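/*
 * Deferred per-cpu work that folds this CPU's vmstat deltas. It re-arms
 * itself only while counters keep changing; once the CPU goes quiet the
 * work stops and vmstat_shepherd restarts it when updates appear again.
 */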
2018 static void vmstat_update(struct work_struct *w)
2019 {
2020         if (refresh_cpu_vm_stats(true)) {
2021                 /*
2022                  * Counters were updated so we expect more updates
2023                  * to occur in the future. Keep on running the
2024                  * update worker thread.
2025                  */
2026                 queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
2027                                 this_cpu_ptr(&vmstat_work),
2028                                 round_jiffies_relative(sysctl_stat_interval));
2029         }
2030 }
2031
2032 /*
2033  * Check if the diffs for a certain cpu indicate that
2034  * an update is needed.
2035  */
2036 static bool need_update(int cpu)
2037 {
2038         pg_data_t *last_pgdat = NULL;
2039         struct zone *zone;
2040
2041         for_each_populated_zone(zone) {
2042                 struct per_cpu_zonestat *pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
2043                 struct per_cpu_nodestat *n;
2044
2045                 /*
2046                  * The fast way of checking if there are any vmstat diffs.
2047                  */
2048                 if (memchr_inv(pzstats->vm_stat_diff, 0, sizeof(pzstats->vm_stat_diff)))
2049                         return true;
2050
2051                 if (last_pgdat == zone->zone_pgdat)
2052                         continue;
2053                 last_pgdat = zone->zone_pgdat;
2054                 n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu);
2055                 if (memchr_inv(n->vm_node_stat_diff, 0, sizeof(n->vm_node_stat_diff)))
2056                         return true;
2057         }
2058         return false;
2059 }
2060
2061 /*
2062  * Switch off vmstat processing and then fold all the remaining differentials
2063  * until the diffs stay at zero. The function is used by NOHZ and can only be
2064  * invoked when tick processing is not active.
2065  */
2066 void quiet_vmstat(void)
2067 {
2068         if (system_state != SYSTEM_RUNNING)
2069                 return;
2070
2071         if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
2072                 return;
2073
2074         if (!need_update(smp_processor_id()))
2075                 return;
2076
2077         /*
2078          * Just refresh counters and do not care about the pending delayed
2079          * vmstat_update. It doesn't fire often enough to matter, and
2080          * canceling it would be too expensive from this path.
2081          * vmstat_shepherd will take care of that for us.
2082          */
2083         refresh_cpu_vm_stats(false);
2084 }
2085
2086 /*
2087  * Shepherd worker that checks the vmstat differentials of
2088  * processors whose per-cpu update workers have been disabled
2089  * because of inactivity, and re-queues those workers when
2090  * updates are pending.
2091  */
2092 static void vmstat_shepherd(struct work_struct *w);
2093
2094 static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
2095
2096 static void vmstat_shepherd(struct work_struct *w)
2097 {
2098         int cpu;
2099
2100         cpus_read_lock();
2101         /* Check processors whose vmstat worker threads have been disabled */
2102         for_each_online_cpu(cpu) {
2103                 struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
2104
2105                 /*
2106                  * In-kernel users of vmstat counters either require the precise value, in
2107                  * which case they use the zone_page_state_snapshot interface, or they can
2108                  * live with imprecision, as the regular flushing can happen at an arbitrary
2109                  * time and the cumulative error can grow (see calculate_normal_threshold).
2110                  *
2111                  * From that point of view the regular flushing can be postponed for CPUs
2112                  * that have been isolated from kernel interference without critical
2113                  * infrastructure ever noticing. Skip regular flushing from vmstat_shepherd
2114                  * for all isolated CPUs to avoid interference with the isolated workload.
2115                  */
2116                 if (cpu_is_isolated(cpu))
2117                         continue;
2118
2119                 if (!delayed_work_pending(dw) && need_update(cpu))
2120                         queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
2121
2122                 cond_resched();
2123         }
2124         cpus_read_unlock();
2125
2126         schedule_delayed_work(&shepherd,
2127                 round_jiffies_relative(sysctl_stat_interval));
2128 }
2129
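/*
 * Initialize the per-cpu vmstat works and schedule the shepherd, which is
 * what keeps the per-cpu works running only on CPUs that actually have
 * pending updates.
 */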
2130 static void __init start_shepherd_timer(void)
2131 {
2132         int cpu;
2133
2134         for_each_possible_cpu(cpu) {
2135                 INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
2136                         vmstat_update);
2137
2138                 /*
2139                  * For CPUs that are not yet online, vmstat_cpu_online()
2140                  * will enable the work once they come up; the
2141                  * mm/vmstat:online hotplug state enables and disables
2142                  * vmstat_work symmetrically.
2143                  */
2144                 if (!cpu_online(cpu))
2145                         disable_delayed_work_sync(&per_cpu(vmstat_work, cpu));
2146         }
2147
2148         schedule_delayed_work(&shepherd,
2149                 round_jiffies_relative(sysctl_stat_interval));
2150 }
2151
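/* Mark every node that currently has CPUs with N_CPU. */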
2152 static void __init init_cpu_node_state(void)
2153 {
2154         int node;
2155
2156         for_each_online_node(node) {
2157                 if (!cpumask_empty(cpumask_of_node(node)))
2158                         node_set_state(node, N_CPU);
2159         }
2160 }
2161
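/*
 * CPU hotplug callbacks: keep the N_CPU node states, the stat thresholds
 * and the per-cpu vmstat work in sync with the set of online CPUs.
 */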
2162 static int vmstat_cpu_online(unsigned int cpu)
2163 {
2164         if (vmstat_late_init_done)
2165                 refresh_zone_stat_thresholds();
2166
2167         if (!node_state(cpu_to_node(cpu), N_CPU)) {
2168                 node_set_state(cpu_to_node(cpu), N_CPU);
2169         }
2170         enable_delayed_work(&per_cpu(vmstat_work, cpu));
2171
2172         return 0;
2173 }
2174
2175 static int vmstat_cpu_down_prep(unsigned int cpu)
2176 {
2177         disable_delayed_work_sync(&per_cpu(vmstat_work, cpu));
2178         return 0;
2179 }
2180
2181 static int vmstat_cpu_dead(unsigned int cpu)
2182 {
2183         const struct cpumask *node_cpus;
2184         int node;
2185
2186         node = cpu_to_node(cpu);
2187
2188         refresh_zone_stat_thresholds();
2189         node_cpus = cpumask_of_node(node);
2190         if (!cpumask_empty(node_cpus))
2191                 return 0;
2192
2193         node_clear_state(node, N_CPU);
2194
2195         return 0;
2196 }
2197
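/*
 * Recompute the stat thresholds once the system is fully up;
 * vmstat_cpu_online() skips doing so until vmstat_late_init_done is set.
 */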
2198 static int __init vmstat_late_init(void)
2199 {
2200         refresh_zone_stat_thresholds();
2201         vmstat_late_init_done = 1;
2202
2203         return 0;
2204 }
2205 late_initcall(vmstat_late_init);
2206 #endif
2207
2208 #ifdef CONFIG_PROC_FS
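/*
 * Knobs registered under /proc/sys/vm/: stat_interval and stat_refresh
 * (SMP only) and numa_stat (NUMA only).
 */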
2209 static const struct ctl_table vmstat_table[] = {
2210 #ifdef CONFIG_SMP
2211         {
2212                 .procname       = "stat_interval",
2213                 .data           = &sysctl_stat_interval,
2214                 .maxlen         = sizeof(sysctl_stat_interval),
2215                 .mode           = 0644,
2216                 .proc_handler   = proc_dointvec_jiffies,
2217         },
2218         {
2219                 .procname       = "stat_refresh",
2220                 .data           = NULL,
2221                 .maxlen         = 0,
2222                 .mode           = 0600,
2223                 .proc_handler   = vmstat_refresh,
2224         },
2225 #endif
2226 #ifdef CONFIG_NUMA
2227         {
2228                 .procname       = "numa_stat",
2229                 .data           = &sysctl_vm_numa_stat,
2230                 .maxlen         = sizeof(int),
2231                 .mode           = 0644,
2232                 .proc_handler   = sysctl_vm_numa_stat_handler,
2233                 .extra1         = SYSCTL_ZERO,
2234                 .extra2         = SYSCTL_ONE,
2235         },
2236 #endif
2237 };
2238 #endif
2239
2240 struct workqueue_struct *mm_percpu_wq;
2241
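/*
 * Early MM init: create mm_percpu_wq and, where configured, register the
 * vmstat CPU hotplug callbacks, start the shepherd and create the procfs
 * and sysctl interfaces.
 */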
2242 void __init init_mm_internals(void)
2243 {
2244         int ret __maybe_unused;
2245
2246         mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0);
2247
2248 #ifdef CONFIG_SMP
2249         ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
2250                                         NULL, vmstat_cpu_dead);
2251         if (ret < 0)
2252                 pr_err("vmstat: failed to register 'dead' hotplug state\n");
2253
2254         ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online",
2255                                         vmstat_cpu_online,
2256                                         vmstat_cpu_down_prep);
2257         if (ret < 0)
2258                 pr_err("vmstat: failed to register 'online' hotplug state\n");
2259
2260         cpus_read_lock();
2261         init_cpu_node_state();
2262         cpus_read_unlock();
2263
2264         start_shepherd_timer();
2265 #endif
2266 #ifdef CONFIG_PROC_FS
2267         proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
2268         proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
2269         proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
2270         proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
2271         register_sysctl_init("vm", vmstat_table);
2272 #endif
2273 }
2274
2275 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
2276
2277 /*
2278  * Return an index indicating how much of the available free memory is
2279  * unusable for an allocation of the requested size.
2280  */
2281 static int unusable_free_index(unsigned int order,
2282                                 struct contig_page_info *info)
2283 {
2284         /* No free memory is interpreted as all free memory is unusable */
2285         if (info->free_pages == 0)
2286                 return 1000;
2287
2288         /*
2289          * The index is conceptually a value between 0 and 1; return it
2290          * scaled by 1000, i.e. to three decimal places.
2291          *
2292          * 0 => no fragmentation
2293          * 1 => high fragmentation
2294          */
2295         return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
2296
2297 }
2298
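/* Print the unusable free space index at each order for one zone. */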
2299 static void unusable_show_print(struct seq_file *m,
2300                                         pg_data_t *pgdat, struct zone *zone)
2301 {
2302         unsigned int order;
2303         int index;
2304         struct contig_page_info info;
2305
2306         seq_printf(m, "Node %d, zone %8s ",
2307                                 pgdat->node_id,
2308                                 zone->name);
2309         for (order = 0; order < NR_PAGE_ORDERS; ++order) {
2310                 fill_contig_page_info(zone, order, &info);
2311                 index = unusable_free_index(order, &info);
2312                 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
2313         }
2314
2315         seq_putc(m, '\n');
2316 }
2317
2318 /*
2319  * Display unusable free space index
2320  *
2321  * The unusable free space index measures how much of the available free
2322  * memory cannot be used to satisfy an allocation of a given size and is a
2323  * value between 0 and 1. The higher the value, the more of the free memory
2324  * is unusable and, by implication, the worse the external fragmentation is. This
2325  * can be expressed as a percentage by multiplying by 100.
2326  */
2327 static int unusable_show(struct seq_file *m, void *arg)
2328 {
2329         pg_data_t *pgdat = (pg_data_t *)arg;
2330
2331         /* check memoryless node */
2332         if (!node_state(pgdat->node_id, N_MEMORY))
2333                 return 0;
2334
2335         walk_zones_in_node(m, pgdat, true, false, unusable_show_print);
2336
2337         return 0;
2338 }
2339
2340 static const struct seq_operations unusable_sops = {
2341         .start  = frag_start,
2342         .next   = frag_next,
2343         .stop   = frag_stop,
2344         .show   = unusable_show,
2345 };
2346
2347 DEFINE_SEQ_ATTRIBUTE(unusable);
2348
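/* Print the external fragmentation index at each order for one zone. */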
2349 static void extfrag_show_print(struct seq_file *m,
2350                                         pg_data_t *pgdat, struct zone *zone)
2351 {
2352         unsigned int order;
2353         int index;
2354
2355         /* Alloc on stack as interrupts are disabled for zone walk */
2356         struct contig_page_info info;
2357
2358         seq_printf(m, "Node %d, zone %8s ",
2359                                 pgdat->node_id,
2360                                 zone->name);
2361         for (order = 0; order < NR_PAGE_ORDERS; ++order) {
2362                 fill_contig_page_info(zone, order, &info);
2363                 index = __fragmentation_index(order, &info);
2364                 seq_printf(m, "%2d.%03d ", index / 1000, index % 1000);
2365         }
2366
2367         seq_putc(m, '\n');
2368 }
2369
2370 /*
2371  * Display the fragmentation index for orders at which allocations would fail
2372  */
2373 static int extfrag_show(struct seq_file *m, void *arg)
2374 {
2375         pg_data_t *pgdat = (pg_data_t *)arg;
2376
2377         walk_zones_in_node(m, pgdat, true, false, extfrag_show_print);
2378
2379         return 0;
2380 }
2381
2382 static const struct seq_operations extfrag_sops = {
2383         .start  = frag_start,
2384         .next   = frag_next,
2385         .stop   = frag_stop,
2386         .show   = extfrag_show,
2387 };
2388
2389 DEFINE_SEQ_ATTRIBUTE(extfrag);
2390
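/*
 * Create the debugfs files. With debugfs mounted at the usual
 * /sys/kernel/debug, they can be read as, for example:
 *
 *   cat /sys/kernel/debug/extfrag/unusable_index
 *   cat /sys/kernel/debug/extfrag/extfrag_index
 */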
2391 static int __init extfrag_debug_init(void)
2392 {
2393         struct dentry *extfrag_debug_root;
2394
2395         extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
2396
2397         debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL,
2398                             &unusable_fops);
2399
2400         debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL,
2401                             &extfrag_fops);
2402
2403         return 0;
2404 }
2405
2406 module_init(extfrag_debug_init);
2407
2408 #endif