mm/vmstat.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  *  linux/mm/vmstat.c
   4  *
   5  *  Manages VM statistics
   6  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   7  *
   8  *  zoned VM statistics
   9  *  Copyright (C) 2006 Silicon Graphics, Inc.,
  10  *              Christoph Lameter <cl@gentwo.org>
  11  *  Copyright (C) 2008-2014 Christoph Lameter
  12  */
  13 #include <linux/fs.h>
  14 #include <linux/mm.h>
  15 #include <linux/err.h>
  16 #include <linux/module.h>
  17 #include <linux/slab.h>
  18 #include <linux/cpu.h>
  19 #include <linux/cpumask.h>
  20 #include <linux/vmstat.h>
  21 #include <linux/proc_fs.h>
  22 #include <linux/seq_file.h>
  23 #include <linux/debugfs.h>
  24 #include <linux/sched.h>
  25 #include <linux/math64.h>
  26 #include <linux/writeback.h>
  27 #include <linux/compaction.h>
  28 #include <linux/mm_inline.h>
  29 #include <linux/page_owner.h>
  30 #include <linux/sched/isolation.h>
  31
  32 #include "internal.h"
  33
  34 #ifdef CONFIG_PROC_FS
  35 #ifdef CONFIG_NUMA
  36 #define ENABLE_NUMA_STAT 1
  37 static int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
  38
  39 /* zero numa counters within a zone */
  40 static void zero_zone_numa_counters(struct zone *zone)
  41 {
  42         int item, cpu;
  43
  44         for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) {
  45                 atomic_long_set(&zone->vm_numa_event[item], 0);
  46                 for_each_online_cpu(cpu) {
  47                         per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item]
  48                                                 = 0;
  49                 }
  50         }
  51 }
  52
  53 /* zero numa counters of all the populated zones */
  54 static void zero_zones_numa_counters(void)
  55 {
  56         struct zone *zone;
  57
  58         for_each_populated_zone(zone)
  59                 zero_zone_numa_counters(zone);
  60 }
  61
  62 /* zero global numa counters */
  63 static void zero_global_numa_counters(void)
  64 {
  65         int item;
  66
  67         for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
  68                 atomic_long_set(&vm_numa_event[item], 0);
  69 }
  70
  71 static void invalid_numa_statistics(void)
  72 {
  73         zero_zones_numa_counters();
  74         zero_global_numa_counters();
  75 }
  76
  77 static DEFINE_MUTEX(vm_numa_stat_lock);
  78
  79 static int sysctl_vm_numa_stat_handler(const struct ctl_table *table, int write,
  80                 void *buffer, size_t *length, loff_t *ppos)
  81 {
  82         int ret, oldval;
  83
  84         mutex_lock(&vm_numa_stat_lock);
  85         if (write)
  86                 oldval = sysctl_vm_numa_stat;
  87         ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
  88         if (ret || !write)
  89                 goto out;
  90
  91         if (oldval == sysctl_vm_numa_stat)
  92                 goto out;
  93         else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
  94                 static_branch_enable(&vm_numa_stat_key);
  95                 pr_info("enable numa statistics\n");
  96         } else {
  97                 static_branch_disable(&vm_numa_stat_key);
  98                 invalid_numa_statistics();
  99                 pr_info("disable numa statistics, and clear numa counters\n");
 100         }
 101
 102 out:
 103         mutex_unlock(&vm_numa_stat_lock);
 104         return ret;
 105 }
 106 #endif
 107 #endif /* CONFIG_PROC_FS */
 108
 109 #ifdef CONFIG_VM_EVENT_COUNTERS
 110 DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
 111 EXPORT_PER_CPU_SYMBOL(vm_event_states);
 112
 113 static void sum_vm_events(unsigned long *ret)
 114 {
 115         int cpu;
 116         int i;
 117
 118         memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
 119
 120         for_each_online_cpu(cpu) {
 121                 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
 122
 123                 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
 124                         ret[i] += this->event[i];
 125         }
 126 }
 127
 128 /*
 129  * Accumulate the vm event counters across all CPUs.
 130  * The result is unavoidably approximate - it can change
 131  * during and after execution of this function.
 132 */
 133 void all_vm_events(unsigned long *ret)
 134 {
 135         cpus_read_lock();
 136         sum_vm_events(ret);
 137         cpus_read_unlock();
 138 }
 139 EXPORT_SYMBOL_GPL(all_vm_events);
 140
 141 /*
 142  * Fold the foreign cpu events into our own.
 143  *
 144  * This is adding to the events on one processor
 145  * but keeps the global counts constant.
 146  */
 147 void vm_events_fold_cpu(int cpu)
 148 {
 149         struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
 150         int i;
 151
 152         for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
 153                 count_vm_events(i, fold_state->event[i]);
 154                 fold_state->event[i] = 0;
 155         }
 156 }
 157
 158 #endif /* CONFIG_VM_EVENT_COUNTERS */
 159
 160 /*
 161  * Manage combined zone based / global counters
 162  *
 163  * vm_stat contains the global counters
 164  */
 165 atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
 166 atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
 167 atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
 168 EXPORT_SYMBOL(vm_zone_stat);
 169 EXPORT_SYMBOL(vm_node_stat);
 170
 171 #ifdef CONFIG_NUMA
 172 static void fold_vm_zone_numa_events(struct zone *zone)
 173 {
 174         unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
 175         int cpu;
 176         enum numa_stat_item item;
 177
 178         for_each_online_cpu(cpu) {
 179                 struct per_cpu_zonestat *pzstats;
 180
 181                 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
 182                 for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
 183                         zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
 184         }
 185
 186         for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
 187                 zone_numa_event_add(zone_numa_events[item], zone, item);
 188 }
 189
 190 void fold_vm_numa_events(void)
 191 {
 192         struct zone *zone;
 193
 194         for_each_populated_zone(zone)
 195                 fold_vm_zone_numa_events(zone);
 196 }
 197 #endif
 198
 199 #ifdef CONFIG_SMP
 200
 201 int calculate_pressure_threshold(struct zone *zone)
 202 {
 203         int threshold;
 204         int watermark_distance;
 205
 206         /*
 207          * As vmstats are not up to date, there is drift between the estimated
 208          * and real values. For high thresholds and a high number of CPUs, it
 209          * is possible for the min watermark to be breached while the estimated
 210          * value looks fine. The pressure threshold is a reduced value such
 211          * that even the maximum amount of drift will not accidentally breach
 212          * the min watermark
 213          */
 214         watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
 215         threshold = max(1, (int)(watermark_distance / num_online_cpus()));
 216
 217         /*
 218          * Maximum threshold is 125
 219          */
 220         threshold = min(125, threshold);
 221
 222         return threshold;
 223 }
 224
 225 int calculate_normal_threshold(struct zone *zone)
 226 {
 227         int threshold;
 228         int mem;        /* memory in 128 MB units */
 229
 230         /*
 231          * The threshold scales with the number of processors and the amount
 232          * of memory per zone. More memory means that we can defer updates for
 233          * longer, more processors could lead to more contention.
 234          * fls() is used to have a cheap way of logarithmic scaling.
 235          *
 236          * Some sample thresholds:
 237          *
 238          * Threshold    Processors      (fls)   Zonesize        fls(mem)+1
 239          * ------------------------------------------------------------------
 240          * 8            1               1       0.9-1 GB        4
 241          * 16           2               2       0.9-1 GB        4
 242          * 20           2               2       1-2 GB          5
 243          * 24           2               2       2-4 GB          6
 244          * 28           2               2       4-8 GB          7
 245          * 32           2               2       8-16 GB         8
 246          * 4            2               2       <128M           1
 247          * 30           4               3       2-4 GB          5
 248          * 48           4               3       8-16 GB         8
 249          * 32           8               4       1-2 GB          4
 250          * 32           8               4       0.9-1GB         4
 251          * 10           16              5       <128M           1
 252          * 40           16              5       900M            4
 253          * 70           64              7       2-4 GB          5
 254          * 84           64              7       4-8 GB          6
 255          * 108          512             9       4-8 GB          6
 256          * 125          1024            10      8-16 GB         8
 257          * 125          1024            10      16-32 GB        9
 258          */
 259
 260         mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT);
 261
 262         threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
 263
 264         /*
 265          * Maximum threshold is 125
 266          */
 267         threshold = min(125, threshold);
 268
 269         return threshold;
 270 }
 271
 272 /*
 273  * Refresh the thresholds for each zone.
 274  */
 275 void refresh_zone_stat_thresholds(void)
 276 {
 277         struct pglist_data *pgdat;
 278         struct zone *zone;
 279         int cpu;
 280         int threshold;
 281
 282         /* Zero current pgdat thresholds */
 283         for_each_online_pgdat(pgdat) {
 284                 for_each_online_cpu(cpu) {
 285                         per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
 286                 }
 287         }
 288
 289         for_each_populated_zone(zone) {
 290                 struct pglist_data *pgdat = zone->zone_pgdat;
 291                 unsigned long max_drift, tolerate_drift;
 292
 293                 threshold = calculate_normal_threshold(zone);
 294
 295                 for_each_online_cpu(cpu) {
 296                         int pgdat_threshold;
 297
 298                         per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
 299                                                         = threshold;
 300
 301                         /* Base nodestat threshold on the largest populated zone. */
 302                         pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
 303                         per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
 304                                 = max(threshold, pgdat_threshold);
 305                 }
 306
 307                 /*
 308                  * Only set percpu_drift_mark if there is a danger that
 309                  * NR_FREE_PAGES reports the low watermark is ok when in fact
 310                  * the min watermark could be breached by an allocation
 311                  */
 312                 tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
 313                 max_drift = num_online_cpus() * threshold;
 314                 if (max_drift > tolerate_drift)
 315                         zone->percpu_drift_mark = high_wmark_pages(zone) +
 316                                         max_drift;
 317         }
 318 }
 319
 320 void set_pgdat_percpu_threshold(pg_data_t *pgdat,
 321                                 int (*calculate_pressure)(struct zone *))
 322 {
 323         struct zone *zone;
 324         int cpu;
 325         int threshold;
 326         int i;
 327
 328         for (i = 0; i < pgdat->nr_zones; i++) {
 329                 zone = &pgdat->node_zones[i];
 330                 if (!zone->percpu_drift_mark)
 331                         continue;
 332
 333                 threshold = (*calculate_pressure)(zone);
 334                 for_each_online_cpu(cpu)
 335                         per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
 336                                                         = threshold;
 337         }
 338 }
 339
 340 /*
 341  * For use when we know that interrupts are disabled,
 342  * or when we know that preemption is disabled and that
 343  * particular counter cannot be updated from interrupt context.
 344  */
 345 void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
 346                            long delta)
 347 {
 348         struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
 349         s8 __percpu *p = pcp->vm_stat_diff + item;
 350         long x;
 351         long t;
 352
 353         /*
 354          * Accurate vmstat updates require a RMW. On !PREEMPT_RT kernels,
 355          * atomicity is provided by IRQs being disabled -- either explicitly
 356          * or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables
 357          * CPU migrations and preemption potentially corrupts a counter so
 358          * disable preemption.
 359          */
 360         preempt_disable_nested();
 361
 362         x = delta + __this_cpu_read(*p);
 363
 364         t = __this_cpu_read(pcp->stat_threshold);
 365
 366         if (unlikely(abs(x) > t)) {
 367                 zone_page_state_add(x, zone, item);
 368                 x = 0;
 369         }
 370         __this_cpu_write(*p, x);
 371
 372         preempt_enable_nested();
 373 }
 374 EXPORT_SYMBOL(__mod_zone_page_state);
 375
 376 void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
 377                                 long delta)
 378 {
 379         struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
 380         s8 __percpu *p = pcp->vm_node_stat_diff + item;
 381         long x;
 382         long t;
 383
 384         if (vmstat_item_in_bytes(item)) {
 385                 /*
 386                  * Only cgroups use subpage accounting right now; at
 387                  * the global level, these items still change in
 388                  * multiples of whole pages. Store them as pages
 389                  * internally to keep the per-cpu counters compact.
 390                  */
 391                 VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
 392                 delta >>= PAGE_SHIFT;
 393         }
 394
 395         /* See __mod_node_page_state */
 396         preempt_disable_nested();
 397
 398         x = delta + __this_cpu_read(*p);
 399
 400         t = __this_cpu_read(pcp->stat_threshold);
 401
 402         if (unlikely(abs(x) > t)) {
 403                 node_page_state_add(x, pgdat, item);
 404                 x = 0;
 405         }
 406         __this_cpu_write(*p, x);
 407
 408         preempt_enable_nested();
 409 }
 410 EXPORT_SYMBOL(__mod_node_page_state);
 411
 412 /*
 413  * Optimized increment and decrement functions.
 414  *
 415  * These are only for a single page and therefore can take a struct page *
 416  * argument instead of struct zone *. This allows the inclusion of the code
 417  * generated for page_zone(page) into the optimized functions.
 418  *
 419  * No overflow check is necessary and therefore the differential can be
 420  * incremented or decremented in place which may allow the compilers to
 421  * generate better code.
 422  * The increment or decrement is known and therefore one boundary check can
 423  * be omitted.
 424  *
 425  * NOTE: These functions are very performance sensitive. Change only
 426  * with care.
 427  *
 428  * Some processors have inc/dec instructions that are atomic vs an interrupt.
 429  * However, the code must first determine the differential location in a zone
 430  * based on the processor number and then inc/dec the counter. There is no
 431  * guarantee without disabling preemption that the processor will not change
 432  * in between and therefore the atomicity vs. interrupt cannot be exploited
 433  * in a useful way here.
 434  */
 435 void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
 436 {
 437         struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
 438         s8 __percpu *p = pcp->vm_stat_diff + item;
 439         s8 v, t;
 440
 441         /* See __mod_node_page_state */
 442         preempt_disable_nested();
 443
 444         v = __this_cpu_inc_return(*p);
 445         t = __this_cpu_read(pcp->stat_threshold);
 446         if (unlikely(v > t)) {
 447                 s8 overstep = t >> 1;
 448
 449                 zone_page_state_add(v + overstep, zone, item);
 450                 __this_cpu_write(*p, -overstep);
 451         }
 452
 453         preempt_enable_nested();
 454 }
 455
 456 void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
 457 {
 458         struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
 459         s8 __percpu *p = pcp->vm_node_stat_diff + item;
 460         s8 v, t;
 461
 462         VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
 463
 464         /* See __mod_node_page_state */
 465         preempt_disable_nested();
 466
 467         v = __this_cpu_inc_return(*p);
 468         t = __this_cpu_read(pcp->stat_threshold);
 469         if (unlikely(v > t)) {
 470                 s8 overstep = t >> 1;
 471
 472                 node_page_state_add(v + overstep, pgdat, item);
 473                 __this_cpu_write(*p, -overstep);
 474         }
 475
 476         preempt_enable_nested();
 477 }
 478
 479 void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
 480 {
 481         __inc_zone_state(page_zone(page), item);
 482 }
 483 EXPORT_SYMBOL(__inc_zone_page_state);
 484
 485 void __inc_node_page_state(struct page *page, enum node_stat_item item)
 486 {
 487         __inc_node_state(page_pgdat(page), item);
 488 }
 489 EXPORT_SYMBOL(__inc_node_page_state);
 490
 491 void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
 492 {
 493         struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
 494         s8 __percpu *p = pcp->vm_stat_diff + item;
 495         s8 v, t;
 496
 497         /* See __mod_node_page_state */
 498         preempt_disable_nested();
 499
 500         v = __this_cpu_dec_return(*p);
 501         t = __this_cpu_read(pcp->stat_threshold);
 502         if (unlikely(v < - t)) {
 503                 s8 overstep = t >> 1;
 504
 505                 zone_page_state_add(v - overstep, zone, item);
 506                 __this_cpu_write(*p, overstep);
 507         }
 508
 509         preempt_enable_nested();
 510 }
 511
 512 void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
 513 {
 514         struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
 515         s8 __percpu *p = pcp->vm_node_stat_diff + item;
 516         s8 v, t;
 517
 518         VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
 519
 520         /* See __mod_node_page_state */
 521         preempt_disable_nested();
 522
 523         v = __this_cpu_dec_return(*p);
 524         t = __this_cpu_read(pcp->stat_threshold);
 525         if (unlikely(v < - t)) {
 526                 s8 overstep = t >> 1;
 527
 528                 node_page_state_add(v - overstep, pgdat, item);
 529                 __this_cpu_write(*p, overstep);
 530         }
 531
 532         preempt_enable_nested();
 533 }
 534
 535 void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
 536 {
 537         __dec_zone_state(page_zone(page), item);
 538 }
 539 EXPORT_SYMBOL(__dec_zone_page_state);
 540
 541 void __dec_node_page_state(struct page *page, enum node_stat_item item)
 542 {
 543         __dec_node_state(page_pgdat(page), item);
 544 }
 545 EXPORT_SYMBOL(__dec_node_page_state);
 546
 547 #ifdef CONFIG_HAVE_CMPXCHG_LOCAL
 548 /*
 549  * If we have cmpxchg_local support then we do not need to incur the overhead
 550  * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
 551  *
 552  * mod_state() modifies the zone counter state through atomic per cpu
 553  * operations.
 554  *
 555  * Overstep mode specifies how overstep should handled:
 556  *     0       No overstepping
 557  *     1       Overstepping half of threshold
 558  *     -1      Overstepping minus half of threshold
 559 */
 560 static inline void mod_zone_state(struct zone *zone,
 561        enum zone_stat_item item, long delta, int overstep_mode)
 562 {
 563         struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
 564         s8 __percpu *p = pcp->vm_stat_diff + item;
 565         long n, t, z;
 566         s8 o;
 567
 568         o = this_cpu_read(*p);
 569         do {
 570                 z = 0;  /* overflow to zone counters */
 571
 572                 /*
 573                  * The fetching of the stat_threshold is racy. We may apply
 574                  * a counter threshold to the wrong the cpu if we get
 575                  * rescheduled while executing here. However, the next
 576                  * counter update will apply the threshold again and
 577                  * therefore bring the counter under the threshold again.
 578                  *
 579                  * Most of the time the thresholds are the same anyways
 580                  * for all cpus in a zone.
 581                  */
 582                 t = this_cpu_read(pcp->stat_threshold);
 583
 584                 n = delta + (long)o;
 585
 586                 if (abs(n) > t) {
 587                         int os = overstep_mode * (t >> 1) ;
 588
 589                         /* Overflow must be added to zone counters */
 590                         z = n + os;
 591                         n = -os;
 592                 }
 593         } while (!this_cpu_try_cmpxchg(*p, &o, n));
 594
 595         if (z)
 596                 zone_page_state_add(z, zone, item);
 597 }
 598
 599 void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
 600                          long delta)
 601 {
 602         mod_zone_state(zone, item, delta, 0);
 603 }
 604 EXPORT_SYMBOL(mod_zone_page_state);
 605
 606 void inc_zone_page_state(struct page *page, enum zone_stat_item item)
 607 {
 608         mod_zone_state(page_zone(page), item, 1, 1);
 609 }
 610 EXPORT_SYMBOL(inc_zone_page_state);
 611
 612 void dec_zone_page_state(struct page *page, enum zone_stat_item item)
 613 {
 614         mod_zone_state(page_zone(page), item, -1, -1);
 615 }
 616 EXPORT_SYMBOL(dec_zone_page_state);
 617
 618 static inline void mod_node_state(struct pglist_data *pgdat,
 619        enum node_stat_item item, int delta, int overstep_mode)
 620 {
 621         struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
 622         s8 __percpu *p = pcp->vm_node_stat_diff + item;
 623         long n, t, z;
 624         s8 o;
 625
 626         if (vmstat_item_in_bytes(item)) {
 627                 /*
 628                  * Only cgroups use subpage accounting right now; at
 629                  * the global level, these items still change in
 630                  * multiples of whole pages. Store them as pages
 631                  * internally to keep the per-cpu counters compact.
 632                  */
 633                 VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
 634                 delta >>= PAGE_SHIFT;
 635         }
 636
 637         o = this_cpu_read(*p);
 638         do {
 639                 z = 0;  /* overflow to node counters */
 640
 641                 /*
 642                  * The fetching of the stat_threshold is racy. We may apply
 643                  * a counter threshold to the wrong the cpu if we get
 644                  * rescheduled while executing here. However, the next
 645                  * counter update will apply the threshold again and
 646                  * therefore bring the counter under the threshold again.
 647                  *
 648                  * Most of the time the thresholds are the same anyways
 649                  * for all cpus in a node.
 650                  */
 651                 t = this_cpu_read(pcp->stat_threshold);
 652
 653                 n = delta + (long)o;
 654
 655                 if (abs(n) > t) {
 656                         int os = overstep_mode * (t >> 1) ;
 657
 658                         /* Overflow must be added to node counters */
 659                         z = n + os;
 660                         n = -os;
 661                 }
 662         } while (!this_cpu_try_cmpxchg(*p, &o, n));
 663
 664         if (z)
 665                 node_page_state_add(z, pgdat, item);
 666 }
 667
 668 void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
 669                                         long delta)
 670 {
 671         mod_node_state(pgdat, item, delta, 0);
 672 }
 673 EXPORT_SYMBOL(mod_node_page_state);
 674
 675 void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
 676 {
 677         mod_node_state(pgdat, item, 1, 1);
 678 }
 679
 680 void inc_node_page_state(struct page *page, enum node_stat_item item)
 681 {
 682         mod_node_state(page_pgdat(page), item, 1, 1);
 683 }
 684 EXPORT_SYMBOL(inc_node_page_state);
 685
 686 void dec_node_page_state(struct page *page, enum node_stat_item item)
 687 {
 688         mod_node_state(page_pgdat(page), item, -1, -1);
 689 }
 690 EXPORT_SYMBOL(dec_node_page_state);
 691 #else
 692 /*
 693  * Use interrupt disable to serialize counter updates
 694  */
 695 void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
 696                          long delta)
 697 {
 698         unsigned long flags;
 699
 700         local_irq_save(flags);
 701         __mod_zone_page_state(zone, item, delta);
 702         local_irq_restore(flags);
 703 }
 704 EXPORT_SYMBOL(mod_zone_page_state);
 705
 706 void inc_zone_page_state(struct page *page, enum zone_stat_item item)
 707 {
 708         unsigned long flags;
 709         struct zone *zone;
 710
 711         zone = page_zone(page);
 712         local_irq_save(flags);
 713         __inc_zone_state(zone, item);
 714         local_irq_restore(flags);
 715 }
 716 EXPORT_SYMBOL(inc_zone_page_state);
 717
 718 void dec_zone_page_state(struct page *page, enum zone_stat_item item)
 719 {
 720         unsigned long flags;
 721
 722         local_irq_save(flags);
 723         __dec_zone_page_state(page, item);
 724         local_irq_restore(flags);
 725 }
 726 EXPORT_SYMBOL(dec_zone_page_state);
 727
 728 void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
 729 {
 730         unsigned long flags;
 731
 732         local_irq_save(flags);
 733         __inc_node_state(pgdat, item);
 734         local_irq_restore(flags);
 735 }
 736 EXPORT_SYMBOL(inc_node_state);
 737
 738 void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
 739                                         long delta)
 740 {
 741         unsigned long flags;
 742
 743         local_irq_save(flags);
 744         __mod_node_page_state(pgdat, item, delta);
 745         local_irq_restore(flags);
 746 }
 747 EXPORT_SYMBOL(mod_node_page_state);
 748
 749 void inc_node_page_state(struct page *page, enum node_stat_item item)
 750 {
 751         unsigned long flags;
 752         struct pglist_data *pgdat;
 753
 754         pgdat = page_pgdat(page);
 755         local_irq_save(flags);
 756         __inc_node_state(pgdat, item);
 757         local_irq_restore(flags);
 758 }
 759 EXPORT_SYMBOL(inc_node_page_state);
 760
 761 void dec_node_page_state(struct page *page, enum node_stat_item item)
 762 {
 763         unsigned long flags;
 764
 765         local_irq_save(flags);
 766         __dec_node_page_state(page, item);
 767         local_irq_restore(flags);
 768 }
 769 EXPORT_SYMBOL(dec_node_page_state);
 770 #endif
 771
 772 /*
 773  * Fold a differential into the global counters.
 774  * Returns the number of counters updated.
 775  */
 776 static int fold_diff(int *zone_diff, int *node_diff)
 777 {
 778         int i;
 779         int changes = 0;
 780
 781         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
 782                 if (zone_diff[i]) {
 783                         atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
 784                         changes++;
 785         }
 786
 787         for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
 788                 if (node_diff[i]) {
 789                         atomic_long_add(node_diff[i], &vm_node_stat[i]);
 790                         changes++;
 791         }
 792         return changes;
 793 }
 794
 795 /*
 796  * Update the zone counters for the current cpu.
 797  *
 798  * Note that refresh_cpu_vm_stats strives to only access
 799  * node local memory. The per cpu pagesets on remote zones are placed
 800  * in the memory local to the processor using that pageset. So the
 801  * loop over all zones will access a series of cachelines local to
 802  * the processor.
 803  *
 804  * The call to zone_page_state_add updates the cachelines with the
 805  * statistics in the remote zone struct as well as the global cachelines
 806  * with the global counters. These could cause remote node cache line
 807  * bouncing and will have to be only done when necessary.
 808  *
 809  * The function returns the number of global counters updated.
 810  */
 811 static int refresh_cpu_vm_stats(bool do_pagesets)
 812 {
 813         struct pglist_data *pgdat;
 814         struct zone *zone;
 815         int i;
 816         int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
 817         int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
 818         int changes = 0;
 819
 820         for_each_populated_zone(zone) {
 821                 struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
 822                 struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
 823
 824                 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
 825                         int v;
 826
 827                         v = this_cpu_xchg(pzstats->vm_stat_diff[i], 0);
 828                         if (v) {
 829
 830                                 atomic_long_add(v, &zone->vm_stat[i]);
 831                                 global_zone_diff[i] += v;
 832 #ifdef CONFIG_NUMA
 833                                 /* 3 seconds idle till flush */
 834                                 __this_cpu_write(pcp->expire, 3);
 835 #endif
 836                         }
 837                 }
 838
 839                 if (do_pagesets) {
 840                         cond_resched();
 841
 842                         changes += decay_pcp_high(zone, this_cpu_ptr(pcp));
 843 #ifdef CONFIG_NUMA
 844                         /*
 845                          * Deal with draining the remote pageset of this
 846                          * processor
 847                          *
 848                          * Check if there are pages remaining in this pageset
 849                          * if not then there is nothing to expire.
 850                          */
 851                         if (!__this_cpu_read(pcp->expire) ||
 852                                !__this_cpu_read(pcp->count))
 853                                 continue;
 854
 855                         /*
 856                          * We never drain zones local to this processor.
 857                          */
 858                         if (zone_to_nid(zone) == numa_node_id()) {
 859                                 __this_cpu_write(pcp->expire, 0);
 860                                 continue;
 861                         }
 862
 863                         if (__this_cpu_dec_return(pcp->expire)) {
 864                                 changes++;
 865                                 continue;
 866                         }
 867
 868                         if (__this_cpu_read(pcp->count)) {
 869                                 drain_zone_pages(zone, this_cpu_ptr(pcp));
 870                                 changes++;
 871                         }
 872 #endif
 873                 }
 874         }
 875
 876         for_each_online_pgdat(pgdat) {
 877                 struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
 878
 879                 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
 880                         int v;
 881
 882                         v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
 883                         if (v) {
 884                                 atomic_long_add(v, &pgdat->vm_stat[i]);
 885                                 global_node_diff[i] += v;
 886                         }
 887                 }
 888         }
 889
 890         changes += fold_diff(global_zone_diff, global_node_diff);
 891         return changes;
 892 }
 893
 894 /*
 895  * Fold the data for an offline cpu into the global array.
 896  * There cannot be any access by the offline cpu and therefore
 897  * synchronization is simplified.
 898  */
 899 void cpu_vm_stats_fold(int cpu)
 900 {
 901         struct pglist_data *pgdat;
 902         struct zone *zone;
 903         int i;
 904         int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
 905         int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
 906
 907         for_each_populated_zone(zone) {
 908                 struct per_cpu_zonestat *pzstats;
 909
 910                 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
 911
 912                 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
 913                         if (pzstats->vm_stat_diff[i]) {
 914                                 int v;
 915
 916                                 v = pzstats->vm_stat_diff[i];
 917                                 pzstats->vm_stat_diff[i] = 0;
 918                                 atomic_long_add(v, &zone->vm_stat[i]);
 919                                 global_zone_diff[i] += v;
 920                         }
 921                 }
 922 #ifdef CONFIG_NUMA
 923                 for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
 924                         if (pzstats->vm_numa_event[i]) {
 925                                 unsigned long v;
 926
 927                                 v = pzstats->vm_numa_event[i];
 928                                 pzstats->vm_numa_event[i] = 0;
 929                                 zone_numa_event_add(v, zone, i);
 930                         }
 931                 }
 932 #endif
 933         }
 934
 935         for_each_online_pgdat(pgdat) {
 936                 struct per_cpu_nodestat *p;
 937
 938                 p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
 939
 940                 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
 941                         if (p->vm_node_stat_diff[i]) {
 942                                 int v;
 943
 944                                 v = p->vm_node_stat_diff[i];
 945                                 p->vm_node_stat_diff[i] = 0;
 946                                 atomic_long_add(v, &pgdat->vm_stat[i]);
 947                                 global_node_diff[i] += v;
 948                         }
 949         }
 950
 951         fold_diff(global_zone_diff, global_node_diff);
 952 }
 953
 954 /*
 955  * this is only called if !populated_zone(zone), which implies no other users of
 956  * pset->vm_stat_diff[] exist.
 957  */
 958 void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats)
 959 {
 960         unsigned long v;
 961         int i;
 962
 963         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
 964                 if (pzstats->vm_stat_diff[i]) {
 965                         v = pzstats->vm_stat_diff[i];
 966                         pzstats->vm_stat_diff[i] = 0;
 967                         zone_page_state_add(v, zone, i);
 968                 }
 969         }
 970
 971 #ifdef CONFIG_NUMA
 972         for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
 973                 if (pzstats->vm_numa_event[i]) {
 974                         v = pzstats->vm_numa_event[i];
 975                         pzstats->vm_numa_event[i] = 0;
 976                         zone_numa_event_add(v, zone, i);
 977                 }
 978         }
 979 #endif
 980 }
 981 #endif
 982
 983 #ifdef CONFIG_NUMA
 984 /*
 985  * Determine the per node value of a stat item. This function
 986  * is called frequently in a NUMA machine, so try to be as
 987  * frugal as possible.
 988  */
 989 unsigned long sum_zone_node_page_state(int node,
 990                                  enum zone_stat_item item)
 991 {
 992         struct zone *zones = NODE_DATA(node)->node_zones;
 993         int i;
 994         unsigned long count = 0;
 995
 996         for (i = 0; i < MAX_NR_ZONES; i++)
 997                 count += zone_page_state(zones + i, item);
 998
 999         return count;
1000 }
1001
1002 /* Determine the per node value of a numa stat item. */
1003 unsigned long sum_zone_numa_event_state(int node,
1004                                  enum numa_stat_item item)
1005 {
1006         struct zone *zones = NODE_DATA(node)->node_zones;
1007         unsigned long count = 0;
1008         int i;
1009
1010         for (i = 0; i < MAX_NR_ZONES; i++)
1011                 count += zone_numa_event_state(zones + i, item);
1012
1013         return count;
1014 }
1015
1016 /*
1017  * Determine the per node value of a stat item.
1018  */
1019 unsigned long node_page_state_pages(struct pglist_data *pgdat,
1020                                     enum node_stat_item item)
1021 {
1022         long x = atomic_long_read(&pgdat->vm_stat[item]);
1023 #ifdef CONFIG_SMP
1024         if (x < 0)
1025                 x = 0;
1026 #endif
1027         return x;
1028 }
1029
/*
 * Accessor for a per-node stat item.
 *
 * Warns once if vmstat_item_in_bytes() is true for @item, then returns
 * whatever node_page_state_pages() reports for it.
 */
unsigned long node_page_state(struct pglist_data *pgdat,
                              enum node_stat_item item)
{
        VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));

        return node_page_state_pages(pgdat, item);
}
1037 #endif
1038
/*
 * Count number of pages "struct page" and "struct page_ext" consume.
 * nr_memmap_boot_pages: # of pages allocated by boot allocator
 * nr_memmap_pages: # of pages that were allocated by buddy allocator
 *
 * Both counters start at zero and are adjusted with signed deltas through
 * memmap_boot_pages_add() and memmap_pages_add() respectively.
 */
static atomic_long_t nr_memmap_boot_pages = ATOMIC_LONG_INIT(0);
static atomic_long_t nr_memmap_pages = ATOMIC_LONG_INIT(0);
1046
/*
 * Atomically add @delta to nr_memmap_boot_pages. @delta is signed, so a
 * negative value lowers the counter.
 */
void memmap_boot_pages_add(long delta)
{
        atomic_long_add(delta, &nr_memmap_boot_pages);
}
1051
/*
 * Atomically add @delta to nr_memmap_pages. @delta is signed, so a
 * negative value lowers the counter.
 */
void memmap_pages_add(long delta)
{
        atomic_long_add(delta, &nr_memmap_pages);
}
1056
1057 #ifdef CONFIG_COMPACTION
1058
/*
 * Free-memory summary of one zone with respect to a target order, as filled
 * in by fill_contig_page_info().
 */
struct contig_page_info {
        /* Free base pages: sum over all orders of (nr_free << order) */
        unsigned long free_pages;
        /* Free blocks of any order: sum over all orders of nr_free */
        unsigned long free_blocks_total;
        /*
         * Number of target-order blocks available from free blocks of the
         * target order or larger: sum of nr_free << (order - target order)
         */
        unsigned long free_blocks_suitable;
};
1064
1065 /*
1066  * Calculate the number of free pages in a zone, how many contiguous
1067  * pages are free and how many are large enough to satisfy an allocation of
1068  * the target size. Note that this function makes no attempt to estimate
1069  * how many suitable free blocks there *might* be if MOVABLE pages were
1070  * migrated. Calculating that is possible, but expensive and can be
1071  * figured out from userspace
1072  */
1073 static void fill_contig_page_info(struct zone *zone,
1074                                 unsigned int suitable_order,
1075                                 struct contig_page_info *info)
1076 {
1077         unsigned int order;
1078
1079         info->free_pages = 0;
1080         info->free_blocks_total = 0;
1081         info->free_blocks_suitable = 0;
1082
1083         for (order = 0; order < NR_PAGE_ORDERS; order++) {
1084                 unsigned long blocks;
1085
1086                 /*
1087                  * Count number of free blocks.
1088                  *
1089                  * Access to nr_free is lockless as nr_free is used only for
1090                  * diagnostic purposes. Use data_race to avoid KCSAN warning.
1091                  */
1092                 blocks = data_race(zone->free_area[order].nr_free);
1093                 info->free_blocks_total += blocks;
1094
1095                 /* Count free base pages */
1096                 info->free_pages += blocks << order;
1097
1098                 /* Count the suitable free blocks */
1099                 if (order >= suitable_order)
1100                         info->free_blocks_suitable += blocks <<
1101                                                 (order - suitable_order);
1102         }
1103 }
1104
1105 /*
1106  * A fragmentation index only makes sense if an allocation of a requested
1107  * size would fail. If that is true, the fragmentation index indicates
1108  * whether external fragmentation or a lack of memory was the problem.
1109  * The value can be used to determine if page reclaim or compaction
1110  * should be used
1111  */
1112 static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
1113 {
1114         unsigned long requested = 1UL << order;
1115
1116         if (WARN_ON_ONCE(order > MAX_PAGE_ORDER))
1117                 return 0;
1118
1119         if (!info->free_blocks_total)
1120                 return 0;
1121
1122         /* Fragmentation index only makes sense when a request would fail */
1123         if (info->free_blocks_suitable)
1124                 return -1000;
1125
1126         /*
1127          * Index is between 0 and 1 so return within 3 decimal places
1128          *
1129          * 0 => allocation would fail due to lack of memory
1130          * 1 => allocation would fail due to fragmentation
1131          */
1132         return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
1133 }
1134
1135 /*
1136  * Calculates external fragmentation within a zone wrt the given order.
1137  * It is defined as the percentage of pages found in blocks of size
1138  * less than 1 << order. It returns values in range [0, 100].
1139  */
1140 unsigned int extfrag_for_order(struct zone *zone, unsigned int order)
1141 {
1142         struct contig_page_info info;
1143
1144         fill_contig_page_info(zone, order, &info);
1145         if (info.free_pages == 0)
1146                 return 0;
1147
1148         return div_u64((info.free_pages -
1149                         (info.free_blocks_suitable << order)) * 100,
1150                         info.free_pages);
1151 }
1152
/*
 * Same as __fragmentation_index() but gathers the contig_page_info for
 * @zone itself, in a structure on the stack, before computing the index
 * for @order. The return value is that of __fragmentation_index().
 */
int fragmentation_index(struct zone *zone, unsigned int order)
{
        struct contig_page_info info;

        fill_contig_page_info(zone, order, &info);
        return __fragmentation_index(order, &info);
}
1161 #endif
1162
1163 #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \
1164     defined(CONFIG_NUMA) || defined(CONFIG_MEMCG)
/*
 * Helpers that build per-zone counter names for the string table below.
 *
 * Each TEXT_FOR_<ZONE>(xx) expands to the literal xx with a zone suffix
 * appended, followed by a comma, when the matching zone is configured,
 * and to nothing otherwise.
 */
#ifdef CONFIG_ZONE_DMA
#define TEXT_FOR_DMA(xx) xx "_dma",
#else
#define TEXT_FOR_DMA(xx)
#endif

#ifdef CONFIG_ZONE_DMA32
#define TEXT_FOR_DMA32(xx) xx "_dma32",
#else
#define TEXT_FOR_DMA32(xx)
#endif

#ifdef CONFIG_HIGHMEM
#define TEXT_FOR_HIGHMEM(xx) xx "_high",
#else
#define TEXT_FOR_HIGHMEM(xx)
#endif

#ifdef CONFIG_ZONE_DEVICE
#define TEXT_FOR_DEVICE(xx) xx "_device",
#else
#define TEXT_FOR_DEVICE(xx)
#endif

/*
 * Expands to a comma-terminated list of xx-prefixed names, one per
 * configured zone; "_normal" and "_movable" are always present. Because
 * the expansion already ends in a comma, uses in an initializer take no
 * trailing comma of their own.
 */
#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
                                        TEXT_FOR_HIGHMEM(xx) xx "_movable", \
                                        TEXT_FOR_DEVICE(xx)
1192
/*
 * Names of all VM statistics, as one flat table.
 *
 * The entries are grouped by the enum named in each section comment, and
 * entries guarded by a config option are only present when that option is
 * set. NOTE(review): the order within each group presumably has to match
 * the corresponding enum, since the *_stat_name() helpers look names up by
 * item - confirm against the enum definitions before adding or moving an
 * entry.
 */
const char * const vmstat_text[] = {
        /* enum zone_stat_item counters */
        "nr_free_pages",
        "nr_free_pages_blocks",
        "nr_zone_inactive_anon",
        "nr_zone_active_anon",
        "nr_zone_inactive_file",
        "nr_zone_active_file",
        "nr_zone_unevictable",
        "nr_zone_write_pending",
        "nr_mlock",
#if IS_ENABLED(CONFIG_ZSMALLOC)
        "nr_zspages",
#endif
        "nr_free_cma",
#ifdef CONFIG_UNACCEPTED_MEMORY
        "nr_unaccepted",
#endif

        /* enum numa_stat_item counters */
#ifdef CONFIG_NUMA
        "numa_hit",
        "numa_miss",
        "numa_foreign",
        "numa_interleave",
        "numa_local",
        "numa_other",
#endif

        /* enum node_stat_item counters */
        "nr_inactive_anon",
        "nr_active_anon",
        "nr_inactive_file",
        "nr_active_file",
        "nr_unevictable",
        "nr_slab_reclaimable",
        "nr_slab_unreclaimable",
        "nr_isolated_anon",
        "nr_isolated_file",
        "workingset_nodes",
        "workingset_refault_anon",
        "workingset_refault_file",
        "workingset_activate_anon",
        "workingset_activate_file",
        "workingset_restore_anon",
        "workingset_restore_file",
        "workingset_nodereclaim",
        "nr_anon_pages",
        "nr_mapped",
        "nr_file_pages",
        "nr_dirty",
        "nr_writeback",
        "nr_writeback_temp",
        "nr_shmem",
        "nr_shmem_hugepages",
        "nr_shmem_pmdmapped",
        "nr_file_hugepages",
        "nr_file_pmdmapped",
        "nr_anon_transparent_hugepages",
        "nr_vmscan_write",
        "nr_vmscan_immediate_reclaim",
        "nr_dirtied",
        "nr_written",
        "nr_throttled_written",
        "nr_kernel_misc_reclaimable",
        "nr_foll_pin_acquired",
        "nr_foll_pin_released",
        "nr_kernel_stack",
#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
        "nr_shadow_call_stack",
#endif
        "nr_page_table_pages",
        "nr_sec_page_table_pages",
#ifdef CONFIG_IOMMU_SUPPORT
        "nr_iommu_pages",
#endif
#ifdef CONFIG_SWAP
        "nr_swapcached",
#endif
#ifdef CONFIG_NUMA_BALANCING
        "pgpromote_success",
        "pgpromote_candidate",
#endif
        "pgdemote_kswapd",
        "pgdemote_direct",
        "pgdemote_khugepaged",
        "pgdemote_proactive",
#ifdef CONFIG_HUGETLB_PAGE
        "nr_hugetlb",
#endif
        "nr_balloon_pages",
        /* system-wide enum vm_stat_item counters */
        "nr_dirty_threshold",
        "nr_dirty_background_threshold",
        "nr_memmap_pages",
        "nr_memmap_boot_pages",

#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
        /* enum vm_event_item counters */
        "pgpgin",
        "pgpgout",
        "pswpin",
        "pswpout",

        TEXTS_FOR_ZONES("pgalloc")
        TEXTS_FOR_ZONES("allocstall")
        TEXTS_FOR_ZONES("pgskip")

        "pgfree",
        "pgactivate",
        "pgdeactivate",
        "pglazyfree",

        "pgfault",
        "pgmajfault",
        "pglazyfreed",

        "pgrefill",
        "pgreuse",
        "pgsteal_kswapd",
        "pgsteal_direct",
        "pgsteal_khugepaged",
        "pgsteal_proactive",
        "pgscan_kswapd",
        "pgscan_direct",
        "pgscan_khugepaged",
        "pgscan_proactive",
        "pgscan_direct_throttle",
        "pgscan_anon",
        "pgscan_file",
        "pgsteal_anon",
        "pgsteal_file",

#ifdef CONFIG_NUMA
        "zone_reclaim_success",
        "zone_reclaim_failed",
#endif
        "pginodesteal",
        "slabs_scanned",
        "kswapd_inodesteal",
        "kswapd_low_wmark_hit_quickly",
        "kswapd_high_wmark_hit_quickly",
        "pageoutrun",

        "pgrotated",

        "drop_pagecache",
        "drop_slab",
        "oom_kill",

#ifdef CONFIG_NUMA_BALANCING
        "numa_pte_updates",
        "numa_huge_pte_updates",
        "numa_hint_faults",
        "numa_hint_faults_local",
        "numa_pages_migrated",
#endif
#ifdef CONFIG_MIGRATION
        "pgmigrate_success",
        "pgmigrate_fail",
        "thp_migration_success",
        "thp_migration_fail",
        "thp_migration_split",
#endif
#ifdef CONFIG_COMPACTION
        "compact_migrate_scanned",
        "compact_free_scanned",
        "compact_isolated",
        "compact_stall",
        "compact_fail",
        "compact_success",
        "compact_daemon_wake",
        "compact_daemon_migrate_scanned",
        "compact_daemon_free_scanned",
#endif

#ifdef CONFIG_HUGETLB_PAGE
        "htlb_buddy_alloc_success",
        "htlb_buddy_alloc_fail",
#endif
#ifdef CONFIG_CMA
        "cma_alloc_success",
        "cma_alloc_fail",
#endif
        "unevictable_pgs_culled",
        "unevictable_pgs_scanned",
        "unevictable_pgs_rescued",
        "unevictable_pgs_mlocked",
        "unevictable_pgs_munlocked",
        "unevictable_pgs_cleared",
        "unevictable_pgs_stranded",

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        "thp_fault_alloc",
        "thp_fault_fallback",
        "thp_fault_fallback_charge",
        "thp_collapse_alloc",
        "thp_collapse_alloc_failed",
        "thp_file_alloc",
        "thp_file_fallback",
        "thp_file_fallback_charge",
        "thp_file_mapped",
        "thp_split_page",
        "thp_split_page_failed",
        "thp_deferred_split_page",
        "thp_underused_split_page",
        "thp_split_pmd",
        "thp_scan_exceed_none_pte",
        "thp_scan_exceed_swap_pte",
        "thp_scan_exceed_share_pte",
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
        "thp_split_pud",
#endif
        "thp_zero_page_alloc",
        "thp_zero_page_alloc_failed",
        "thp_swpout",
        "thp_swpout_fallback",
#endif
#ifdef CONFIG_MEMORY_BALLOON
        "balloon_inflate",
        "balloon_deflate",
#ifdef CONFIG_BALLOON_COMPACTION
        "balloon_migrate",
#endif
#endif /* CONFIG_MEMORY_BALLOON */
#ifdef CONFIG_DEBUG_TLBFLUSH
        "nr_tlb_remote_flush",
        "nr_tlb_remote_flush_received",
        "nr_tlb_local_flush_all",
        "nr_tlb_local_flush_one",
#endif /* CONFIG_DEBUG_TLBFLUSH */

#ifdef CONFIG_SWAP
        "swap_ra",
        "swap_ra_hit",
        "swpin_zero",
        "swpout_zero",
#ifdef CONFIG_KSM
        "ksm_swpin_copy",
#endif
#endif
#ifdef CONFIG_KSM
        "cow_ksm",
#endif
#ifdef CONFIG_ZSWAP
        "zswpin",
        "zswpout",
        "zswpwb",
#endif
#ifdef CONFIG_X86
        "direct_map_level2_splits",
        "direct_map_level3_splits",
        "direct_map_level2_collapses",
        "direct_map_level3_collapses",
#endif
#ifdef CONFIG_PER_VMA_LOCK_STATS
        "vma_lock_success",
        "vma_lock_abort",
        "vma_lock_retry",
        "vma_lock_miss",
#endif
#ifdef CONFIG_DEBUG_STACK_USAGE
        "kstack_1k",
#if THREAD_SIZE > 1024
        "kstack_2k",
#endif
#if THREAD_SIZE > 2048
        "kstack_4k",
#endif
#if THREAD_SIZE > 4096
        "kstack_8k",
#endif
#if THREAD_SIZE > 8192
        "kstack_16k",
#endif
#if THREAD_SIZE > 16384
        "kstack_32k",
#endif
#if THREAD_SIZE > 32768
        "kstack_64k",
#endif
#if THREAD_SIZE > 65536
        "kstack_rest",
#endif
#endif
#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
};
1480 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
1481
1482 #if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
1483      defined(CONFIG_PROC_FS)
1484 static void *frag_start(struct seq_file *m, loff_t *pos)
1485 {
1486         pg_data_t *pgdat;
1487         loff_t node = *pos;
1488
1489         for (pgdat = first_online_pgdat();
1490              pgdat && node;
1491              pgdat = next_online_pgdat(pgdat))
1492                 --node;
1493
1494         return pgdat;
1495 }
1496
1497 static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
1498 {
1499         pg_data_t *pgdat = (pg_data_t *)arg;
1500
1501         (*pos)++;
1502         return next_online_pgdat(pgdat);
1503 }
1504
/*
 * seq_file ->stop: intentionally empty, as frag_start() and frag_next()
 * acquire nothing that would need releasing here.
 */
static void frag_stop(struct seq_file *m, void *arg)
{
}
1508
1509 /*
1510  * Walk zones in a node and print using a callback.
1511  * If @assert_populated is true, only use callback for zones that are populated.
1512  */
1513 static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
1514                 bool assert_populated, bool nolock,
1515                 void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
1516 {
1517         struct zone *zone;
1518         struct zone *node_zones = pgdat->node_zones;
1519         unsigned long flags;
1520
1521         for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
1522                 if (assert_populated && !populated_zone(zone))
1523                         continue;
1524
1525                 if (!nolock)
1526                         spin_lock_irqsave(&zone->lock, flags);
1527                 print(m, pgdat, zone);
1528                 if (!nolock)
1529                         spin_unlock_irqrestore(&zone->lock, flags);
1530         }
1531 }
1532 #endif
1533
1534 #ifdef CONFIG_PROC_FS
1535 static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
1536                                                 struct zone *zone)
1537 {
1538         int order;
1539
1540         seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1541         for (order = 0; order < NR_PAGE_ORDERS; ++order)
1542                 /*
1543                  * Access to nr_free is lockless as nr_free is used only for
1544                  * printing purposes. Use data_race to avoid KCSAN warning.
1545                  */
1546                 seq_printf(m, "%6lu ", data_race(zone->free_area[order].nr_free));
1547         seq_putc(m, '\n');
1548 }
1549
1550 /*
1551  * This walks the free areas for each zone.
1552  */
1553 static int frag_show(struct seq_file *m, void *arg)
1554 {
1555         pg_data_t *pgdat = (pg_data_t *)arg;
1556         walk_zones_in_node(m, pgdat, true, false, frag_show_print);
1557         return 0;
1558 }
1559
/*
 * For each migrate type, print one line for @zone with the number of
 * entries on that type's free list at every order. A count that hit the
 * iteration cap is printed with a leading ">".
 *
 * Reached through walk_zones_in_node() with nolock == false, so zone->lock
 * is held on entry and is held again on return; it is dropped briefly
 * after each order (see below).
 */
static void pagetypeinfo_showfree_print(struct seq_file *m,
                                        pg_data_t *pgdat, struct zone *zone)
{
        int order, mtype;

        for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
                seq_printf(m, "Node %4d, zone %8s, type %12s ",
                                        pgdat->node_id,
                                        zone->name,
                                        migratetype_names[mtype]);
                for (order = 0; order < NR_PAGE_ORDERS; ++order) {
                        unsigned long freecount = 0;
                        struct free_area *area;
                        struct list_head *curr;
                        bool overflow = false;

                        area = &(zone->free_area[order]);

                        list_for_each(curr, &area->free_list[mtype]) {
                                /*
                                 * Cap the free_list iteration because it might
                                 * be really large and we are under a spinlock
                                 * so a long time spent here could trigger a
                                 * hard lockup detector. Anyway this is a
                                 * debugging tool so knowing there is a handful
                                 * of pages of this order should be more than
                                 * sufficient.
                                 */
                                if (++freecount >= 100000) {
                                        overflow = true;
                                        break;
                                }
                        }
                        seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount);
                        /*
                         * Release zone->lock between orders so that we can
                         * reschedule, then take it again before touching
                         * the next free list.
                         */
                        spin_unlock_irq(&zone->lock);
                        cond_resched();
                        spin_lock_irq(&zone->lock);
                }
                seq_putc(m, '\n');
        }
}
1601
1602 /* Print out the free pages at each order for each migatetype */
1603 static void pagetypeinfo_showfree(struct seq_file *m, void *arg)
1604 {
1605         int order;
1606         pg_data_t *pgdat = (pg_data_t *)arg;
1607
1608         /* Print header */
1609         seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
1610         for (order = 0; order < NR_PAGE_ORDERS; ++order)
1611                 seq_printf(m, "%6d ", order);
1612         seq_putc(m, '\n');
1613
1614         walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);
1615 }
1616
1617 static void pagetypeinfo_showblockcount_print(struct seq_file *m,
1618                                         pg_data_t *pgdat, struct zone *zone)
1619 {
1620         int mtype;
1621         unsigned long pfn;
1622         unsigned long start_pfn = zone->zone_start_pfn;
1623         unsigned long end_pfn = zone_end_pfn(zone);
1624         unsigned long count[MIGRATE_TYPES] = { 0, };
1625
1626         for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
1627                 struct page *page;
1628
1629                 page = pfn_to_online_page(pfn);
1630                 if (!page)
1631                         continue;
1632
1633                 if (page_zone(page) != zone)
1634                         continue;
1635
1636                 mtype = get_pageblock_migratetype(page);
1637
1638                 if (mtype < MIGRATE_TYPES)
1639                         count[mtype]++;
1640         }
1641
1642         /* Print counts */
1643         seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1644         for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1645                 seq_printf(m, "%12lu ", count[mtype]);
1646         seq_putc(m, '\n');
1647 }
1648
1649 /* Print out the number of pageblocks for each migratetype */
1650 static void pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
1651 {
1652         int mtype;
1653         pg_data_t *pgdat = (pg_data_t *)arg;
1654
1655         seq_printf(m, "\n%-23s", "Number of blocks type ");
1656         for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1657                 seq_printf(m, "%12s ", migratetype_names[mtype]);
1658         seq_putc(m, '\n');
1659         walk_zones_in_node(m, pgdat, true, false,
1660                 pagetypeinfo_showblockcount_print);
1661 }
1662
1663 /*
1664  * Print out the number of pageblocks for each migratetype that contain pages
1665  * of other types. This gives an indication of how well fallbacks are being
1666  * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
1667  * to determine what is going on
1668  */
1669 static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
1670 {
1671 #ifdef CONFIG_PAGE_OWNER
1672         int mtype;
1673
1674         if (!static_branch_unlikely(&page_owner_inited))
1675                 return;
1676
1677         drain_all_pages(NULL);
1678
1679         seq_printf(m, "\n%-23s", "Number of mixed blocks ");
1680         for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1681                 seq_printf(m, "%12s ", migratetype_names[mtype]);
1682         seq_putc(m, '\n');
1683
1684         walk_zones_in_node(m, pgdat, true, true,
1685                 pagetypeinfo_showmixedcount_print);
1686 #endif /* CONFIG_PAGE_OWNER */
1687 }
1688
1689 /*
1690  * This prints out statistics in relation to grouping pages by mobility.
1691  * It is expensive to collect so do not constantly read the file.
1692  */
1693 static int pagetypeinfo_show(struct seq_file *m, void *arg)
1694 {
1695         pg_data_t *pgdat = (pg_data_t *)arg;
1696
1697         /* check memoryless node */
1698         if (!node_state(pgdat->node_id, N_MEMORY))
1699                 return 0;
1700
1701         seq_printf(m, "Page block order: %d\n", pageblock_order);
1702         seq_printf(m, "Pages per block:  %lu\n", pageblock_nr_pages);
1703         seq_putc(m, '\n');
1704         pagetypeinfo_showfree(m, pgdat);
1705         pagetypeinfo_showblockcount(m, pgdat);
1706         pagetypeinfo_showmixedcount(m, pgdat);
1707
1708         return 0;
1709 }
1710
/*
 * seq_file iterator that visits the online nodes via frag_start() and
 * frag_next() and prints each of them with frag_show().
 */
static const struct seq_operations fragmentation_op = {
        .start  = frag_start,
        .next   = frag_next,
        .stop   = frag_stop,
        .show   = frag_show,
};
1717
/*
 * seq_file iterator that visits the online nodes via frag_start() and
 * frag_next() and prints each of them with pagetypeinfo_show().
 */
static const struct seq_operations pagetypeinfo_op = {
        .start  = frag_start,
        .next   = frag_next,
        .stop   = frag_stop,
        .show   = pagetypeinfo_show,
};
1724
1725 static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
1726 {
1727         int zid;
1728
1729         for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1730                 struct zone *compare = &pgdat->node_zones[zid];
1731
1732                 if (populated_zone(compare))
1733                         return zone == compare;
1734         }
1735
1736         return false;
1737 }
1738
1739 static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1740                                                         struct zone *zone)
1741 {
1742         int i;
1743         seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
1744         if (is_zone_first_populated(pgdat, zone)) {
1745                 seq_printf(m, "\n  per-node stats");
1746                 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
1747                         unsigned long pages = node_page_state_pages(pgdat, i);
1748
1749                         if (vmstat_item_print_in_thp(i))
1750                                 pages /= HPAGE_PMD_NR;
1751                         seq_printf(m, "\n      %-12s %lu", node_stat_name(i),
1752                                    pages);
1753                 }
1754         }
1755         seq_printf(m,
1756                    "\n  pages free     %lu"
1757                    "\n        boost    %lu"
1758                    "\n        min      %lu"
1759                    "\n        low      %lu"
1760                    "\n        high     %lu"
1761                    "\n        promo    %lu"
1762                    "\n        spanned  %lu"
1763                    "\n        present  %lu"
1764                    "\n        managed  %lu"
1765                    "\n        cma      %lu",
1766                    zone_page_state(zone, NR_FREE_PAGES),
1767                    zone->watermark_boost,
1768                    min_wmark_pages(zone),
1769                    low_wmark_pages(zone),
1770                    high_wmark_pages(zone),
1771                    promo_wmark_pages(zone),
1772                    zone->spanned_pages,
1773                    zone->present_pages,
1774                    zone_managed_pages(zone),
1775                    zone_cma_pages(zone));
1776
1777         seq_printf(m,
1778                    "\n        protection: (%ld",
1779                    zone->lowmem_reserve[0]);
1780         for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
1781                 seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
1782         seq_putc(m, ')');
1783
1784         /* If unpopulated, no other information is useful */
1785         if (!populated_zone(zone)) {
1786                 seq_putc(m, '\n');
1787                 return;
1788         }
1789
1790         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
1791                 seq_printf(m, "\n      %-12s %lu", zone_stat_name(i),
1792                            zone_page_state(zone, i));
1793
1794 #ifdef CONFIG_NUMA
1795         fold_vm_zone_numa_events(zone);
1796         for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
1797                 seq_printf(m, "\n      %-12s %lu", numa_stat_name(i),
1798                            zone_numa_event_state(zone, i));
1799 #endif
1800
1801         seq_printf(m, "\n  pagesets");
1802         for_each_online_cpu(i) {
1803                 struct per_cpu_pages *pcp;
1804                 struct per_cpu_zonestat __maybe_unused *pzstats;
1805
1806                 pcp = per_cpu_ptr(zone->per_cpu_pageset, i);
1807                 seq_printf(m,
1808                            "\n    cpu: %i"
1809                            "\n              count:    %i"
1810                            "\n              high:     %i"
1811                            "\n              batch:    %i"
1812                            "\n              high_min: %i"
1813                            "\n              high_max: %i",
1814                            i,
1815                            pcp->count,
1816                            pcp->high,
1817                            pcp->batch,
1818                            pcp->high_min,
1819                            pcp->high_max);
1820 #ifdef CONFIG_SMP
1821                 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i);
1822                 seq_printf(m, "\n  vm stats threshold: %d",
1823                                 pzstats->stat_threshold);
1824 #endif
1825         }
1826         seq_printf(m,
1827                    "\n  node_unreclaimable:  %u"
1828                    "\n  start_pfn:           %lu",
1829                    pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
1830                    zone->zone_start_pfn);
1831         seq_putc(m, '\n');
1832 }
1833
1834 /*
1835  * Output information about zones in @pgdat.  All zones are printed regardless
1836  * of whether they are populated or not: lowmem_reserve_ratio operates on the
1837  * set of all zones and userspace would not be aware of such zones if they are
1838  * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
1839  */
1840 static int zoneinfo_show(struct seq_file *m, void *arg)
1841 {
1842         pg_data_t *pgdat = (pg_data_t *)arg;
1843         walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print);
1844         return 0;
1845 }
1846
1847 static const struct seq_operations zoneinfo_op = {
1848         .start  = frag_start, /* iterate over all zones. The same as in
1849                                * fragmentation. */
1850         .next   = frag_next,
1851         .stop   = frag_stop,
1852         .show   = zoneinfo_show,
1853 };
1854
/*
 * Number of values in the array built by vmstat_start(): zone, NUMA event,
 * node and VM stat items, plus the VM event counters when
 * CONFIG_VM_EVENT_COUNTERS is enabled.
 */
#define NR_VMSTAT_ITEMS \
        (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + \
         NR_VM_NODE_STAT_ITEMS + NR_VM_STAT_ITEMS + \
         (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? NR_VM_EVENT_ITEMS : 0))
1861
1862 static void *vmstat_start(struct seq_file *m, loff_t *pos)
1863 {
1864         unsigned long *v;
1865         int i;
1866
1867         if (*pos >= NR_VMSTAT_ITEMS)
1868                 return NULL;
1869
1870         BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
1871         fold_vm_numa_events();
1872         v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
1873         m->private = v;
1874         if (!v)
1875                 return ERR_PTR(-ENOMEM);
1876         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
1877                 v[i] = global_zone_page_state(i);
1878         v += NR_VM_ZONE_STAT_ITEMS;
1879
1880 #ifdef CONFIG_NUMA
1881         for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
1882                 v[i] = global_numa_event_state(i);
1883         v += NR_VM_NUMA_EVENT_ITEMS;
1884 #endif
1885
1886         for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
1887                 v[i] = global_node_page_state_pages(i);
1888                 if (vmstat_item_print_in_thp(i))
1889                         v[i] /= HPAGE_PMD_NR;
1890         }
1891         v += NR_VM_NODE_STAT_ITEMS;
1892
1893         global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
1894                             v + NR_DIRTY_THRESHOLD);
1895         v[NR_MEMMAP_PAGES] = atomic_long_read(&nr_memmap_pages);
1896         v[NR_MEMMAP_BOOT_PAGES] = atomic_long_read(&nr_memmap_boot_pages);
1897         v += NR_VM_STAT_ITEMS;
1898
1899 #ifdef CONFIG_VM_EVENT_COUNTERS
1900         all_vm_events(v);
1901         v[PGPGIN] /= 2;         /* sectors -> kbytes */
1902         v[PGPGOUT] /= 2;
1903 #endif
1904         return (unsigned long *)m->private + *pos;
1905 }
1906
1907 static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
1908 {
1909         (*pos)++;
1910         if (*pos >= NR_VMSTAT_ITEMS)
1911                 return NULL;
1912         return (unsigned long *)m->private + *pos;
1913 }
1914
1915 static int vmstat_show(struct seq_file *m, void *arg)
1916 {
1917         unsigned long *l = arg;
1918         unsigned long off = l - (unsigned long *)m->private;
1919
1920         seq_puts(m, vmstat_text[off]);
1921         seq_put_decimal_ull(m, " ", *l);
1922         seq_putc(m, '\n');
1923
1924         if (off == NR_VMSTAT_ITEMS - 1) {
1925                 /*
1926                  * We've come to the end - add any deprecated counters to avoid
1927                  * breaking userspace which might depend on them being present.
1928                  */
1929                 seq_puts(m, "nr_unstable 0\n");
1930         }
1931         return 0;
1932 }
1933
1934 static void vmstat_stop(struct seq_file *m, void *arg)
1935 {
1936         kfree(m->private);
1937         m->private = NULL;
1938 }
1939
1940 static const struct seq_operations vmstat_op = {
1941         .start  = vmstat_start,
1942         .next   = vmstat_next,
1943         .stop   = vmstat_stop,
1944         .show   = vmstat_show,
1945 };
1946 #endif /* CONFIG_PROC_FS */
1947
1948 #ifdef CONFIG_SMP
1949 static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
1950 static int sysctl_stat_interval __read_mostly = HZ;
1951 static int vmstat_late_init_done;
1952
1953 #ifdef CONFIG_PROC_FS
1954 static void refresh_vm_stats(struct work_struct *work)
1955 {
1956         refresh_cpu_vm_stats(true);
1957 }
1958
1959 static int vmstat_refresh(const struct ctl_table *table, int write,
1960                    void *buffer, size_t *lenp, loff_t *ppos)
1961 {
1962         long val;
1963         int err;
1964         int i;
1965
1966         /*
1967          * The regular update, every sysctl_stat_interval, may come later
1968          * than expected: leaving a significant amount in per_cpu buckets.
1969          * This is particularly misleading when checking a quantity of HUGE
1970          * pages, immediately after running a test.  /proc/sys/vm/stat_refresh,
1971          * which can equally be echo'ed to or cat'ted from (by root),
1972          * can be used to update the stats just before reading them.
1973          *
1974          * Oh, and since global_zone_page_state() etc. are so careful to hide
1975          * transiently negative values, report an error here if any of
1976          * the stats is negative, so we know to go looking for imbalance.
1977          */
1978         err = schedule_on_each_cpu(refresh_vm_stats);
1979         if (err)
1980                 return err;
1981         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
1982                 /*
1983                  * Skip checking stats known to go negative occasionally.
1984                  */
1985                 switch (i) {
1986                 case NR_ZONE_WRITE_PENDING:
1987                 case NR_FREE_CMA_PAGES:
1988                         continue;
1989                 }
1990                 val = atomic_long_read(&vm_zone_stat[i]);
1991                 if (val < 0) {
1992                         pr_warn("%s: %s %ld\n",
1993                                 __func__, zone_stat_name(i), val);
1994                 }
1995         }
1996         for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
1997                 /*
1998                  * Skip checking stats known to go negative occasionally.
1999                  */
2000                 switch (i) {
2001                 case NR_WRITEBACK:
2002                         continue;
2003                 }
2004                 val = atomic_long_read(&vm_node_stat[i]);
2005                 if (val < 0) {
2006                         pr_warn("%s: %s %ld\n",
2007                                 __func__, node_stat_name(i), val);
2008                 }
2009         }
2010         if (write)
2011                 *ppos += *lenp;
2012         else
2013                 *lenp = 0;
2014         return 0;
2015 }
2016 #endif /* CONFIG_PROC_FS */
2017
2018 static void vmstat_update(struct work_struct *w)
2019 {
2020         if (refresh_cpu_vm_stats(true)) {
2021                 /*
2022                  * Counters were updated so we expect more updates
2023                  * to occur in the future. Keep on running the
2024                  * update worker thread.
2025                  */
2026                 queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
2027                                 this_cpu_ptr(&vmstat_work),
2028                                 round_jiffies_relative(sysctl_stat_interval));
2029         }
2030 }
2031
2032 /*
2033  * Check if the diffs for a certain cpu indicate that
2034  * an update is needed.
2035  */
2036 static bool need_update(int cpu)
2037 {
2038         pg_data_t *last_pgdat = NULL;
2039         struct zone *zone;
2040
2041         for_each_populated_zone(zone) {
2042                 struct per_cpu_zonestat *pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
2043                 struct per_cpu_nodestat *n;
2044
2045                 /*
2046                  * The fast way of checking if there are any vmstat diffs.
2047                  */
2048                 if (memchr_inv(pzstats->vm_stat_diff, 0, sizeof(pzstats->vm_stat_diff)))
2049                         return true;
2050
2051                 if (last_pgdat == zone->zone_pgdat)
2052                         continue;
2053                 last_pgdat = zone->zone_pgdat;
2054                 n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu);
2055                 if (memchr_inv(n->vm_node_stat_diff, 0, sizeof(n->vm_node_stat_diff)))
2056                         return true;
2057         }
2058         return false;
2059 }
2060
2061 /*
2062  * Switch off vmstat processing and then fold all the remaining differentials
2063  * until the diffs stay at zero. The function is used by NOHZ and can only be
2064  * invoked when tick processing is not active.
2065  */
2066 void quiet_vmstat(void)
2067 {
2068         if (system_state != SYSTEM_RUNNING)
2069                 return;
2070
2071         if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
2072                 return;
2073
2074         if (!need_update(smp_processor_id()))
2075                 return;
2076
2077         /*
2078          * Just refresh counters and do not care about the pending delayed
2079          * vmstat_update. It doesn't fire that often to matter and canceling
2080          * it would be too expensive from this path.
2081          * vmstat_shepherd will take care about that for us.
2082          */
2083         refresh_cpu_vm_stats(false);
2084 }
2085
2086 /*
2087  * Shepherd worker thread that checks the
2088  * differentials of processors that have their worker
2089  * threads for vm statistics updates disabled because of
2090  * inactivity.
2091  */
2092 static void vmstat_shepherd(struct work_struct *w);
2093
2094 static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
2095
2096 static void vmstat_shepherd(struct work_struct *w)
2097 {
2098         int cpu;
2099
2100         cpus_read_lock();
2101         /* Check processors whose vmstat worker threads have been disabled */
2102         for_each_online_cpu(cpu) {
2103                 struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
2104
2105                 /*
2106                  * In kernel users of vmstat counters either require the precise value and
2107                  * they are using zone_page_state_snapshot interface or they can live with
2108                  * an imprecision as the regular flushing can happen at arbitrary time and
2109                  * cumulative error can grow (see calculate_normal_threshold).
2110                  *
2111                  * From that POV the regular flushing can be postponed for CPUs that have
2112                  * been isolated from the kernel interference without critical
2113                  * infrastructure ever noticing. Skip regular flushing from vmstat_shepherd
2114                  * for all isolated CPUs to avoid interference with the isolated workload.
2115                  */
2116                 if (cpu_is_isolated(cpu))
2117                         continue;
2118
2119                 if (!delayed_work_pending(dw) && need_update(cpu))
2120                         queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
2121
2122                 cond_resched();
2123         }
2124         cpus_read_unlock();
2125
2126         schedule_delayed_work(&shepherd,
2127                 round_jiffies_relative(sysctl_stat_interval));
2128 }
2129
2130 static void __init start_shepherd_timer(void)
2131 {
2132         int cpu;
2133
2134         for_each_possible_cpu(cpu) {
2135                 INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
2136                         vmstat_update);
2137
2138                 /*
2139                  * For secondary CPUs during CPU hotplug scenarios,
2140                  * vmstat_cpu_online() will enable the work.
2141                  * mm/vmstat:online enables and disables vmstat_work
2142                  * symmetrically during CPU hotplug events.
2143                  */
2144                 if (!cpu_online(cpu))
2145                         disable_delayed_work_sync(&per_cpu(vmstat_work, cpu));
2146         }
2147
2148         schedule_delayed_work(&shepherd,
2149                 round_jiffies_relative(sysctl_stat_interval));
2150 }
2151
2152 static void __init init_cpu_node_state(void)
2153 {
2154         int node;
2155
2156         for_each_online_node(node) {
2157                 if (!cpumask_empty(cpumask_of_node(node)))
2158                         node_set_state(node, N_CPU);
2159         }
2160 }
2161
2162 static int vmstat_cpu_online(unsigned int cpu)
2163 {
2164         if (vmstat_late_init_done)
2165                 refresh_zone_stat_thresholds();
2166
2167         if (!node_state(cpu_to_node(cpu), N_CPU)) {
2168                 node_set_state(cpu_to_node(cpu), N_CPU);
2169         }
2170         enable_delayed_work(&per_cpu(vmstat_work, cpu));
2171
2172         return 0;
2173 }
2174
2175 static int vmstat_cpu_down_prep(unsigned int cpu)
2176 {
2177         disable_delayed_work_sync(&per_cpu(vmstat_work, cpu));
2178         return 0;
2179 }
2180
2181 static int vmstat_cpu_dead(unsigned int cpu)
2182 {
2183         const struct cpumask *node_cpus;
2184         int node;
2185
2186         node = cpu_to_node(cpu);
2187
2188         refresh_zone_stat_thresholds();
2189         node_cpus = cpumask_of_node(node);
2190         if (!cpumask_empty(node_cpus))
2191                 return 0;
2192
2193         node_clear_state(node, N_CPU);
2194
2195         return 0;
2196 }
2197
2198 static int __init vmstat_late_init(void)
2199 {
2200         refresh_zone_stat_thresholds();
2201         vmstat_late_init_done = 1;
2202
2203         return 0;
2204 }
2205 late_initcall(vmstat_late_init);
2206 #endif
2207
2208 #ifdef CONFIG_PROC_FS
2209 static const struct ctl_table vmstat_table[] = {
2210 #ifdef CONFIG_SMP
2211         {
2212                 .procname       = "stat_interval",
2213                 .data           = &sysctl_stat_interval,
2214                 .maxlen         = sizeof(sysctl_stat_interval),
2215                 .mode           = 0644,
2216                 .proc_handler   = proc_dointvec_jiffies,
2217         },
2218         {
2219                 .procname       = "stat_refresh",
2220                 .data           = NULL,
2221                 .maxlen         = 0,
2222                 .mode           = 0600,
2223                 .proc_handler   = vmstat_refresh,
2224         },
2225 #endif
2226 #ifdef CONFIG_NUMA
2227         {
2228                 .procname       = "numa_stat",
2229                 .data           = &sysctl_vm_numa_stat,
2230                 .maxlen         = sizeof(int),
2231                 .mode           = 0644,
2232                 .proc_handler   = sysctl_vm_numa_stat_handler,
2233                 .extra1         = SYSCTL_ZERO,
2234                 .extra2         = SYSCTL_ONE,
2235         },
2236 #endif
2237 };
2238 #endif
2239
2240 struct workqueue_struct *mm_percpu_wq;
2241
2242 void __init init_mm_internals(void)
2243 {
2244         int ret __maybe_unused;
2245
2246         mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0);
2247
2248 #ifdef CONFIG_SMP
2249         ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
2250                                         NULL, vmstat_cpu_dead);
2251         if (ret < 0)
2252                 pr_err("vmstat: failed to register 'dead' hotplug state\n");
2253
2254         ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online",
2255                                         vmstat_cpu_online,
2256                                         vmstat_cpu_down_prep);
2257         if (ret < 0)
2258                 pr_err("vmstat: failed to register 'online' hotplug state\n");
2259
2260         cpus_read_lock();
2261         init_cpu_node_state();
2262         cpus_read_unlock();
2263
2264         start_shepherd_timer();
2265 #endif
2266 #ifdef CONFIG_PROC_FS
2267         proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
2268         proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
2269         proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
2270         proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
2271         register_sysctl_init("vm", vmstat_table);
2272 #endif
2273 }
2274
2275 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
2276
2277 /*
2278  * Return an index indicating how much of the available free memory is
2279  * unusable for an allocation of the requested size.
2280  */
2281 static int unusable_free_index(unsigned int order,
2282                                 struct contig_page_info *info)
2283 {
2284         /* No free memory is interpreted as all free memory is unusable */
2285         if (info->free_pages == 0)
2286                 return 1000;
2287
2288         /*
2289          * Index should be a value between 0 and 1. Return a value to 3
2290          * decimal places.
2291          *
2292          * 0 => no fragmentation
2293          * 1 => high fragmentation
2294          */
2295         return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
2296
2297 }
2298
2299 static void unusable_show_print(struct seq_file *m,
2300                                         pg_data_t *pgdat, struct zone *zone)
2301 {
2302         unsigned int order;
2303         int index;
2304         struct contig_page_info info;
2305
2306         seq_printf(m, "Node %d, zone %8s ",
2307                                 pgdat->node_id,
2308                                 zone->name);
2309         for (order = 0; order < NR_PAGE_ORDERS; ++order) {
2310                 fill_contig_page_info(zone, order, &info);
2311                 index = unusable_free_index(order, &info);
2312                 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
2313         }
2314
2315         seq_putc(m, '\n');
2316 }
2317
2318 /*
2319  * Display unusable free space index
2320  *
2321  * The unusable free space index measures how much of the available free
2322  * memory cannot be used to satisfy an allocation of a given size and is a
2323  * value between 0 and 1. The higher the value, the more of free memory is
2324  * unusable and by implication, the worse the external fragmentation is. This
2325  * can be expressed as a percentage by multiplying by 100.
2326  */
2327 static int unusable_show(struct seq_file *m, void *arg)
2328 {
2329         pg_data_t *pgdat = (pg_data_t *)arg;
2330
2331         /* check memoryless node */
2332         if (!node_state(pgdat->node_id, N_MEMORY))
2333                 return 0;
2334
2335         walk_zones_in_node(m, pgdat, true, false, unusable_show_print);
2336
2337         return 0;
2338 }
2339
2340 static const struct seq_operations unusable_sops = {
2341         .start  = frag_start,
2342         .next   = frag_next,
2343         .stop   = frag_stop,
2344         .show   = unusable_show,
2345 };
2346
2347 DEFINE_SEQ_ATTRIBUTE(unusable);
2348
2349 static void extfrag_show_print(struct seq_file *m,
2350                                         pg_data_t *pgdat, struct zone *zone)
2351 {
2352         unsigned int order;
2353         int index;
2354
2355         /* Alloc on stack as interrupts are disabled for zone walk */
2356         struct contig_page_info info;
2357
2358         seq_printf(m, "Node %d, zone %8s ",
2359                                 pgdat->node_id,
2360                                 zone->name);
2361         for (order = 0; order < NR_PAGE_ORDERS; ++order) {
2362                 fill_contig_page_info(zone, order, &info);
2363                 index = __fragmentation_index(order, &info);
2364                 seq_printf(m, "%2d.%03d ", index / 1000, index % 1000);
2365         }
2366
2367         seq_putc(m, '\n');
2368 }
2369
2370 /*
2371  * Display fragmentation index for orders that allocations would fail for
2372  */
2373 static int extfrag_show(struct seq_file *m, void *arg)
2374 {
2375         pg_data_t *pgdat = (pg_data_t *)arg;
2376
2377         walk_zones_in_node(m, pgdat, true, false, extfrag_show_print);
2378
2379         return 0;
2380 }
2381
2382 static const struct seq_operations extfrag_sops = {
2383         .start  = frag_start,
2384         .next   = frag_next,
2385         .stop   = frag_stop,
2386         .show   = extfrag_show,
2387 };
2388
2389 DEFINE_SEQ_ATTRIBUTE(extfrag);
2390
2391 static int __init extfrag_debug_init(void)
2392 {
2393         struct dentry *extfrag_debug_root;
2394
2395         extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
2396
2397         debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL,
2398                             &unusable_fops);
2399
2400         debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL,
2401                             &extfrag_fops);
2402
2403         return 0;
2404 }
2405
2406 module_init(extfrag_debug_init);
2407
2408 #endif