Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 5e43004828971e778a185ec69f2ef9710f085fb4..77e42ef388c2a832169d4bcace8a505c45d95516 100644
@@ -569,50 +569,19 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
 #endif
 
 #ifdef CONFIG_NUMA
-/*
- * zonelist = the list of zones passed to the allocator
- * z       = the zone from which the allocation occurred.
- *
- * Must be called with interrupts disabled.
- *
- * When __GFP_OTHER_NODE is set assume the node of the preferred
- * zone is the local node. This is useful for daemons who allocate
- * memory on behalf of other processes.
- */
-void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
-{
-       if (z->zone_pgdat == preferred_zone->zone_pgdat) {
-               __inc_zone_state(z, NUMA_HIT);
-       } else {
-               __inc_zone_state(z, NUMA_MISS);
-               __inc_zone_state(preferred_zone, NUMA_FOREIGN);
-       }
-       if (z->node == ((flags & __GFP_OTHER_NODE) ?
-                       preferred_zone->node : numa_node_id()))
-               __inc_zone_state(z, NUMA_LOCAL);
-       else
-               __inc_zone_state(z, NUMA_OTHER);
-}
-
 /*
  * Determine the per node value of a stat item.
  */
 unsigned long node_page_state(int node, enum zone_stat_item item)
 {
        struct zone *zones = NODE_DATA(node)->node_zones;
+       int i;
+       unsigned long count = 0;
 
-       return
-#ifdef CONFIG_ZONE_DMA
-               zone_page_state(&zones[ZONE_DMA], item) +
-#endif
-#ifdef CONFIG_ZONE_DMA32
-               zone_page_state(&zones[ZONE_DMA32], item) +
-#endif
-#ifdef CONFIG_HIGHMEM
-               zone_page_state(&zones[ZONE_HIGHMEM], item) +
-#endif
-               zone_page_state(&zones[ZONE_NORMAL], item) +
-               zone_page_state(&zones[ZONE_MOVABLE], item);
+       for (i = 0; i < MAX_NR_ZONES; i++)
+               count += zone_page_state(zones + i, item);
+
+       return count;
 }
 
 #endif
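
With the per-zone #ifdef chain removed, node_page_state() simply sums the item over every zone of the node; unconfigured zones contribute zero. A hypothetical caller, sketched here for illustration only (the helper name is made up; NR_FILE_PAGES is a zone_stat_item at this point in the tree), could read a node's page-cache count like so:

	/* Sketch, not part of this patch: sum NR_FILE_PAGES across node nid. */
	static unsigned long example_node_file_pages(int nid)
	{
		return node_page_state(nid, NR_FILE_PAGES);
	}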
@@ -1010,6 +979,9 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
                if (!memmap_valid_within(pfn, page, zone))
                        continue;
 
+               if (page_zone(page) != zone)
+                       continue;
+
                mtype = get_pageblock_migratetype(page);
 
                if (mtype < MIGRATE_TYPES)
@@ -1069,13 +1041,17 @@ static void pagetypeinfo_showmixedcount_print(struct seq_file *m,
                block_end_pfn = min(block_end_pfn, end_pfn);
 
                page = pfn_to_page(pfn);
-               pageblock_mt = get_pfnblock_migratetype(page, pfn);
+               pageblock_mt = get_pageblock_migratetype(page);
 
                for (; pfn < block_end_pfn; pfn++) {
                        if (!pfn_valid_within(pfn))
                                continue;
 
                        page = pfn_to_page(pfn);
+
+                       if (page_zone(page) != zone)
+                               continue;
+
                        if (PageBuddy(page)) {
                                pfn += (1UL << page_order(page)) - 1;
                                continue;
@@ -1376,7 +1352,66 @@ static const struct file_operations proc_vmstat_file_operations = {
 static struct workqueue_struct *vmstat_wq;
 static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
 int sysctl_stat_interval __read_mostly = HZ;
-static cpumask_var_t cpu_stat_off;
+
+#ifdef CONFIG_PROC_FS
+static void refresh_vm_stats(struct work_struct *work)
+{
+       refresh_cpu_vm_stats(true);
+}
+
+int vmstat_refresh(struct ctl_table *table, int write,
+                  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+       long val;
+       int err;
+       int i;
+
+       /*
+        * The regular update, every sysctl_stat_interval, may come later
+        * than expected: leaving a significant amount in per_cpu buckets.
+        * This is particularly misleading when checking a quantity of HUGE
+        * pages, immediately after running a test.  /proc/sys/vm/stat_refresh,
+        * which can equally be echo'ed to or cat'ted from (by root),
+        * can be used to update the stats just before reading them.
+        *
+        * Oh, and since global_page_state() etc. are so careful to hide
+        * transiently negative values, report an error here if any of
+        * the stats is negative, so we know to go looking for imbalance.
+        */
+       err = schedule_on_each_cpu(refresh_vm_stats);
+       if (err)
+               return err;
+       for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+               val = atomic_long_read(&vm_stat[i]);
+               if (val < 0) {
+                       switch (i) {
+                       case NR_ALLOC_BATCH:
+                       case NR_PAGES_SCANNED:
+                               /*
+                                * These are often seen to go negative in
+                                * recent kernels, but not to go permanently
+                                * negative.  Whilst it would be nicer not to
+                                * have exceptions, rooting them out would be
+                                * another task, of rather low priority.
+                                */
+                               break;
+                       default:
+                               pr_warn("%s: %s %ld\n",
+                                       __func__, vmstat_text[i], val);
+                               err = -EINVAL;
+                               break;
+                       }
+               }
+       }
+       if (err)
+               return err;
+       if (write)
+               *ppos += *lenp;
+       else
+               *lenp = 0;
+       return 0;
+}
+#endif /* CONFIG_PROC_FS */
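
As the comment in vmstat_refresh() notes, /proc/sys/vm/stat_refresh can be written to or read from (by root) to fold the per-cpu diffs on demand. A minimal userspace sketch, not part of this patch, that forces a refresh before dumping /proc/vmstat:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/* Any write (or read) of stat_refresh folds the per-cpu deltas. */
		int fd = open("/proc/sys/vm/stat_refresh", O_WRONLY);
		if (fd >= 0) {
			(void)write(fd, "1\n", 2);
			close(fd);
		}

		/* The counters reported by /proc/vmstat are now fully folded. */
		FILE *f = fopen("/proc/vmstat", "r");
		if (f) {
			char line[256];
			while (fgets(line, sizeof(line), f))
				fputs(line, stdout);
			fclose(f);
		}
		return 0;
	}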
 
 static void vmstat_update(struct work_struct *w)
 {
@@ -1385,24 +1420,10 @@ static void vmstat_update(struct work_struct *w)
                 * Counters were updated so we expect more updates
                 * to occur in the future. Keep on running the
                 * update worker thread.
-                * If we were marked on cpu_stat_off clear the flag
-                * so that vmstat_shepherd doesn't schedule us again.
                 */
-               if (!cpumask_test_and_clear_cpu(smp_processor_id(),
-                                               cpu_stat_off)) {
-                       queue_delayed_work_on(smp_processor_id(), vmstat_wq,
+               queue_delayed_work_on(smp_processor_id(), vmstat_wq,
                                this_cpu_ptr(&vmstat_work),
                                round_jiffies_relative(sysctl_stat_interval));
-               }
-       } else {
-               /*
-                * We did not update any counters so the app may be in
-                * a mode where it does not cause counter updates.
-                * We may be uselessly running vmstat_update.
-                * Defer the checking for differentials to the
-                * shepherd thread on a different processor.
-                */
-               cpumask_set_cpu(smp_processor_id(), cpu_stat_off);
        }
 }
 
@@ -1434,16 +1455,17 @@ static bool need_update(int cpu)
        return false;
 }
 
+/*
+ * Switch off vmstat processing and then fold all the remaining differentials
+ * until the diffs stay at zero. The function is used by NOHZ and can only be
+ * invoked when tick processing is not active.
+ */
 void quiet_vmstat(void)
 {
        if (system_state != SYSTEM_RUNNING)
                return;
 
-       /*
-        * If we are already in hands of the shepherd then there
-        * is nothing for us to do here.
-        */
-       if (cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
+       if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
                return;
 
        if (!need_update(smp_processor_id()))
@@ -1458,7 +1480,6 @@ void quiet_vmstat(void)
        refresh_cpu_vm_stats(false);
 }
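
For orientation, not from this patch: quiet_vmstat() is intended for the NOHZ path when a CPU stops its tick, so remaining differentials get folded without keeping the vmstat worker alive. A hedged sketch of such a caller follows; the function below is hypothetical, not the real tick-sched code:

	static void example_enter_nohz_idle(void)
	{
		/*
		 * Fold this CPU's pending vmstat differentials before the
		 * periodic tick is stopped, so no vmstat_update run is
		 * needed while the CPU stays idle.
		 */
		quiet_vmstat();

		/* ... stop the tick, enter the idle loop ... */
	}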
 
-
 /*
  * Shepherd worker thread that checks the
  * differentials of processors that have their worker
@@ -1475,20 +1496,11 @@ static void vmstat_shepherd(struct work_struct *w)
 
        get_online_cpus();
        /* Check processors whose vmstat worker threads have been disabled */
-       for_each_cpu(cpu, cpu_stat_off) {
+       for_each_online_cpu(cpu) {
                struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
 
-               if (need_update(cpu)) {
-                       if (cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
-                               queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
-               } else {
-                       /*
-                        * Cancel the work if quiet_vmstat has put this
-                        * cpu on cpu_stat_off because the work item might
-                        * be still scheduled
-                        */
-                       cancel_delayed_work(dw);
-               }
+               if (!delayed_work_pending(dw) && need_update(cpu))
+                       queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
        }
        put_online_cpus();
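
With cpu_stat_off gone, the shepherd's rule reduces to: for each online CPU, kick its vmstat worker only if nothing is already queued and need_update() sees stale per-cpu diffs. A small sketch of that rule as a standalone helper (the wrapper name is invented; the other identifiers are the ones used above):

	static void example_kick_vmstat_worker(int cpu)
	{
		struct delayed_work *dw = &per_cpu(vmstat_work, cpu);

		/* Queue immediately (delay 0) only when there is real work. */
		if (!delayed_work_pending(dw) && need_update(cpu))
			queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
	}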
 
@@ -1504,10 +1516,6 @@ static void __init start_shepherd_timer(void)
                INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
                        vmstat_update);
 
-       if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL))
-               BUG();
-       cpumask_copy(cpu_stat_off, cpu_online_mask);
-
        vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
        schedule_delayed_work(&shepherd,
                round_jiffies_relative(sysctl_stat_interval));
@@ -1542,16 +1550,13 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb,
        case CPU_ONLINE_FROZEN:
                refresh_zone_stat_thresholds();
                node_set_state(cpu_to_node(cpu), N_CPU);
-               cpumask_set_cpu(cpu, cpu_stat_off);
                break;
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
                cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
-               cpumask_clear_cpu(cpu, cpu_stat_off);
                break;
        case CPU_DOWN_FAILED:
        case CPU_DOWN_FAILED_FROZEN:
-               cpumask_set_cpu(cpu, cpu_stat_off);
                break;
        case CPU_DEAD:
        case CPU_DEAD_FROZEN: