mem-hotplug: fix potential race while building zonelist for new populated zone

author Haicheng Li <haicheng.li@linux.intel.com>

Mon, 24 May 2010 21:32:52 +0000 (14:32 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 25 May 2010 15:07:02 +0000 (08:07 -0700)
author Haicheng Li <haicheng.li@linux.intel.com>
Mon, 24 May 2010 21:32:52 +0000 (14:32 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 25 May 2010 15:07:02 +0000 (08:07 -0700)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h

index a367ed5bb3fecdc0f1853c77f40f07c3f8c9c0ec..0fa491326c4a165eceeef43f24c0f30ac3834832 100644 (file)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -650,6 +650,7 @@ typedef struct pglist_data {
  
  #include <linux/memory_hotplug.h>
  
+extern struct mutex zonelists_mutex;
  void get_zone_counts(unsigned long *active, unsigned long *inactive,
                         unsigned long *free);
  void build_all_zonelists(void *data);
diff --git a/kernel/cpu.c b/kernel/cpu.c

index 3e8b3ba27175f4c13ed217aeaf3771a127ad6553..124ad9d6be1644864dda2a9eb55eea94a29c075c 100644 (file)
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -357,8 +357,11 @@ int __cpuinit cpu_up(unsigned int cpu)
                 return -ENOMEM;
         }
  
-       if (pgdat->node_zonelists->_zonerefs->zone == NULL)
+       if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
+               mutex_lock(&zonelists_mutex);
                 build_all_zonelists(NULL);
+               mutex_unlock(&zonelists_mutex);
+       }
  #endif
  
         cpu_maps_update_begin();
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c

index 089cc97aed3c5506e22487e158106502c6dde666..a4cfcdc00455de4be15fcec98c76e45f8de5feab 100644 (file)
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -389,11 +389,6 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
         int nid;
         int ret;
         struct memory_notify arg;
-       /*
-        * mutex to protect zone->pageset when it's still shared
-        * in onlined_pages()
-        */
-       static DEFINE_MUTEX(zone_pageset_mutex);
  
         arg.start_pfn = pfn;
         arg.nr_pages = nr_pages;
@@ -420,14 +415,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
          * This means the page allocator ignores this zone.
          * So, zonelist must be updated after online.
          */
-       mutex_lock(&zone_pageset_mutex);
+       mutex_lock(&zonelists_mutex);
         if (!populated_zone(zone))
                 need_zonelists_rebuild = 1;
  
         ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
                 online_pages_range);
         if (ret) {
-               mutex_unlock(&zone_pageset_mutex);
+               mutex_unlock(&zonelists_mutex);
                 printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
                         nr_pages, pfn);
                 memory_notify(MEM_CANCEL_ONLINE, &arg);
@@ -441,7 +436,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
         else
                 zone_pcp_update(zone);
  
-       mutex_unlock(&zone_pageset_mutex);
+       mutex_unlock(&zonelists_mutex);
         setup_per_zone_wmarks();
         calculate_zone_inactive_ratio(zone);
         if (onlined_pages) {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 21c52d2d8624a368c388e15e6df338441f6ba5ca..08b349931ebc6bffef8be14c1d5d890750a88038 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2571,8 +2571,11 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
                         strncpy((char*)table->data, saved_string,
                                 NUMA_ZONELIST_ORDER_LEN);
                         user_zonelist_order = oldval;
-               } else if (oldval != user_zonelist_order)
+               } else if (oldval != user_zonelist_order) {
+                       mutex_lock(&zonelists_mutex);
                         build_all_zonelists(NULL);
+                       mutex_unlock(&zonelists_mutex);
+               }
         }
  out:
         mutex_unlock(&zl_order_mutex);
@@ -2924,6 +2927,12 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
  static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
  static void setup_zone_pageset(struct zone *zone);
  
+/*
+ * Global mutex to protect against size modification of zonelists
+ * as well as to serialize pageset setup for a newly populated zone.
+ */
+DEFINE_MUTEX(zonelists_mutex);
+
  /* return values int ....just for stop_machine() */
  static __init_refok int __build_all_zonelists(void *data)
  {
@@ -2967,6 +2976,10 @@ static __init_refok int __build_all_zonelists(void *data)
         return 0;
  }
  
+/*
+ * Always called with zonelists_mutex held,
+ * unless system_state == SYSTEM_BOOTING.
+ */
  void build_all_zonelists(void *data)
  {
         set_zonelist_order();
author	Haicheng Li <haicheng.li@linux.intel.com>
	Mon, 24 May 2010 21:32:52 +0000 (14:32 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 25 May 2010 15:07:02 +0000 (08:07 -0700)
include/linux/mmzone.h		patch \| blob \| blame \| history
kernel/cpu.c		patch \| blob \| blame \| history
mm/memory_hotplug.c		patch \| blob \| blame \| history
mm/page_alloc.c		patch \| blob \| blame \| history