Implement MCE recovery for the data load error path, and assorted cleanups.

In summary: record "action required" errors found by do_machine_check()
and recover them in mce_notify_process() on the way back to userland;
add mce_register_decode_chain()/mce_unregister_decode_chain() wrappers
around the decoder notifier, draining boot-time mcelog records on
registration; convert the machinecheck and memory sysdev classes to
regular device subsystems; fold __memory_failure() into
memory_failure(), which gains an MF_ACTION_REQUIRED flag.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/ctype.h>
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;
-/*
- * CPU/chipset specific EDAC code can register a notifier call here to print
- * MCE errors in a human-readable form.
- */
-ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
-EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
-
/* MCA banks polled by the period polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
static DEFINE_PER_CPU(struct work_struct, mce_work);
+/*
+ * CPU/chipset specific EDAC code can register a notifier call here to print
+ * MCE errors in a human-readable form.
+ */
+ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
+
/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
m->time = get_seconds();
m->cpuvendor = boot_cpu_data.x86_vendor;
m->cpuid = cpuid_eax(1);
-#ifdef CONFIG_SMP
m->socketid = cpu_data(m->extcpu).phys_proc_id;
-#endif
m->apicid = cpu_data(m->extcpu).initial_apicid;
rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}
set_bit(0, &mce_need_notify);
}
+static void drain_mcelog_buffer(void)
+{
+ unsigned int next, i, prev = 0;
+
+ next = rcu_dereference_check_mce(mcelog.next);
+
+ do {
+ struct mce *m;
+
+ /* drain what was logged during boot */
+ for (i = prev; i < next; i++) {
+ unsigned long start = jiffies;
+ unsigned retries = 1;
+
+ m = &mcelog.entry[i];
+
+ while (!m->finished) {
+ if (time_after_eq(jiffies, start + 2*retries))
+ retries++;
+
+ cpu_relax();
+
+ if (!m->finished && retries >= 4) {
+ pr_err("MCE: skipping error being logged currently!\n");
+ break;
+ }
+ }
+ smp_rmb();
+ atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
+ }
+
+ memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
+ prev = next;
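+ /*
+ * Try to reset mcelog.next to 0; if new records were appended while we
+ * were draining, cmpxchg() returns the new 'next' and we loop again to
+ * drain those as well.
+ */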
+ next = cmpxchg(&mcelog.next, prev, 0);
+ } while (next != prev);
+}
+
+void mce_register_decode_chain(struct notifier_block *nb)
+{
+ atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
+ drain_mcelog_buffer();
+}
+EXPORT_SYMBOL_GPL(mce_register_decode_chain);
+
+void mce_unregister_decode_chain(struct notifier_block *nb)
+{
+ atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
+
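For context, a minimal sketch of how a decoder module would consume this
new API (module and function names are hypothetical, not part of the
patch). Note that mce_register_decode_chain() also replays, via
drain_mcelog_buffer() above, any records logged before the decoder was
loaded, so early boot errors are not lost:

	/* Hypothetical decoder module -- illustration only. */
	static int my_decode_mce(struct notifier_block *nb, unsigned long val,
				 void *data)
	{
		struct mce *m = data;

		pr_info("decoded MCE on CPU%u: bank %d status 0x%llx\n",
			m->extcpu, m->bank, m->status);
		return NOTIFY_STOP;	/* consumed; NOTIFY_DONE passes it on */
	}

	static struct notifier_block my_mce_dec_nb = {
		.notifier_call	= my_decode_mce,
	};

	/* from module init/exit: */
	mce_register_decode_chain(&my_mce_dec_nb);
	...
	mce_unregister_decode_chain(&my_mce_dec_nb);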
static void print_mce(struct mce *m)
{
int ret = 0;
irq_work_queue(&__get_cpu_var(mce_irq_work));
}
+ /*
+ * Read ADDR and MISC registers.
+ */
+ static void mce_read_aux(struct mce *m, int i)
+ {
+ if (m->status & MCI_STATUS_MISCV)
+ m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
+ if (m->status & MCI_STATUS_ADDRV) {
+ m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+
+ /*
+ * Mask the reported address by the reported granularity.
+ */
+ if (mce_ser && (m->status & MCI_STATUS_MISCV)) {
+ u8 shift = MCI_MISC_ADDR_LSB(m->misc);
+ m->addr >>= shift;
+ m->addr <<= shift;
+ }
+ }
+ }
+
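A quick worked example of the granularity masking above (values made up):
if MCi_MISC reports a least-significant-valid-bit of 12, the recorded
address is rounded down to its 4K page:

	u64 addr  = 0x123456789abcULL;	/* as reported by MCi_ADDR */
	u8  shift = 12;			/* MCI_MISC_ADDR_LSB(m->misc) */

	addr >>= shift;
	addr <<= shift;			/* 0x123456789000: 4K-aligned */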
DEFINE_PER_CPU(unsigned, mce_poll_count);
/*
(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
continue;
- if (m.status & MCI_STATUS_MISCV)
- m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
- if (m.status & MCI_STATUS_ADDRV)
- m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+ mce_read_aux(&m, i);
if (!(flags & MCP_TIMESTAMP))
m.tsc = 0;
}
}
+ /*
+ * Need to save faulting physical address associated with a process
+ * in the machine check handler some place where we can grab it back
+ * later in mce_notify_process()
+ */
+ #define MCE_INFO_MAX 16
+
+ struct mce_info {
+ atomic_t inuse;
+ struct task_struct *t;
+ __u64 paddr;
+ } mce_info[MCE_INFO_MAX];
+
+ static void mce_save_info(__u64 addr)
+ {
+ struct mce_info *mi;
+
+ for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
+ if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
+ mi->t = current;
+ mi->paddr = addr;
+ return;
+ }
+ }
+
+ mce_panic("Too many concurrent recoverable errors", NULL, NULL);
+ }
+
+ static struct mce_info *mce_find_info(void)
+ {
+ struct mce_info *mi;
+
+ for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
+ if (atomic_read(&mi->inuse) && mi->t == current)
+ return mi;
+ return NULL;
+ }
+
+ static void mce_clear_info(struct mce_info *mi)
+ {
+ atomic_set(&mi->inuse, 0);
+ }
+
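Condensed, the lifecycle of an mce_info entry (all pieces appear
elsewhere in this patch): do_machine_check() stashes the faulting address
from MCE context, and mce_notify_process() consumes it in process context
on the way back to userland:

	/* MCE (NMI-like) context, in do_machine_check(): */
	mce_save_info(m.addr);
	set_thread_flag(TIF_MCE_NOTIFY);

	/* process context, in mce_notify_process(): */
	mi = mce_find_info();
	memory_failure(mi->paddr >> PAGE_SHIFT, MCE_VECTOR, MF_ACTION_REQUIRED);
	mce_clear_info(mi);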
/*
* The actual machine check handler. This only handles real
* exceptions when something got corrupted coming in through int 18.
barrier();
/*
- * When no restart IP must always kill or panic.
+ * When there is no restart IP we might need to kill or panic.
+ * Assume the worst for now, but if we find the
+ * severity is MCE_AR_SEVERITY we have other options.
*/
if (!(m.mcgstatus & MCG_STATUS_RIPV))
kill_it = 1;
continue;
}
- /*
- * Kill on action required.
- */
- if (severity == MCE_AR_SEVERITY)
- kill_it = 1;
-
- if (m.status & MCI_STATUS_MISCV)
- m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
- if (m.status & MCI_STATUS_ADDRV)
- m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+ mce_read_aux(&m, i);
/*
* Action optional error. Queue address for later processing.
}
}
+ /* mce_clear_state will clear *final, save locally for use later */
+ m = *final;
+
if (!no_way_out)
mce_clear_state(toclear);
no_way_out = worst >= MCE_PANIC_SEVERITY;
/*
- * If we have decided that we just CAN'T continue, and the user
- * has not set tolerant to an insane level, give up and die.
- *
- * This is mainly used in the case when the system doesn't
- * support MCE broadcasting or it has been disabled.
+ * At insane "tolerant" levels we take no action. Otherwise
+ * we only die if we have no other choice. For less serious
+ * issues we try to recover, or limit damage to the current
+ * process.
*/
- if (no_way_out && tolerant < 3)
- mce_panic("Fatal machine check on current CPU", final, msg);
-
- /*
- * If the error seems to be unrecoverable, something should be
- * done. Try to kill as little as possible. If we can kill just
- * one task, do that. If the user has set the tolerance very
- * high, don't try to do anything at all.
- */
-
- if (kill_it && tolerant < 3)
- force_sig(SIGBUS, current);
-
- /* notify userspace ASAP */
- set_thread_flag(TIF_MCE_NOTIFY);
+ if (tolerant < 3) {
+ if (no_way_out)
+ mce_panic("Fatal machine check on current CPU", &m, msg);
+ if (worst == MCE_AR_SEVERITY) {
+ /* schedule action before return to userland */
+ mce_save_info(m.addr);
+ set_thread_flag(TIF_MCE_NOTIFY);
+ } else if (kill_it) {
+ force_sig(SIGBUS, current);
+ }
+ }
if (worst > 0)
mce_report_event(regs);
}
EXPORT_SYMBOL_GPL(do_machine_check);
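For reference, the grading behind the tolerant < 3 checks above follows
the documented sysfs semantics (Documentation/x86/x86_64/machinecheck):

	/*
	 * tolerant levels:
	 *   0: always panic on uncorrected errors, log corrected errors
	 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
	 *   2: SIGBUS or log uncorrected errors, log corrected errors
	 *   3: never panic or SIGBUS, log all errors (for testing only)
	 */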
- /* dummy to break dependency. actual code is in mm/memory-failure.c */
- void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
+ #ifndef CONFIG_MEMORY_FAILURE
+ int memory_failure(unsigned long pfn, int vector, int flags)
{
- printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
+ /* mce_severity() should not hand us an ACTION_REQUIRED error */
+ BUG_ON(flags & MF_ACTION_REQUIRED);
+ printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n"
+ "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn);
+
+ return 0;
}
+ #endif
/*
- * Called after mce notification in process context. This code
- * is allowed to sleep. Call the high level VM handler to process
- * any corrupted pages.
- * Assume that the work queue code only calls this one at a time
- * per CPU.
- * Note we don't disable preemption, so this code might run on the wrong
- * CPU. In this case the event is picked up by the scheduled work queue.
- * This is merely a fast path to expedite processing in some common
- * cases.
+ * Called in the process context that was interrupted by an MCE and marked
+ * with TIF_MCE_NOTIFY, just before returning to erroneous userland.
+ * This code is allowed to sleep.
+ * Attempt possible recovery such as calling the high level VM handler to
+ * process any corrupted pages, and kill/signal current process if required.
+ * Action required errors are handled here.
*/
void mce_notify_process(void)
{
unsigned long pfn;
- mce_notify_irq();
- while (mce_ring_get(&pfn))
- memory_failure(pfn, MCE_VECTOR);
+ struct mce_info *mi = mce_find_info();
+
+ if (!mi)
+ mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
+ pfn = mi->paddr >> PAGE_SHIFT;
+
+ clear_thread_flag(TIF_MCE_NOTIFY);
+
+ pr_err("Uncorrected hardware memory error in user-access at %llx",
+ mi->paddr);
+ if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) {
+ pr_err("Memory error not recovered");
+ force_sig(SIGBUS, current);
+ }
+ mce_clear_info(mi);
}
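The TIF_MCE_NOTIFY flag set in do_machine_check() is consumed on the
return-to-user path. A sketch of the consumer side, assuming the usual
hook in do_notify_resume() (arch/x86/kernel/signal.c):

	#ifdef CONFIG_X86_MCE
		/* notify userspace of pending MCEs */
		if (thread_info_flags & _TIF_MCE_NOTIFY)
			mce_notify_process();
	#endif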
+ /*
+ * Action optional processing happens here (picking up
+ * from the list of faulting pages that do_machine_check()
+ * placed into the "ring").
+ */
static void mce_process_work(struct work_struct *dummy)
{
- mce_notify_process();
+ unsigned long pfn;
+
+ while (mce_ring_get(&pfn))
+ memory_failure(pfn, MCE_VECTOR, 0);
}
#ifdef CONFIG_X86_MCE_INTEL
/* Not more than two messages every minute */
static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
- clear_thread_flag(TIF_MCE_NOTIFY);
-
if (test_and_clear_bit(0, &mce_need_notify)) {
/* wake processes polling /dev/mcelog */
wake_up_interruptible(&mce_chrdev_wait);
};
/*
- * mce_sysdev: Sysfs support
+ * mce_device: Sysfs support
*/
static void mce_cpu_restart(void *data)
__mcheck_cpu_init_timer();
}
-static struct sysdev_class mce_sysdev_class = {
+static struct bus_type mce_subsys = {
.name = "machinecheck",
+ .dev_name = "machinecheck",
};
-DEFINE_PER_CPU(struct sys_device, mce_sysdev);
+struct device *mce_device[CONFIG_NR_CPUS];
__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
-static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
+static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
{
return container_of(attr, struct mce_bank, attr);
}
-static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
+static ssize_t show_bank(struct device *s, struct device_attribute *attr,
char *buf)
{
return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
}
-static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
+static ssize_t set_bank(struct device *s, struct device_attribute *attr,
const char *buf, size_t size)
{
u64 new;
}
static ssize_t
-show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
+show_trigger(struct device *s, struct device_attribute *attr, char *buf)
{
strcpy(buf, mce_helper);
strcat(buf, "\n");
return strlen(mce_helper) + 1;
}
-static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
+static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
const char *buf, size_t siz)
{
char *p;
return strlen(mce_helper) + !!p;
}
-static ssize_t set_ignore_ce(struct sys_device *s,
- struct sysdev_attribute *attr,
+static ssize_t set_ignore_ce(struct device *s,
+ struct device_attribute *attr,
const char *buf, size_t size)
{
u64 new;
return size;
}
-static ssize_t set_cmci_disabled(struct sys_device *s,
- struct sysdev_attribute *attr,
+static ssize_t set_cmci_disabled(struct device *s,
+ struct device_attribute *attr,
const char *buf, size_t size)
{
u64 new;
return size;
}
-static ssize_t store_int_with_restart(struct sys_device *s,
- struct sysdev_attribute *attr,
+static ssize_t store_int_with_restart(struct device *s,
+ struct device_attribute *attr,
const char *buf, size_t size)
{
- ssize_t ret = sysdev_store_int(s, attr, buf, size);
+ ssize_t ret = device_store_int(s, attr, buf, size);
mce_restart();
return ret;
}
-static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
-static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
-static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
-static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
+static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
+static DEVICE_INT_ATTR(tolerant, 0644, tolerant);
+static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
+static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
-static struct sysdev_ext_attribute attr_check_interval = {
- _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
- store_int_with_restart),
+static struct dev_ext_attribute dev_attr_check_interval = {
+ __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
&check_interval
};
-static struct sysdev_ext_attribute attr_ignore_ce = {
- _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
+static struct dev_ext_attribute dev_attr_ignore_ce = {
+ __ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce),
&mce_ignore_ce
};
-static struct sysdev_ext_attribute attr_cmci_disabled = {
- _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
+static struct dev_ext_attribute dev_attr_cmci_disabled = {
+ __ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled),
&mce_cmci_disabled
};
-static struct sysdev_attribute *mce_sysdev_attrs[] = {
- &attr_tolerant.attr,
- &attr_check_interval.attr,
- &attr_trigger,
- &attr_monarch_timeout.attr,
- &attr_dont_log_ce.attr,
- &attr_ignore_ce.attr,
- &attr_cmci_disabled.attr,
+static struct device_attribute *mce_device_attrs[] = {
+ &dev_attr_tolerant.attr,
+ &dev_attr_check_interval.attr,
+ &dev_attr_trigger,
+ &dev_attr_monarch_timeout.attr,
+ &dev_attr_dont_log_ce.attr,
+ &dev_attr_ignore_ce.attr,
+ &dev_attr_cmci_disabled.attr,
NULL
};
-static cpumask_var_t mce_sysdev_initialized;
+static cpumask_var_t mce_device_initialized;
+
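+/*
+ * struct device is refcounted: device_unregister() only drops a reference,
+ * so the memory backing a dynamically allocated device must be freed from
+ * its ->release() callback rather than by the caller.
+ */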
+static void mce_device_release(struct device *dev)
+{
+ kfree(dev);
+}
-/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
-static __cpuinit int mce_sysdev_create(unsigned int cpu)
+/* Per cpu device init. All of the cpus still share the same ctrl bank: */
+static __cpuinit int mce_device_create(unsigned int cpu)
{
- struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
+ struct device *dev;
int err;
int i, j;
if (!mce_available(&boot_cpu_data))
return -EIO;
- memset(&sysdev->kobj, 0, sizeof(struct kobject));
- sysdev->id = cpu;
- sysdev->cls = &mce_sysdev_class;
+ dev = kzalloc(sizeof *dev, GFP_KERNEL);
+ if (!dev)
+ return -ENOMEM;
+ dev->id = cpu;
+ dev->bus = &mce_subsys;
+ dev->release = &mce_device_release;
- err = sysdev_register(sysdev);
+ err = device_register(dev);
if (err)
return err;
- for (i = 0; mce_sysdev_attrs[i]; i++) {
- err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]);
+ for (i = 0; mce_device_attrs[i]; i++) {
+ err = device_create_file(dev, mce_device_attrs[i]);
if (err)
goto error;
}
for (j = 0; j < banks; j++) {
- err = sysdev_create_file(sysdev, &mce_banks[j].attr);
+ err = device_create_file(dev, &mce_banks[j].attr);
if (err)
goto error2;
}
- cpumask_set_cpu(cpu, mce_sysdev_initialized);
+ cpumask_set_cpu(cpu, mce_device_initialized);
+ mce_device[cpu] = dev;
return 0;
error2:
while (--j >= 0)
- sysdev_remove_file(sysdev, &mce_banks[j].attr);
+ device_remove_file(dev, &mce_banks[j].attr);
error:
while (--i >= 0)
- sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
+ device_remove_file(dev, mce_device_attrs[i]);
- sysdev_unregister(sysdev);
+ device_unregister(dev);
return err;
}
-static __cpuinit void mce_sysdev_remove(unsigned int cpu)
+static __cpuinit void mce_device_remove(unsigned int cpu)
{
- struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
+ struct device *dev = mce_device[cpu];
int i;
- if (!cpumask_test_cpu(cpu, mce_sysdev_initialized))
+ if (!cpumask_test_cpu(cpu, mce_device_initialized))
return;
- for (i = 0; mce_sysdev_attrs[i]; i++)
- sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
+ for (i = 0; mce_device_attrs[i]; i++)
+ device_remove_file(dev, mce_device_attrs[i]);
for (i = 0; i < banks; i++)
- sysdev_remove_file(sysdev, &mce_banks[i].attr);
+ device_remove_file(dev, &mce_banks[i].attr);
- sysdev_unregister(sysdev);
- cpumask_clear_cpu(cpu, mce_sysdev_initialized);
+ device_unregister(dev);
+ cpumask_clear_cpu(cpu, mce_device_initialized);
+ mce_device[cpu] = NULL;
}
/* Make sure there are no machine checks on offlined CPUs. */
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
- mce_sysdev_create(cpu);
+ mce_device_create(cpu);
if (threshold_cpu_callback)
threshold_cpu_callback(action, cpu);
break;
case CPU_DEAD_FROZEN:
if (threshold_cpu_callback)
threshold_cpu_callback(action, cpu);
- mce_sysdev_remove(cpu);
+ mce_device_remove(cpu);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
for (i = 0; i < banks; i++) {
struct mce_bank *b = &mce_banks[i];
- struct sysdev_attribute *a = &b->attr;
+ struct device_attribute *a = &b->attr;
sysfs_attr_init(&a->attr);
a->attr.name = b->attrname;
if (!mce_available(&boot_cpu_data))
return -EIO;
- zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL);
+ zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
mce_init_banks();
- err = sysdev_class_register(&mce_sysdev_class);
+ err = subsys_system_register(&mce_subsys, NULL);
if (err)
return err;
for_each_online_cpu(i) {
- err = mce_sysdev_create(i);
+ err = mce_device_create(i);
if (err)
return err;
}
/*
- * drivers/base/memory.c - basic Memory class support
+ * Memory subsystem support
*
* Written by Matt Tolentino <matthew.e.tolentino@intel.com>
* Dave Hansen <haveblue@us.ibm.com>
* SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
*/
-#include <linux/sysdev.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
return section_nr / sections_per_block;
}
-static struct sysdev_class memory_sysdev_class = {
+static struct bus_type memory_subsys = {
.name = MEMORY_CLASS_NAME,
-};
-
-static const char *memory_uevent_name(struct kset *kset, struct kobject *kobj)
-{
- return MEMORY_CLASS_NAME;
-}
-
-static int memory_uevent(struct kset *kset, struct kobject *obj,
- struct kobj_uevent_env *env)
-{
- int retval = 0;
-
- return retval;
-}
-
-static const struct kset_uevent_ops memory_uevent_ops = {
- .name = memory_uevent_name,
- .uevent = memory_uevent,
+ .dev_name = MEMORY_CLASS_NAME,
};
static BLOCKING_NOTIFIER_HEAD(memory_chain);
{
int error;
- memory->sysdev.cls = &memory_sysdev_class;
- memory->sysdev.id = memory->start_section_nr / sections_per_block;
+ memory->dev.bus = &memory_subsys;
+ memory->dev.id = memory->start_section_nr / sections_per_block;
- error = sysdev_register(&memory->sysdev);
+ error = device_register(&memory->dev);
return error;
}
static void
unregister_memory(struct memory_block *memory)
{
- BUG_ON(memory->sysdev.cls != &memory_sysdev_class);
+ BUG_ON(memory->dev.bus != &memory_subsys);
/* drop the ref. we got in remove_memory_block() */
- kobject_put(&memory->sysdev.kobj);
- sysdev_unregister(&memory->sysdev);
+ kobject_put(&memory->dev.kobj);
+ device_unregister(&memory->dev);
}
unsigned long __weak memory_block_size_bytes(void)
* uses.
*/
-static ssize_t show_mem_start_phys_index(struct sys_device *dev,
- struct sysdev_attribute *attr, char *buf)
+static ssize_t show_mem_start_phys_index(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct memory_block *mem =
- container_of(dev, struct memory_block, sysdev);
+ container_of(dev, struct memory_block, dev);
unsigned long phys_index;
phys_index = mem->start_section_nr / sections_per_block;
return sprintf(buf, "%08lx\n", phys_index);
}
-static ssize_t show_mem_end_phys_index(struct sys_device *dev,
- struct sysdev_attribute *attr, char *buf)
+static ssize_t show_mem_end_phys_index(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct memory_block *mem =
- container_of(dev, struct memory_block, sysdev);
+ container_of(dev, struct memory_block, dev);
unsigned long phys_index;
phys_index = mem->end_section_nr / sections_per_block;
/*
* Show whether the section of memory is likely to be hot-removable
*/
-static ssize_t show_mem_removable(struct sys_device *dev,
- struct sysdev_attribute *attr, char *buf)
+static ssize_t show_mem_removable(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
unsigned long i, pfn;
int ret = 1;
struct memory_block *mem =
- container_of(dev, struct memory_block, sysdev);
+ container_of(dev, struct memory_block, dev);
for (i = 0; i < sections_per_block; i++) {
pfn = section_nr_to_pfn(mem->start_section_nr + i);
/*
* online, offline, going offline, etc.
*/
-static ssize_t show_mem_state(struct sys_device *dev,
- struct sysdev_attribute *attr, char *buf)
+static ssize_t show_mem_state(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct memory_block *mem =
- container_of(dev, struct memory_block, sysdev);
+ container_of(dev, struct memory_block, dev);
ssize_t len = 0;
/*
ret = memory_block_action(mem->start_section_nr, to_state);
- if (ret)
+ if (ret) {
mem->state = from_state_req;
- else
- mem->state = to_state;
+ goto out;
+ }
+ mem->state = to_state;
+ switch (mem->state) {
+ case MEM_OFFLINE:
+ kobject_uevent(&mem->dev.kobj, KOBJ_OFFLINE);
+ break;
+ case MEM_ONLINE:
+ kobject_uevent(&mem->dev.kobj, KOBJ_ONLINE);
+ break;
+ default:
+ break;
+ }
out:
mutex_unlock(&mem->state_mutex);
return ret;
}
static ssize_t
-store_mem_state(struct sys_device *dev,
- struct sysdev_attribute *attr, const char *buf, size_t count)
+store_mem_state(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
{
struct memory_block *mem;
int ret = -EINVAL;
- mem = container_of(dev, struct memory_block, sysdev);
+ mem = container_of(dev, struct memory_block, dev);
if (!strncmp(buf, "online", min((int)count, 6)))
ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
* s.t. if I offline all of these sections I can then
* remove the physical device?
*/
-static ssize_t show_phys_device(struct sys_device *dev,
- struct sysdev_attribute *attr, char *buf)
+static ssize_t show_phys_device(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct memory_block *mem =
- container_of(dev, struct memory_block, sysdev);
+ container_of(dev, struct memory_block, dev);
return sprintf(buf, "%d\n", mem->phys_device);
}
-static SYSDEV_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
-static SYSDEV_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL);
-static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state);
-static SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL);
-static SYSDEV_ATTR(removable, 0444, show_mem_removable, NULL);
+static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
+static DEVICE_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL);
+static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state);
+static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL);
+static DEVICE_ATTR(removable, 0444, show_mem_removable, NULL);
#define mem_create_simple_file(mem, attr_name) \
- sysdev_create_file(&mem->sysdev, &attr_##attr_name)
+ device_create_file(&mem->dev, &dev_attr_##attr_name)
#define mem_remove_simple_file(mem, attr_name) \
- sysdev_remove_file(&mem->sysdev, &attr_##attr_name)
+ device_remove_file(&mem->dev, &dev_attr_##attr_name)
/*
* Block size attribute stuff
*/
static ssize_t
-print_block_size(struct sysdev_class *class, struct sysdev_class_attribute *attr,
+print_block_size(struct device *dev, struct device_attribute *attr,
char *buf)
{
return sprintf(buf, "%lx\n", get_memory_block_size());
}
-static SYSDEV_CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL);
+static DEVICE_ATTR(block_size_bytes, 0444, print_block_size, NULL);
static int block_size_init(void)
{
- return sysfs_create_file(&memory_sysdev_class.kset.kobj,
- &attr_block_size_bytes.attr);
+ return device_create_file(memory_subsys.dev_root,
+ &dev_attr_block_size_bytes);
}
/*
*/
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t
-memory_probe_store(struct class *class, struct class_attribute *attr,
+memory_probe_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
u64 phys_addr;
out:
return ret;
}
-static CLASS_ATTR(probe, S_IWUSR, NULL, memory_probe_store);
+static DEVICE_ATTR(probe, S_IWUSR, NULL, memory_probe_store);
static int memory_probe_init(void)
{
- return sysfs_create_file(&memory_sysdev_class.kset.kobj,
- &class_attr_probe.attr);
+ return device_create_file(memory_subsys.dev_root, &dev_attr_probe);
}
#else
static inline int memory_probe_init(void)
/* Soft offline a page */
static ssize_t
-store_soft_offline_page(struct class *class,
- struct class_attribute *attr,
+store_soft_offline_page(struct device *dev,
+ struct device_attribute *attr,
const char *buf, size_t count)
{
int ret;
/* Forcibly offline a page, including killing processes. */
static ssize_t
-store_hard_offline_page(struct class *class,
- struct class_attribute *attr,
+store_hard_offline_page(struct device *dev,
+ struct device_attribute *attr,
const char *buf, size_t count)
{
int ret;
if (strict_strtoull(buf, 0, &pfn) < 0)
return -EINVAL;
pfn >>= PAGE_SHIFT;
- ret = __memory_failure(pfn, 0, 0);
+ ret = memory_failure(pfn, 0, 0);
return ret ? ret : count;
}
-static CLASS_ATTR(soft_offline_page, 0644, NULL, store_soft_offline_page);
-static CLASS_ATTR(hard_offline_page, 0644, NULL, store_hard_offline_page);
+static DEVICE_ATTR(soft_offline_page, 0644, NULL, store_soft_offline_page);
+static DEVICE_ATTR(hard_offline_page, 0644, NULL, store_hard_offline_page);
static __init int memory_fail_init(void)
{
int err;
- err = sysfs_create_file(&memory_sysdev_class.kset.kobj,
- &class_attr_soft_offline_page.attr);
+ err = device_create_file(memory_subsys.dev_root,
+ &dev_attr_soft_offline_page);
if (!err)
- err = sysfs_create_file(&memory_sysdev_class.kset.kobj,
- &class_attr_hard_offline_page.attr);
+ err = device_create_file(memory_subsys.dev_root,
+ &dev_attr_hard_offline_page);
return err;
}
#else
return 0;
}
+/*
+ * A reference for the returned object is held and the reference for the
+ * hinted object is released.
+ */
struct memory_block *find_memory_block_hinted(struct mem_section *section,
struct memory_block *hint)
{
- struct kobject *kobj;
- struct sys_device *sysdev;
- struct memory_block *mem;
- char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1];
int block_id = base_memory_block_id(__section_nr(section));
+ struct device *hintdev = hint ? &hint->dev : NULL;
+ struct device *dev;
- kobj = hint ? &hint->sysdev.kobj : NULL;
-
- /*
- * This only works because we know that section == sysdev->id
- * slightly redundant with sysdev_register()
- */
- sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, block_id);
-
- kobj = kset_find_obj_hinted(&memory_sysdev_class.kset, name, kobj);
- if (!kobj)
+ dev = subsys_find_device_by_id(&memory_subsys, block_id, hintdev);
+ if (hint)
+ put_device(&hint->dev);
+ if (!dev)
return NULL;
-
- sysdev = container_of(kobj, struct sys_device, kobj);
- mem = container_of(sysdev, struct memory_block, sysdev);
-
- return mem;
+ return container_of(dev, struct memory_block, dev);
}
/*
* this gets to be a real problem, we can always use a radix
* tree or something here.
*
- * This could be made generic for all sysdev classes.
+ * This could be made generic for all device subsystems.
*/
struct memory_block *find_memory_block(struct mem_section *section)
{
mem = find_memory_block(section);
if (mem) {
mem->section_count++;
- kobject_put(&mem->sysdev.kobj);
+ kobject_put(&mem->dev.kobj);
} else
ret = init_memory_block(&mem, section, state);
unregister_memory(mem);
kfree(mem);
} else
- kobject_put(&mem->sysdev.kobj);
+ kobject_put(&mem->dev.kobj);
mutex_unlock(&mem_sysfs_mutex);
return 0;
int err;
unsigned long block_sz;
- memory_sysdev_class.kset.uevent_ops = &memory_uevent_ops;
- ret = sysdev_class_register(&memory_sysdev_class);
+ ret = subsys_system_register(&memory_subsys, NULL);
if (ret)
goto out;
extern void free_area_init(unsigned long * zones_size);
extern void free_area_init_node(int nid, unsigned long * zones_size,
unsigned long zone_start_pfn, unsigned long *zholes_size);
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
/*
- * With CONFIG_ARCH_POPULATES_NODE_MAP set, an architecture may initialise its
+ * With CONFIG_HAVE_MEMBLOCK_NODE_MAP set, an architecture may initialise its
* zones, allocate the backing mem_map and account for memory holes in a more
* architecture independent manner. This is a substitute for creating the
* zone_sizes[] and zholes_size[] arrays and passing them to
* free_area_init_node()
*
* An architecture is expected to register range of page frames backed by
- * physical memory with add_active_range() before calling
+ * physical memory with memblock_add[_node]() before calling
* free_area_init_nodes() passing in the PFN each zone ends at. At a basic
* usage, an architecture is expected to do something like
*
* unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn,
* max_highmem_pfn};
* for_each_valid_physical_page_range()
- * add_active_range(node_id, start_pfn, end_pfn)
+ * memblock_add_node(base, size, nid)
* free_area_init_nodes(max_zone_pfns);
*
- * If the architecture guarantees that there are no holes in the ranges
- * registered with add_active_range(), free_bootmem_active_regions()
- * will call free_bootmem_node() for each registered physical page range.
- * Similarly sparse_memory_present_with_active_regions() calls
- * memory_present() for each range when SPARSEMEM is enabled.
+ * free_bootmem_with_active_regions() calls free_bootmem_node() for each
+ * registered physical page range. Similarly
+ * sparse_memory_present_with_active_regions() calls memory_present() for
+ * each range when SPARSEMEM is enabled.
*
* See mm/page_alloc.c for more information on each function exposed by
- * CONFIG_ARCH_POPULATES_NODE_MAP
+ * CONFIG_HAVE_MEMBLOCK_NODE_MAP.
*/
extern void free_area_init_nodes(unsigned long *max_zone_pfn);
-extern void add_active_range(unsigned int nid, unsigned long start_pfn,
- unsigned long end_pfn);
-extern void remove_active_range(unsigned int nid, unsigned long start_pfn,
- unsigned long end_pfn);
-extern void remove_all_active_ranges(void);
-void sort_node_map(void);
unsigned long node_map_pfn_alignment(void);
unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn,
unsigned long end_pfn);
unsigned long max_low_pfn);
int add_from_early_node_map(struct range *range, int az,
int nr_range, int nid);
-u64 __init find_memory_core_early(int nid, u64 size, u64 align,
- u64 goal, u64 limit);
-typedef int (*work_fn_t)(unsigned long, unsigned long, void *);
-extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data);
extern void sparse_memory_present_with_active_regions(int nid);
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
-#if !defined(CONFIG_ARCH_POPULATES_NODE_MAP) && \
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+
+#if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \
!defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID)
static inline int __early_pfn_to_nid(unsigned long pfn)
{
return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
}
+/* Look up the first VMA which exactly matches the interval vm_start ... vm_end */
+static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
+ unsigned long vm_start, unsigned long vm_end)
+{
+ struct vm_area_struct *vma = find_vma(mm, vm_start);
+
+ if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
+ vma = NULL;
+
+ return vma;
+}
+
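A hedged usage sketch of the new helper (the caller is hypothetical;
like find_vma(), it must run with mmap_sem held):

	/* Check that [start, end) corresponds to exactly one mapping. */
	static bool range_is_one_vma(struct mm_struct *mm,
				     unsigned long start, unsigned long end)
	{
		struct vm_area_struct *vma;

		down_read(&mm->mmap_sem);
		vma = find_exact_vma(mm, start, end);
		up_read(&mm->mmap_sem);
		return vma != NULL;
	}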
#ifdef CONFIG_MMU
pgprot_t vm_get_page_prot(unsigned long vm_flags);
#else
#endif /* CONFIG_PROC_FS */
#ifdef CONFIG_DEBUG_PAGEALLOC
-extern int debug_pagealloc_enabled;
-
extern void kernel_map_pages(struct page *page, int numpages, int enable);
-
-static inline void enable_debug_pagealloc(void)
-{
- debug_pagealloc_enabled = 1;
-}
#ifdef CONFIG_HIBERNATION
extern bool kernel_page_present(struct page *page);
#endif /* CONFIG_HIBERNATION */
#else
static inline void
kernel_map_pages(struct page *page, int numpages, int enable) {}
-static inline void enable_debug_pagealloc(void)
-{
-}
#ifdef CONFIG_HIBERNATION
static inline bool kernel_page_present(struct page *page) { return true; }
#endif /* CONFIG_HIBERNATION */
enum mf_flags {
MF_COUNT_INCREASED = 1 << 0,
+ MF_ACTION_REQUIRED = 1 << 1,
};
- extern void memory_failure(unsigned long pfn, int trapno);
- extern int __memory_failure(unsigned long pfn, int trapno, int flags);
+ extern int memory_failure(unsigned long pfn, int trapno, int flags);
extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
extern int unpoison_memory(unsigned long pfn);
extern int sysctl_memory_failure_early_kill;
unsigned int pages_per_huge_page);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
+#ifdef CONFIG_DEBUG_PAGEALLOC
+extern unsigned int _debug_guardpage_minorder;
+
+static inline unsigned int debug_guardpage_minorder(void)
+{
+ return _debug_guardpage_minorder;
+}
+
+static inline bool page_is_guard(struct page *page)
+{
+ return test_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+#else
+static inline unsigned int debug_guardpage_minorder(void) { return 0; }
+static inline bool page_is_guard(struct page *page) { return false; }
+#endif /* CONFIG_DEBUG_PAGEALLOC */
+
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */
EXPORT_SYMBOL_GPL(hwpoison_filter);
/*
- * Send all the processes who have the page mapped an ``action optional''
- * signal.
+ * Send all the processes that have the page mapped a signal.
+ * ``action optional'' if they are not immediately affected by the error
+ * ``action required'' if the error happened in the current execution context
*/
- static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
- unsigned long pfn, struct page *page)
+ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
+ unsigned long pfn, struct page *page, int flags)
{
struct siginfo si;
int ret;
printk(KERN_ERR
- "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
+ "MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
pfn, t->comm, t->pid);
si.si_signo = SIGBUS;
si.si_errno = 0;
- si.si_code = BUS_MCEERR_AO;
si.si_addr = (void *)addr;
#ifdef __ARCH_SI_TRAPNO
si.si_trapno = trapno;
#endif
si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
- /*
- * Don't use force here, it's convenient if the signal
- * can be temporarily blocked.
- * This could cause a loop when the user sets SIGBUS
- * to SIG_IGN, but hopefully no one will do that?
- */
- ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
+
+ if ((flags & MF_ACTION_REQUIRED) && t == current) {
+ si.si_code = BUS_MCEERR_AR;
+ ret = force_sig_info(SIGBUS, &si, t);
+ } else {
+ /*
+ * Don't use force here, it's convenient if the signal
+ * can be temporarily blocked.
+ * This could cause a loop when the user sets SIGBUS
+ * to SIG_IGN, but hopefully no one will do that?
+ */
+ si.si_code = BUS_MCEERR_AO;
+ ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
+ }
if (ret < 0)
printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
t->comm, t->pid, ret);
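On the userspace side the two cases are distinguishable via si_code:
BUS_MCEERR_AR is delivered synchronously at the faulting access, while
BUS_MCEERR_AO may arrive at any time. A minimal (hypothetical) consumer,
assuming libc headers that expose BUS_MCEERR_* and si_addr_lsb:

	#include <signal.h>
	#include <unistd.h>

	static void sigbus_handler(int sig, siginfo_t *si, void *uc)
	{
		/* si->si_addr_lsb gives the granularity of si->si_addr */
		if (si->si_code == BUS_MCEERR_AR)
			write(2, "action required memory error\n", 29);
		else if (si->si_code == BUS_MCEERR_AO)
			write(2, "action optional memory error\n", 29);
		_exit(1);	/* or unmap the bad page and carry on */
	}

	int main(void)
	{
		struct sigaction sa = { .sa_flags = SA_SIGINFO };

		sa.sa_sigaction = sigbus_handler;
		sigaction(SIGBUS, &sa, NULL);
		/* ... run workload ... */
		return 0;
	}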
* Also when FAIL is set do a force kill because something went
* wrong earlier.
*/
- static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
- int fail, struct page *page, unsigned long pfn)
+ static void kill_procs(struct list_head *to_kill, int doit, int trapno,
+ int fail, struct page *page, unsigned long pfn,
+ int flags)
{
struct to_kill *tk, *next;
* check for that, but we need to tell the
* process anyways.
*/
- else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
- pfn, page) < 0)
+ else if (kill_proc(tk->tsk, tk->addr, trapno,
+ pfn, page, flags) < 0)
printk(KERN_ERR
"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
pfn, tk->tsk->comm, tk->tsk->pid);
* the pages and send SIGBUS to the processes if the data was dirty.
*/
static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
- int trapno)
+ int trapno, int flags)
{
enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
struct address_space *mapping;
* use a more force-full uncatchable kill to prevent
* any accesses to the poisoned memory.
*/
- kill_procs_ao(&tokill, !!PageDirty(ppage), trapno,
- ret != SWAP_SUCCESS, p, pfn);
+ kill_procs(&tokill, !!PageDirty(ppage), trapno,
+ ret != SWAP_SUCCESS, p, pfn, flags);
return ret;
}
ClearPageHWPoison(hpage + i);
}
- int __memory_failure(unsigned long pfn, int trapno, int flags)
+ /**
+ * memory_failure - Handle memory failure of a page.
+ * @pfn: Page Number of the corrupted page
+ * @trapno: Trap number reported in the signal to user space.
+ * @flags: fine tune action taken
+ *
+ * This function is called by the low level machine check code
+ * of an architecture when it detects hardware memory corruption
+ * of a page. It tries its best to recover, which includes
+ * dropping pages, killing processes etc.
+ *
+ * The function is primarily of use for corruptions that
+ * happen outside the current execution context (e.g. when
+ * detected by a background scrubber)
+ *
+ * Must run in process context (e.g. a work queue) with interrupts
+ * enabled and no spinlocks held.
+ */
+ int memory_failure(unsigned long pfn, int trapno, int flags)
{
struct page_state *ps;
struct page *p;
* Now take care of user space mappings.
* Abort on fail: __delete_from_page_cache() assumes unmapped page.
*/
- if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
+ if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) {
printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
res = -EBUSY;
goto out;
unlock_page(hpage);
return res;
}
- EXPORT_SYMBOL_GPL(__memory_failure);
-
- /**
- * memory_failure - Handle memory failure of a page.
- * @pfn: Page Number of the corrupted page
- * @trapno: Trap number reported in the signal to user space.
- *
- * This function is called by the low level machine check code
- * of an architecture when it detects hardware memory corruption
- * of a page. It tries its best to recover, which includes
- * dropping pages, killing processes etc.
- *
- * The function is primarily of use for corruptions that
- * happen outside the current execution context (e.g. when
- * detected by a background scrubber)
- *
- * Must run in process context (e.g. a work queue) with interrupts
- * enabled and no spinlocks hold.
- */
- void memory_failure(unsigned long pfn, int trapno)
- {
- __memory_failure(pfn, trapno, 0);
- }
+ EXPORT_SYMBOL_GPL(memory_failure);
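A sketch of the calling convention the kernel-doc above describes (the
scrubber context is hypothetical): a caller outside the faulting context
reports the pfn with no flags, which yields action-optional handling:

	/* e.g. from a background scrubber's work function: */
	if (memory_failure(pfn, 0, 0))
		pr_err("recovery failed for pfn %#lx\n", pfn);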
#define MEMORY_FAILURE_FIFO_ORDER 4
#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)
spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
if (!gotten)
break;
- __memory_failure(entry.pfn, entry.trapno, entry.flags);
+ memory_failure(entry.pfn, entry.trapno, entry.flags);
}
}
page_is_file_cache(page));
list_add(&page->lru, &pagelist);
ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
- 0, true);
+ 0, MIGRATE_SYNC);
if (ret) {
putback_lru_pages(&pagelist);
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",