[PATCH] ppc64: PCI error event dispatcher

author Linas Vepstas <linas@linas.org>

Fri, 4 Nov 2005 00:50:04 +0000 (18:50 -0600)

committer Paul Mackerras <paulus@samba.org>

Thu, 10 Nov 2005 00:38:05 +0000 (11:38 +1100)
author Linas Vepstas <linas@linas.org>
Fri, 4 Nov 2005 00:50:04 +0000 (18:50 -0600)
committer Paul Mackerras <paulus@samba.org>
Thu, 10 Nov 2005 00:38:05 +0000 (11:38 +1100)
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile

index dbdffb2fe42999500929c799d9650512bad6b80b..27515476ad6c2cd2c0d0469188a61b02c356d837 100644 (file)
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -3,4 +3,4 @@ obj-y                   := pci.o lpar.o hvCall.o nvram.o reconfig.o \
  obj-$(CONFIG_SMP)      += smp.o
  obj-$(CONFIG_IBMVIO)   += vio.o
  obj-$(CONFIG_XICS)     += xics.o
-obj-$(CONFIG_EEH)    += eeh.o
+obj-$(CONFIG_EEH)    += eeh.o eeh_event.o
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c

index 9df1d5018363e9d32494cd8b02407638d2ab576c..1fec99d53311bb6f99875e1875172428547dabda 100644 (file)
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -19,7 +19,6 @@
  
  #include <linux/init.h>
  #include <linux/list.h>
-#include <linux/notifier.h>
  #include <linux/pci.h>
  #include <linux/proc_fs.h>
  #include <linux/rbtree.h>
@@ -27,12 +26,12 @@
  #include <linux/spinlock.h>
  #include <asm/atomic.h>
  #include <asm/eeh.h>
+#include <asm/eeh_event.h>
  #include <asm/io.h>
  #include <asm/machdep.h>
+#include <asm/ppc-pci.h>
  #include <asm/rtas.h>
-#include <asm/atomic.h>
  #include <asm/systemcfg.h>
-#include <asm/ppc-pci.h>
  
  #undef DEBUG
  
@@ -70,14 +69,6 @@
   *  and sent out for processing.
   */
  
-/* EEH event workqueue setup. */
-static DEFINE_SPINLOCK(eeh_eventlist_lock);
-LIST_HEAD(eeh_eventlist);
-static void eeh_event_handler(void *);
-DECLARE_WORK(eeh_event_wq, eeh_event_handler, NULL);
-
-static struct notifier_block *eeh_notifier_chain;
-
  /* If a device driver keeps reading an MMIO register in an interrupt
   * handler after a slot isolation event has occurred, we assume it
   * is broken and panic.  This sets the threshold for how many read
@@ -420,24 +411,6 @@ void eeh_slot_error_detail (struct pci_dn *pdn, int severity)
         spin_unlock_irqrestore(&slot_errbuf_lock, flags);
  }
  
-/**
- * eeh_register_notifier - Register to find out about EEH events.
- * @nb: notifier block to callback on events
- */
-int eeh_register_notifier(struct notifier_block *nb)
-{
-       return notifier_chain_register(&eeh_notifier_chain, nb);
-}
-
-/**
- * eeh_unregister_notifier - Unregister to an EEH event notifier.
- * @nb: notifier block to callback on events
- */
-int eeh_unregister_notifier(struct notifier_block *nb)
-{
-       return notifier_chain_unregister(&eeh_notifier_chain, nb);
-}
-
  /**
   * read_slot_reset_state - Read the reset state of a device node's slot
   * @dn: device node to read
@@ -460,73 +433,6 @@ static int read_slot_reset_state(struct pci_dn *pdn, int rets[])
                          BUID_HI(pdn->phb->buid), BUID_LO(pdn->phb->buid));
  }
  
-/**
- * eeh_panic - call panic() for an eeh event that cannot be handled.
- * The philosophy of this routine is that it is better to panic and
- * halt the OS than it is to risk possible data corruption by
- * oblivious device drivers that don't know better.
- *
- * @dev pci device that had an eeh event
- * @reset_state current reset state of the device slot
- */
-static void eeh_panic(struct pci_dev *dev, int reset_state)
-{
-       /*
-        * XXX We should create a separate sysctl for this.
-        *
-        * Since the panic_on_oops sysctl is used to halt the system
-        * in light of potential corruption, we can use it here.
-        */
-       if (panic_on_oops) {
-               struct device_node *dn = pci_device_to_OF_node(dev);
-               eeh_slot_error_detail (PCI_DN(dn), 2 /* Permanent Error */);
-               panic("EEH: MMIO failure (%d) on device:%s\n", reset_state,
-                     pci_name(dev));
-       }
-       else {
-               __get_cpu_var(ignored_failures)++;
-               printk(KERN_INFO "EEH: Ignored MMIO failure (%d) on device:%s\n",
-                      reset_state, pci_name(dev));
-       }
-}
-
-/**
- * eeh_event_handler - dispatch EEH events.  The detection of a frozen
- * slot can occur inside an interrupt, where it can be hard to do
- * anything about it.  The goal of this routine is to pull these
- * detection events out of the context of the interrupt handler, and
- * re-dispatch them for processing at a later time in a normal context.
- *
- * @dummy - unused
- */
-static void eeh_event_handler(void *dummy)
-{
-       unsigned long flags;
-       struct eeh_event        *event;
-
-       while (1) {
-               spin_lock_irqsave(&eeh_eventlist_lock, flags);
-               event = NULL;
-               if (!list_empty(&eeh_eventlist)) {
-                       event = list_entry(eeh_eventlist.next, struct eeh_event, list);
-                       list_del(&event->list);
-               }
-               spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
-               if (event == NULL)
-                       break;
-
-               printk(KERN_INFO "EEH: MMIO failure (%d), notifiying device "
-                      "%s\n", event->reset_state,
-                      pci_name(event->dev));
-
-               notifier_call_chain (&eeh_notifier_chain,
-                                    EEH_NOTIFY_FREEZE, event);
-
-               pci_dev_put(event->dev);
-               kfree(event);
-       }
-}
-
  /**
   * eeh_token_to_phys - convert EEH address token to phys address
   * @token i/o token, should be address in the form 0xA....
@@ -613,8 +519,6 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
         int ret;
         int rets[3];
         unsigned long flags;
-       int reset_state;
-       struct eeh_event  *event;
         struct pci_dn *pdn;
         struct device_node *pe_dn;
         int rc = 0;
@@ -722,33 +626,12 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
         __eeh_mark_slot (pe_dn);
         spin_unlock_irqrestore(&confirm_error_lock, flags);
  
-       reset_state = rets[0];
-
-       eeh_slot_error_detail (pdn, 1 /* Temporary Error */);
-
-       printk(KERN_INFO "EEH: MMIO failure (%d) on device: %s %s\n",
-              rets[0], dn->name, dn->full_name);
-       event = kmalloc(sizeof(*event), GFP_ATOMIC);
-       if (event == NULL) {
-               eeh_panic(dev, reset_state);
-               return 1;
-       }
-
-       event->dev = dev;
-       event->dn = dn;
-       event->reset_state = reset_state;
-
-       /* We may or may not be called in an interrupt context */
-       spin_lock_irqsave(&eeh_eventlist_lock, flags);
-       list_add(&event->list, &eeh_eventlist);
-       spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
-
+       eeh_send_failure_event (dn, dev, rets[0], rets[2]);
+       
         /* Most EEH events are due to device driver bugs.  Having
          * a stack trace will help the device-driver authors figure
          * out what happened.  So print that out. */
         if (rets[0] != 5) dump_stack();
-       schedule_work(&eeh_event_wq);
-
         return 1;
  
  dn_unlock:
@@ -793,6 +676,14 @@ unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned lon
  
  EXPORT_SYMBOL(eeh_check_failure);
  
+/* ------------------------------------------------------------- */
+/* The code below deals with enabling EEH for devices during the
+ * early boot sequence.  EEH must be enabled before any PCI probing
+ * can be done.
+ */
+
+#define EEH_ENABLE 1
+
  struct eeh_early_enable_info {
         unsigned int buid_hi;
         unsigned int buid_lo;
@@ -850,8 +741,9 @@ static void *early_enable_eeh(struct device_node *dn, void *data)
                 /* First register entry is addr (00BBSS00)  */
                 /* Try to enable eeh */
                 ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL,
-                               regs[0], info->buid_hi, info->buid_lo,
-                               EEH_ENABLE);
+                               regs[0], info->buid_hi, info->buid_lo,
+                               EEH_ENABLE);
+
                 if (ret == 0) {
                         eeh_subsystem_enabled = 1;
                         pdn->eeh_mode |= EEH_MODE_SUPPORTED;
diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c

new file mode 100644 (file)

index 0000000..9249733
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/eeh_event.c
@@ -0,0 +1,155 @@
+/*
+ * eeh_event.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ * Copyright (c) 2005 Linas Vepstas <linas@linas.org>
+ */
+
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <asm/eeh_event.h>
+
+/** Overview:
+ *  EEH error states may be detected within exception handlers;
+ *  however, the recovery processing needs to occur asynchronously
+ *  in a normal kernel context and not an interrupt context.
+ *  This pair of routines creates an event and queues it onto a
+ *  work-queue, where a worker thread can drive recovery.
+ */
+
+/* EEH event queue: lock, pending-event list, and launcher work item. */
+static DEFINE_SPINLOCK(eeh_eventlist_lock);
+LIST_HEAD(eeh_eventlist);
+static void eeh_thread_launcher(void *);
+DECLARE_WORK(eeh_event_wq, eeh_thread_launcher, NULL);
+
+/**
+ * eeh_panic - panic, or log and carry on, for an unhandled eeh event.
+ * Panics only when the panic_on_oops sysctl is set, on the view that
+ * halting the OS beats risking data corruption by oblivious device
+ * drivers; otherwise the failure is logged at KERN_INFO and ignored.
+ *
+ * @dev: pci device that had an eeh event
+ * @reset_state: current reset state of the device slot
+ */
+static void eeh_panic(struct pci_dev *dev, int reset_state)
+{
+       /*
+        * Since the panic_on_oops sysctl is used to halt the system
+        * in light of potential corruption, we can use it here.
+        */
+       if (panic_on_oops) {
+               panic("EEH: MMIO failure (%d) on device:%s\n", reset_state,
+                     pci_name(dev));
+       }
+       else {
+               printk(KERN_INFO "EEH: Ignored MMIO failure (%d) on device:%s\n",
+                      reset_state, pci_name(dev));
+       }
+}
+
+/**
+ * eeh_event_handler - dispatch EEH events.  The detection of a frozen
+ * slot can occur inside an interrupt, where it can be hard to do
+ * anything about it.  This routine runs as the "eehd" kernel thread,
+ * drains the event list in a normal context, and exits once the list
+ * is empty.  Each event's device reference is dropped after handling.
+ *
+ * @dummy - unused
+ */
+static int eeh_event_handler(void * dummy)
+{
+       unsigned long flags;
+       struct eeh_event        *event;
+
+       daemonize ("eehd");
+
+       while (1) {
+               set_current_state(TASK_INTERRUPTIBLE);
+
+               spin_lock_irqsave(&eeh_eventlist_lock, flags);
+               event = NULL;
+               if (!list_empty(&eeh_eventlist)) {
+                       event = list_entry(eeh_eventlist.next, struct eeh_event, list);
+                       list_del(&event->list);
+               }
+               spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+               if (event == NULL)
+                       break;
+
+               printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
+                      pci_name(event->dev));
+
+               eeh_panic (event->dev, event->state);
+               pci_dev_put(event->dev);  /* ref taken at queue time */
+               kfree(event);
+       }
+
+       return 0;
+}
+
+/**
+ * eeh_thread_launcher - work item callback; starts the event thread
+ *
+ * @dummy - unused
+ */
+static void eeh_thread_launcher(void *dummy)
+{
+       if (kernel_thread(eeh_event_handler, NULL, CLONE_KERNEL) < 0)
+               printk(KERN_ERR "Failed to start EEH daemon\n");
+}
+
+/**
+ * eeh_send_failure_event - queue a PCI error event for later handling
+ * @dn, @dev: device node and pci device (dev may be NULL) that failed
+ * @state, @time_unavail: values stored verbatim in the queued event
+ *
+ * May be called in interrupt context; the event is handled later by a
+ * kernel thread launched from a workqueue.  Returns 0, or 1 on no memory.
+ */
+int eeh_send_failure_event (struct device_node *dn,
+                            struct pci_dev *dev,
+                            int state,
+                            int time_unavail)
+{
+       unsigned long flags;
+       struct eeh_event *event;
+
+       event = kmalloc(sizeof(*event), GFP_ATOMIC);
+       if (event == NULL) {
+               printk (KERN_ERR "EEH: out of memory, event not handled\n");
+               return 1;
+       }
+
+       if (dev)
+               pci_dev_get(dev);
+
+       event->dn = dn;
+       event->dev = dev;
+       event->state = state;
+       event->time_unavail = time_unavail;
+
+       /* We may or may not be called in an interrupt context */
+       spin_lock_irqsave(&eeh_eventlist_lock, flags);
+       list_add(&event->list, &eeh_eventlist);
+       spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+
+       schedule_work(&eeh_event_wq);
+
+       return 0;
+}
+
+/********************** END OF FILE ******************************/
diff --git a/include/asm-powerpc/eeh_event.h b/include/asm-powerpc/eeh_event.h

new file mode 100644 (file)

index 0000000..d168a30
--- /dev/null
+++ b/include/asm-powerpc/eeh_event.h
@@ -0,0 +1,52 @@
+/*
+ *     eeh_event.h
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ * Copyright (c) 2005 Linas Vepstas <linas@linas.org>
+ */
+
+#ifndef ASM_PPC64_EEH_EVENT_H
+#define ASM_PPC64_EEH_EVENT_H
+
+/** EEH event -- structure holding pci controller data that describes
+ *  a change in the isolation status of a PCI slot.  Events are built
+ *  and queued by eeh_send_failure_event().
+ */
+struct eeh_event {
+       struct list_head     list;  /* linkage on the pending-event list */
+       struct device_node      *dn;   /* struct device node */
+       struct pci_dev       *dev;  /* affected device */
+       int                  state; /* reset_state given to eeh_send_failure_event() */
+       int time_unavail;    /* milliseconds until device might be available */
+};
+
+/**
+ * eeh_send_failure_event - generate a PCI error event
+ * @dev pci device
+ *
+ * This routine builds a PCI error event and queues it for the
+ * EEH event handler (see eeh_event.c).
+ *
+ * This routine can be called within an interrupt context;
+ * the actual event will be delivered in a normal context
+ * (from a kernel thread started via a workqueue).
+ */
+int eeh_send_failure_event (struct device_node *dn,
+                            struct pci_dev *dev,
+                            int reset_state,
+                            int time_unavail);
+
+#endif /* ASM_PPC64_EEH_EVENT_H */
diff --git a/include/asm-ppc64/eeh.h b/include/asm-ppc64/eeh.h

index 40c8eb57493eb124170ad721cbdf378813f23a05..89f26ab31908df6e72932291568340243ba7b4b3 100644 (file)
--- a/include/asm-ppc64/eeh.h
+++ b/include/asm-ppc64/eeh.h
@@ -1,4 +1,4 @@
-/* 
+/*
   * eeh.h
   * Copyright (C) 2001  Dave Engebretsen & Todd Inglett IBM Corporation.
   *
@@ -6,12 +6,12 @@
   * it under the terms of the GNU General Public License as published by
   * the Free Software Foundation; either version 2 of the License, or
   * (at your option) any later version.
- * 
+ *
   * This program is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   * GNU General Public License for more details.
- * 
+ *
   * You should have received a copy of the GNU General Public License
   * along with this program; if not, write to the Free Software
   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
@@ -27,8 +27,6 @@
  
  struct pci_dev;
  struct device_node;
-struct device_node;
-struct notifier_block;
  
  #ifdef CONFIG_EEH
  
@@ -37,6 +35,10 @@ struct notifier_block;
  #define EEH_MODE_NOCHECK       (1<<1)
  #define EEH_MODE_ISOLATED      (1<<2)
  
+/* Max number of EEH freezes allowed before we consider the device
+ * to be permanently disabled. */
+#define EEH_MAX_ALLOWED_FREEZES 5
+
  void __init eeh_init(void);
  unsigned long eeh_check_failure(const volatile void __iomem *token,
                                 unsigned long val);
@@ -59,36 +61,14 @@ void eeh_add_device_late(struct pci_dev *);
   * eeh_remove_device - undo EEH setup for the indicated pci device
   * @dev: pci device to be removed
   *
- * This routine should be when a device is removed from a running
- * system (e.g. by hotplug or dlpar).
+ * This routine should be called when a device is removed from
+ * a running system (e.g. by hotplug or dlpar).  It unregisters
+ * the PCI device from the EEH subsystem.  I/O errors affecting
+ * this device will no longer be detected after this call; thus,
+ * I/O errors affecting this slot may leave this device unusable.
   */
  void eeh_remove_device(struct pci_dev *);
  
-#define EEH_DISABLE            0
-#define EEH_ENABLE             1
-#define EEH_RELEASE_LOADSTORE  2
-#define EEH_RELEASE_DMA                3
-
-/**
- * Notifier event flags.
- */
-#define EEH_NOTIFY_FREEZE  1
-
-/** EEH event -- structure holding pci slot data that describes
- *  a change in the isolation status of a PCI slot.  A pointer
- *  to this struct is passed as the data pointer in a notify callback.
- */
-struct eeh_event {
-       struct list_head     list;
-       struct pci_dev       *dev;
-       struct device_node   *dn;
-       int                  reset_state;
-};
-
-/** Register to find out about EEH events. */
-int eeh_register_notifier(struct notifier_block *nb);
-int eeh_unregister_notifier(struct notifier_block *nb);
-
  /**
   * EEH_POSSIBLE_ERROR() -- test for possible MMIO failure.
   *
@@ -129,7 +109,7 @@ static inline void eeh_remove_device(struct pci_dev *dev) { }
  #define EEH_IO_ERROR_VALUE(size) (-1UL)
  #endif /* CONFIG_EEH */
  
-/* 
+/*
   * MMIO read/write operations with EEH support.
   */
  static inline u8 eeh_readb(const volatile void __iomem *addr)
author	Linas Vepstas <linas@linas.org>
	Fri, 4 Nov 2005 00:50:04 +0000 (18:50 -0600)
committer	Paul Mackerras <paulus@samba.org>
	Thu, 10 Nov 2005 00:38:05 +0000 (11:38 +1100)
arch/powerpc/platforms/pseries/Makefile		patch \| blob \| blame \| history
arch/powerpc/platforms/pseries/eeh.c		patch \| blob \| blame \| history
arch/powerpc/platforms/pseries/eeh_event.c	[new file with mode: 0644]	patch \| blob
include/asm-powerpc/eeh_event.h	[new file with mode: 0644]	patch \| blob
include/asm-ppc64/eeh.h		patch \| blob \| blame \| history